### Exercise 1

1) Load the breast_cancer dataset from sklearn (`from sklearn.datasets import load_breast_cancer`). Split the dataset into training and test datasets. Scale the dataset using `MinMaxScaler`. Fit a `KNeighborsClassifier` and report the score on the test dataset.

In [9]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, r2_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer



In [2]:
# Load the breast-cancer dataset and separate the feature matrix from the labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the min-max scaler on the training split only, then apply the same
# transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a 5-nearest-neighbours classifier on the scaled training data.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict the held-out samples and report the fraction classified correctly.
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test dataset: {accuracy:.2f}")

Accuracy on the test dataset: 0.96


2) Repeat Step 1 using pipelines and report the score.

In [3]:
from sklearn.pipeline import Pipeline

# Chain min-max scaling and a 5-nearest-neighbours classifier into a single
# estimator, so the scaler is fitted together with the model.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Fit on the raw (unscaled) training split and predict the raw test split;
# the pipeline applies the scaling step internally.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the test accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test dataset using pipeline: {accuracy:.2f}")

Accuracy on the test dataset using pipeline: 0.96


3) Use the pipeline object from Step 2 and run a grid search over the number of neighbors (`n_neighbors`).

In [6]:
# Scaling + KNN pipeline; n_neighbors is left unset here because the grid
# search below supplies it.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier()),
])

# Candidate neighbour counts 1..20, addressed as <step name>__<parameter>.
param_grid = {'knn__n_neighbors': np.arange(1, 21)}

# 5-fold cross-validated search over the grid, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Winning parameter setting and its mean cross-validation accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Score the best pipeline found by the search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Summary of the search.
print(f"Best parameters: {best_params}")
print(f"Best cross-validated accuracy on training set: {best_score:.2f}")
print(f"Test accuracy using best model: {test_accuracy:.2f}")

Best parameters: {'knn__n_neighbors': 6}
Best cross-validated accuracy on training set: 0.96
Test accuracy using best model: 0.96


4) Import bike_day_raw.csv. Create a pipeline using ColumnTransformer, scaling, and KNeighborsRegressor.

- Use `from sklearn.neighbors import KNeighborsRegressor`
- You need to split the data into X and y. 
- Check the data shape
- Check the data types
- Print the column names of the data frame
- Create a scatterplot of each feature against the target variable
- Create a list of features that are numeric and a list of features that are not numeric
- Create a pipeline of imputer and standard scaler for the numeric features
- Create a column transformer which uses the pipeline you created for numeric features and a onehotencoder for the non-numeric features
- You can create your column transformer in different ways
- Finally create a pipeline of column transformer and KNeighborsRegressor
- Split your data into train and test datasets
- Report the score on the test dataset



In [None]:
# Exercise 1.4: KNN regression on the bike-sharing data.
# Loads the CSV, inspects it, builds a preprocessing + KNeighborsRegressor
# pipeline, and reports R^2 on a held-out test split.

# Step 1: Load the dataset
df = pd.read_csv('bike_day_raw.csv')

# Step 2: Split the data into features (X) and target (y)
# The target column is named once here instead of the previous
# 'target_variable' placeholder, which is not a real column and made
# df.drop(...) raise KeyError.
# NOTE(review): 'cnt' (rental count) is assumed to be the target, as in the
# standard bike-sharing daily dataset -- confirm against the printed columns.
target_column = 'cnt'
if target_column not in df.columns:
    raise KeyError(
        f"Target column {target_column!r} not found in bike_day_raw.csv; "
        f"available columns: {df.columns.tolist()}"
    )
X = df.drop(columns=target_column)
y = df[target_column]

# Step 3: Check the data shape
print(f"Data shape: {df.shape}")

# Step 4: Check the data types
print(f"Data types:\n{df.dtypes}")

# Step 5: Print the column names of the DataFrame
print(f"Column names: {df.columns.tolist()}")

# Step 6: Create a scatterplot of each feature against the target variable
# (one figure per feature; plt.show() closes each figure before the next).
for column in X.columns:
    plt.scatter(X[column], y)
    plt.title(f'Scatter plot of {column} vs target variable')
    plt.xlabel(column)
    plt.ylabel('Target Variable')
    plt.show()

# Step 7: Create a list of features that are numeric and non-numeric
# (split purely by dtype).
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
non_numeric_features = X.select_dtypes(exclude=[np.number]).columns.tolist()
print(f"Numeric features: {numeric_features}")
print(f"Non-numeric features: {non_numeric_features}")

# Step 8: Create a pipeline of imputer and standard scaler for numeric features
# Missing values are filled with the column mean before standardising.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Step 9: Create a ColumnTransformer
# Numeric columns go through the impute+scale pipeline; non-numeric columns
# are one-hot encoded. handle_unknown='ignore' keeps categories that only
# appear in the test split from raising at predict time.
column_transformer = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), non_numeric_features)
    ]
)

# Step 10: Create a pipeline of ColumnTransformer and KNeighborsRegressor
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

# Step 11: Split your data into train and test datasets
# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 12: Fit the pipeline to the training data
# Preprocessing is fitted on the training split only.
pipeline.fit(X_train, y_train)

# Step 13: Predict on the test dataset
y_pred = pipeline.predict(X_test)

# Step 14: Report the score on the test dataset
score = r2_score(y_test, y_pred)
print(f"R^2 score on the test dataset: {score:.2f}")
