In [1]:
# General imports for data manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# scikit-learn imports for machine learning
from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.model_selection import train_test_split

# Preprocessing and pipeline utilities
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Model imports
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Metrics and model evaluation
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score, \
                            mean_absolute_error, f1_score, precision_score, \
                            recall_score, roc_auc_score, confusion_matrix, \
                            roc_curve, auc

# Additional utilities
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier



## Exercise 1: MSE Scikit-learn


In [None]:
y_true = [91, 51, 2.5, 2, -5]
y_pred = [90, 48, 2, 2, -4]

mse = mean_squared_error(y_true, y_pred)

print("Mean Squared Error:", mse)


## Exercise 2: Accuracy Scikit-learn


In [None]:
y_pred = [0, 1, 0, 1, 0, 1, 0]
y_true = [0, 0, 1, 1, 1, 1, 0]

accuracy = accuracy_score(y_pred, y_true)

print("Accuracy:", accuracy)

## Exercise 3: Regression

In [None]:
# imports

# data
housing = fetch_california_housing()
X, y = housing['data'], housing['target']
# split data train test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    random_state=13)
# pipeline
pipeline = [('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('lr', LinearRegression())]


pipe = Pipeline(pipeline)
# fit
pipe.fit(X_train, y_train)


# Predictions on training set
y_train_pred = pipe.predict(X_train)
# Predictions on test set
y_test_pred = pipe.predict(X_test)


print("Predicted Train:", y_train_pred[:10])
print("")
print("Predicted Test:", y_test_pred[:10])

In [None]:
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)

# Compute metrics for test set
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print("R2 Train:", r2_train)
print("MSE Train:", mse_train)
print("MAE Train:", mae_train)
print("")
print("R2 Test:", r2_test)
print("MSE Test:", mse_test)
print("MAE Test:", mae_test)

## Exercise 4: Classification


In [None]:

#Load the breast cancer dataset
X , y = load_breast_cancer(return_X_y=True)

#Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=43)

#Standardize the data
scaler = StandardScaler()

#Fit the scaler to the training data and transform the training data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

classifier = LogisticRegression()

#Fit the classifier to the scaled training data
classifier.fit(X_train_scaled, y_train)

In [None]:
y_train_pred = classifier.predict(X_train_scaled)

# Predict on the test set
y_test_pred = classifier.predict(X_test_scaled)

print("Predicted Train:", y_train_pred[:10])
print("")
print("Predicted Test:", y_test_pred[:10])

In [None]:


# Compute metrics for the training set
print("Training Set Metrics:")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Precision: {precision_score(y_train, y_train_pred)}")
print(f"Recall: {recall_score(y_train, y_train_pred)}")
print(f"F1 Score: {f1_score(y_train, y_train_pred)}")

# Compute AUC on the training set - need to use predict_proba to get probabilities
y_train_proba = classifier.predict_proba(X_train_scaled)[:, 1]  # Probabilities of the positive class
print(f"ROC AUC: {roc_auc_score(y_train, y_train_proba)}")

# Compute metrics for the test set
print("\nTest Set Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Precision: {precision_score(y_test, y_test_pred)}")
print(f"Recall: {recall_score(y_test, y_test_pred)}")
print(f"F1 Score: {f1_score(y_test, y_test_pred)}")

# Compute AUC on the test set - need to use predict_proba to get probabilities
y_test_proba = classifier.predict_proba(X_test_scaled)[:, 1]  # Probabilities of the positive class
print(f"ROC AUC: {roc_auc_score(y_test, y_test_proba)}")

# Print the confusion matrix for the test set
print("\nConfusion Matrix for the Test Set:")
print(confusion_matrix(y_test, y_test_pred))


In [None]:
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
roc_auc = auc(fpr, tpr)

# Generate the plot
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='blue', lw=lw, label='Pipeline (AUC = %0.2f)' % roc_auc)
padding = 0.02  # This is the padding value. Adjust it to add more or less padding.
plt.xlim([0.0 - padding, 1.0 + padding])
plt.ylim([0.0 - padding, 1.05 + padding])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()


## Exercise 5: Machine Learning models


In [2]:
# Fetching the dataset
housing = fetch_california_housing()
X, y = housing['data'], housing['target']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=43)

# Preprocessing steps remain constant
preprocessing_steps = [('imputer', SimpleImputer(strategy='median')),
                       ('scaler', StandardScaler())]

# Models to be tested
models = [
    ('Linear Regression', LinearRegression()),
    ('SVM', SVR()),
    ('Decision Tree', DecisionTreeRegressor(random_state=43)),
    ('Random Forest', RandomForestRegressor(random_state=43)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=43))
]

for name, model in models:
    pipeline = Pipeline(preprocessing_steps + [(name, model)])
    pipeline.fit(X_train, y_train)
    
    # Making predictions
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    
    # Calculating metrics
    print(f"{name}:")
    print(f"Train R^2: {metrics.r2_score(y_train, y_train_pred)}")
    print(f"Test R^2: {metrics.r2_score(y_test, y_test_pred)}")
    print(f"Train MSE: {metrics.mean_squared_error(y_train, y_train_pred)}")
    print(f"Test MSE: {metrics.mean_squared_error(y_test, y_test_pred)}")
    print(f"Train MAE: {metrics.mean_absolute_error(y_train, y_train_pred)}")
    print(f"Test MAE: {metrics.mean_absolute_error(y_test, y_test_pred)}")
    print("-" * 40)

    #R2 is how well the X features are to the y 

Linear Regression:
Train R^2: 0.6054131599242079
Test R^2: 0.6128959462132961
Train MSE: 0.5273648371379568
Test MSE: 0.49761195027083827
Train MAE: 0.5330920012614553
Test MAE: 0.5196420310323718
----------------------------------------
SVM:
Train R^2: 0.7496108582936591
Test R^2: 0.7295080649899655
Train MSE: 0.3346447867133981
Test MSE: 0.3477101776543043
Train MAE: 0.3835645163325976
Test MAE: 0.3897680598426732
----------------------------------------
Decision Tree:
Train R^2: 1.0
Test R^2: 0.6411350449487532
Train MSE: 9.287461238793889e-32
Test MSE: 0.4613113410207364
Train MAE: 4.212344895326283e-17
Test MAE: 0.4339228100775194
----------------------------------------
Random Forest:
Train R^2: 0.9741424383160557
Test R^2: 0.8127465331519781
Train MSE: 0.034558600088180014
Test MSE: 0.24070934396507854
Train MAE: 0.11989028276808825
Test MAE: 0.31935835024224823
----------------------------------------
Gradient Boosting:
Train R^2: 0.8042086499063386
Test R^2: 0.7895081234643192

## Exercise 6: Grid Search

In [19]:
# Load the California housing dataset
housing = fetch_california_housing()
X, y = housing['data'], housing['target']

# Define the parameter grid
parameters = {'n_estimators':[10, 50, 75],
            'max_depth':[3,5,7],
            'min_samples_leaf': [10,20,30]}

rf = RandomForestRegressor()
gridsearch = GridSearchCV(rf,
                        parameters,
                        cv = [(np.arange(18576), np.arange(18576,20640))],
                        n_jobs=-1,
                        verbose=2
                        )
gridsearch.fit(X, y)


print("Best estimator:", gridsearch.best_estimator_)
print("Best parameters:", gridsearch.best_params_)
print("Best score:", gridsearch.best_score_)

Fitting 1 folds for each of 27 candidates, totalling 27 fits
Best estimator: RandomForestRegressor(max_depth=7, min_samples_leaf=10, n_estimators=75)
Best parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'n_estimators': 75}
Best score: 0.6303702349922858
[CV] END ..max_depth=3, min_samples_leaf=30, n_estimators=50; total time=   1.5s
[CV] END ..max_depth=5, min_samples_leaf=20, n_estimators=75; total time=   3.4s
[CV] END ..max_depth=3, min_samples_leaf=10, n_estimators=10; total time=   0.3s
[CV] END ..max_depth=3, min_samples_leaf=30, n_estimators=75; total time=   2.1s
[CV] END ..max_depth=7, min_samples_leaf=10, n_estimators=50; total time=   3.0s
[CV] END ..max_depth=3, min_samples_leaf=20, n_estimators=75; total time=   2.3s
[CV] END ..max_depth=5, min_samples_leaf=30, n_estimators=75; total time=   3.2s
[CV] END ..max_depth=3, min_samples_leaf=10, n_estimators=50; total time=   1.5s
[CV] END ..max_depth=5, min_samples_leaf=20, n_estimators=10; total time=   0.5s
[CV] END ..m

In [16]:
def select_model_verbose(gs):
    return gs.best_estimator_, gs.best_params_, gs.best_score_

print(select_model_verbose(gridsearch))

(RandomForestRegressor(max_depth=7, min_samples_leaf=20, n_estimators=75), {'max_depth': 7, 'min_samples_leaf': 20, 'n_estimators': 75}, 0.6254436458009907)


In [18]:
model, best_params, best_score = select_model_verbose(gridsearch)

new_point = np.array([[3.2031, 52., 5.47761194, 1.07960199, 910., 2.26368159, 37.85, -122.26]])

model.predict(new_point)

array([2.58939561])