# Solutions for Supervised Machine Learning

In [None]:
%matplotlib inline

## Feature Engineering

In [None]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Load the Boston housing data and wrap the feature matrix in a DataFrame
# whose columns carry the data set's feature names.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the pinned version.
boston = load_boston()
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df.head()

In [None]:
# Expand the features with every degree-2 term (squares and pairwise products);
# include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
polynomials = poly.fit(df).transform(df)
print(polynomials.shape)

In [None]:
from pathlib import Path

# Assemble the engineered features into a labelled frame and append the target.
out = pd.DataFrame(polynomials, columns=poly.get_feature_names(df.columns))
out["y"] = boston["target"]
print(out.head())
print(list(out.columns))

# to_csv does not create missing directories, so make sure ".out" exists first.
Path(".out").mkdir(parents=True, exist_ok=True)
# With the default settings the row index is written as an unnamed first column.
out.to_csv(".out/polynomials.csv")

## Regularization

In [None]:
import pandas as pd
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

In [None]:
# polynomials.csv was written together with its row index; read that first column
# back as the index so it does not leak into the model as an "Unnamed: 0" feature.
df = pd.read_csv(".out/polynomials.csv", index_col=0)

In [None]:
# Hold out a test set. The fixed seed makes the split (and every score derived
# from it) identical across runs, matching the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("y", axis=1), df["y"], random_state=0
)

In [None]:
# Fit both regularized linear models with their default settings ...
ridge = Ridge()
ridge.fit(X_train, y_train)
lasso = Lasso()
lasso.fit(X_train, y_train)

# ... and report each model's score on the training data (ridge first, then lasso).
for model in (ridge, lasso):
    print(model.score(X_train, y_train))

In [None]:
# Put both coefficient vectors side by side, labelled with every column of df
# except the last one (the target "y").
coefs = pd.DataFrame({"ridge": ridge.coef_, "lasso": lasso.coef_}, index=df.columns[:-1])

# Count the features that Ridge kept (non-zero) but Lasso set to exactly zero.
ridge_only = coefs["ridge"].ne(0) & coefs["lasso"].eq(0)
int(ridge_only.sum())

In [None]:
import matplotlib.pyplot as plt
# Tall canvas so that each coefficient gets its own readable horizontal bar.
fig, ax = plt.subplots(figsize=(10, 30))
coefs.plot(kind="barh", ax=ax)

## Random Forest Classification

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer data set; later cells read its 'data', 'target'
# and 'feature_names' entries.
cancer = load_breast_cancer()

In [None]:
import pandas as pd

# Bar chart of how often each target label occurs, to check the class balance.
df = pd.DataFrame({"y": cancer["target"]})
df["y"].value_counts().plot(kind="bar")

In [None]:
from sklearn.model_selection import train_test_split

# Separate features and labels, then split them with a fixed seed so the
# train/test partition is the same on every run.
X, y = cancer["data"], cancer["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Tune the number of trees with 5-fold cross-validation on the training set.
param_grid = {"n_estimators": [10, 50, 100]}
# Seed the forest: without it the bootstrap samples and feature subsets differ on
# every run, so the selected parameters and all downstream metrics would too.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid,
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

In [None]:
from sklearn.metrics import confusion_matrix

# Take the best model found by the grid search and predict the held-out test set.
best = grid_search.best_estimator_
preds = best.predict(X_test)
print(preds)

# Confusion matrix of true labels (first argument) against the predictions.
confusion_m = pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=preds))
print(confusion_m)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Report accuracy, precision, recall and F1 on the test set, in that order.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, preds))

In [None]:
import pandas as pd

# Bar chart of the fitted model's feature importances, labelled by feature name.
df = pd.DataFrame({"importances": best.feature_importances_}, index=cancer["feature_names"])
df["importances"].plot(kind="bar")

## Neural Network Classification

In [None]:
from sklearn.datasets import load_breast_cancer

# Reload the breast cancer data and recreate the seeded train/test split
# so this section starts from a known state.
cancer = load_breast_cancer()
X, y = cancer["data"], cancer["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Scale the features to [0, 1] before the network. Because the scaler sits inside
# the pipeline it is re-fitted on each cross-validation training fold.
# random_state seeds the weight initialisation so the search is reproducible.
pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("nn", MLPClassifier(max_iter=10000, solver="lbfgs", activation="tanh", random_state=0)),
])
# The "nn__" prefix routes each parameter to the pipeline step named "nn".
param_grid = {"nn__hidden_layer_sizes": [(20, 10), (20, 20)],
              "nn__alpha": [0.01, 0.001]}
# Candidates are ranked by recall rather than by the default accuracy.
grid = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True, scoring="recall")
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Predict the test set with the best pipeline and show the confusion matrix
# (true labels against predictions) as an annotated heatmap.
best = grid.best_estimator_
preds = best.predict(X_test)
confusion_m = pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=preds))
sns.heatmap(data=confusion_m, annot=True)

In [None]:
# Heatmap of the first weight matrix of the fitted network, one row per input
# feature. The network is fetched through the public named_steps mapping
# instead of the private Pipeline._final_estimator attribute.
df = pd.DataFrame(best.named_steps["nn"].coefs_[0], index=cancer["feature_names"])
sns.heatmap(df)

## Neural Network Regression

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Load the diabetes data set and print its bundled description.
diabetes = load_diabetes()
print(diabetes["DESCR"])

# Separate features and target, then split with a fixed seed for reproducibility.
X, y = diabetes["data"], diabetes["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Inspect the raw training targets before modelling.
print(y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features before the network. Because the scaler sits inside
# the pipeline it is re-fitted on each cross-validation training fold.
# random_state seeds the weight initialisation so the search is reproducible.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("nn", MLPRegressor(max_iter=10000, solver="lbfgs", activation="tanh", random_state=0)),
])
# The "nn__" prefix routes each parameter to the pipeline step named "nn".
param_grid = {"nn__hidden_layer_sizes": [(20, 10), (20, 20), (50, 20)],
              "nn__alpha": [0.005, 0.001, 0.0001]}
# This is a regression task, so candidates are ranked by R^2. "roc_auc" is a
# classification metric and cannot score a continuous target.
grid = GridSearchCV(pipe, param_grid, cv=4, return_train_score=True, scoring="r2")
grid.fit(X_train, y_train)

In [None]:
# Show the winning parameter combination followed by its cross-validated score.
for result in (grid.best_params_, grid.best_score_):
    print(result)

In [None]:
from numpy import array

# Lay the mean CV scores out as an (alpha x hidden_layer_sizes) grid.
# The grid search enumerates parameter names in sorted order with the last name
# varying fastest, so for this grid "nn__alpha" indexes the rows and
# "nn__hidden_layer_sizes" the columns. The shape is taken from param_grid so
# the plot keeps working when values are added to or removed from the grid.
alphas = param_grid["nn__alpha"]
layer_sizes = param_grid["nn__hidden_layer_sizes"]
scores = array(grid.cv_results_["mean_test_score"]).reshape(len(alphas), len(layer_sizes))
sns.heatmap(scores, annot=True, xticklabels=layer_sizes, yticklabels=alphas)

In [None]:
import pandas as pd
import seaborn as sns

# Heatmap of the first weight matrix of the best fitted network, one row per
# input feature. The network is fetched through the public named_steps mapping
# instead of the private Pipeline._final_estimator attribute.
best = grid.best_estimator_
df = pd.DataFrame(best.named_steps["nn"].coefs_[0], index=diabetes["feature_names"])
sns.heatmap(df)