# Exercises 0

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasRegressor

### 0. MLP for regression (*)
We will continue with the dataset that we used in the lecture and predict miles per gallon (mpg) using an MLP for regression.

a) Load the mpg dataset using seaborn. (*)



In [None]:
# Load the mpg dataset; the free-text car name is dropped because it is not used as a predictor
mpg = sns.load_dataset("mpg").drop(columns="name")
mpg

b) Use your data analysis skills to perform EDA. (*)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
mpg.describe()

In [None]:
# Dtypes and non-null counts per column: horsepower has six missing values
mpg.info()

In [None]:
# Scatter four candidate predictors (columns 2-5 of the frame) against mpg in a 2x2 grid
candidate_features = mpg.columns[2:6]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 6), dpi=100)

for feature, ax in zip(candidate_features, axes.ravel()):
    sns.scatterplot(x=feature, y="mpg", data=mpg, ax=ax)

fig.tight_layout()
# y > 1 lifts the title above the panels; assigning to _ hides the Text repr in the cell output
_ = fig.suptitle("Possible predictors for mpg", y=1.03, fontweight="bold")

In [None]:
# Pairwise relationships between the numeric columns; corner=True draws only the lower triangle
sns.pairplot(mpg, corner=True, height = 2)

c) Find out the missing values in the dataset and use a machine learning model to fill them in (imputation). (**)

In [None]:
# Dummy-code origin so that it can be used as a numeric predictor.
# The guard keeps the cell re-runnable: after the first run "origin" has already been
# replaced by its dummy columns, and encoding it again would raise a KeyError.
# dtype=float keeps every column numeric regardless of the pandas version's default
# dummy dtype, so a later `.values` yields a float array rather than an object array.
if "origin" in mpg.columns:
    mpg = pd.get_dummies(mpg, columns=["origin"], drop_first=True, dtype=float)
mpg

In [None]:
# Select the rows whose horsepower is missing and drop that (empty) column,
# leaving only the predictor columns for those rows
horsepower_missing = mpg["horsepower"].isna()
rows_to_impute = mpg[horsepower_missing].drop(columns="horsepower")
rows_to_impute

In [None]:
# Keep only the complete rows; horsepower is the target of the imputation model
# and every other column is a predictor
mpg_dropped_missing = mpg.dropna()
X = mpg_dropped_missing.drop(columns="horsepower")
y = mpg_dropped_missing["horsepower"]

In [None]:
# Fit a linear regression on the complete rows and use it to fill in the missing horsepower
model_lin_reg = LinearRegression()
model_lin_reg.fit(X, y)

# Predict from the predictor columns only. Dropping "horsepower" (if present) keeps the
# cell re-runnable: after the first run rows_to_impute already contains the imputed column,
# and passing it to predict() would give the model one feature too many.
features_to_impute = rows_to_impute.drop(columns="horsepower", errors="ignore")
rows_to_impute["horsepower"] = model_lin_reg.predict(features_to_impute)
rows_to_impute

In [None]:
# Stack the complete rows and the imputed rows, then restore the original row order
frames_to_merge = [mpg_dropped_missing, rows_to_impute]
mpg_imputed = pd.concat(frames_to_merge).sort_index()
mpg_imputed

d) Can you figure out a way to see if the values filled in are reasonable? (**)

I can, for example, check MAE and RMSE for how well the model predicts horsepower on rows where the true value is known. As can be seen below, the model is pretty good at predicting horsepower, so I believe that the imputed values are reasonable. However, this conclusion rests on the assumption that the values are missing at random.

Also, we should be cautious about using this approach on too many values: the imputed values lie exactly on the regression fit, so they carry none of the noise that real observations have.

In [None]:
# Hold out 30 % of the rows with known horsepower to evaluate the imputation model on unseen data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

In [None]:
# Refit the horsepower model on the training part only and predict the held-out rows
model_lin_reg = LinearRegression().fit(X_train, y_train)
y_pred = model_lin_reg.predict(X_test)

# Error metrics for the held-out rows
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
MAE = mean_absolute_error(y_test, y_pred)

# Report the errors next to the spread of the true values to put them in perspective
print(f"MAE: {MAE:.2f}")
print(f"RMSE: {RMSE:.2f}")
print(f"Horsepower range: {y_test.max() - y_test.min()}")
print(f"Horsepower SD: {y_test.std():.2f}")

e) Do a train|val|test split on the data and scale it properly. Test out which scaling method to use. (*)

In [None]:
# mpg is the target; every other column (including the imputed horsepower) is a feature
y = mpg_imputed["mpg"].values
X = mpg_imputed.drop(columns="mpg").values

# 70 % goes to training; the remaining 30 % is halved into validation and test
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)

tuple(split.shape for split in (X_train, X_val, X_test, y_train, y_val, y_test))

In [None]:
standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

# Fit each scaler on the training split only, then apply it unchanged to the validation split
X_train_stand = standard_scaler.fit_transform(X_train)
X_val_stand = standard_scaler.transform(X_val)
X_train_norm = min_max_scaler.fit_transform(X_train)
X_val_norm = min_max_scaler.transform(X_val)

# Same linear-regression baseline for both scalings: first standardised, then min-max.
# NOTE: least-squares predictions should not depend on an affine rescaling of the features,
# so the two printed lines are expected to agree up to rounding.
scaled_splits = ((X_train_stand, X_val_stand), (X_train_norm, X_val_norm))
for X_train_scaled, X_val_scaled in scaled_splits:
    model_lin_reg = LinearRegression().fit(X_train_scaled, y_train)
    y_pred = model_lin_reg.predict(X_val_scaled)
    MAE = mean_absolute_error(y_val, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_val, y_pred))
    print(MAE, RMSE)

f) Create an MLP with 1-3 hidden layers and test out different numbers of nodes. Choose the number of epochs you want to use throughout all experiments. Plot training losses and validation losses for the different configurations. (*)

In [None]:
def build_model_MLP(number_of_layers=1, number_of_nodes=20):
    """Build and compile an MLP for regression.

    Parameters
    ----------
    number_of_layers : int
        Number of hidden layers (at least 1).
    number_of_nodes : int
        Number of nodes in each hidden layer.

    Returns
    -------
    A compiled Sequential model with one linear output node, mean squared
    error loss and SGD with learning rate 0.01.

    Raises
    ------
    ValueError
        If number_of_layers is smaller than 1.
    """
    if number_of_layers < 1:
        raise ValueError("number_of_layers must be at least 1")

    model = Sequential(name="MLP")

    # One input per feature; the shape is passed as a tuple, which is the documented form
    model.add(InputLayer((X_train_stand.shape[1],)))

    # Every hidden layer uses number_of_nodes (the first one used to be fixed at 20, which
    # made the node setting irrelevant for 1-layer models). ReLU is needed because stacked
    # Dense layers without an activation collapse into a single linear model.
    for layer_number in range(1, number_of_layers + 1):
        model.add(Dense(number_of_nodes, activation="relu", name=f"Hidden_layer_{layer_number}"))

    model.add(Dense(1, name="Output_layer"))  # No activation function --> linear output for regression

    model.compile(loss="mean_squared_error", optimizer=SGD(learning_rate=.01))

    return model


# The builder has its own name so that it is not shadowed by the wrapper below.
# NOTE(review): no `epochs` is passed here, so the wrapper's default number of epochs is used,
# while the final model further down trains for 50 - confirm that this is intended.
model_MLP = KerasRegressor(build_fn=build_model_MLP, verbose=1)

In [None]:
# Two-step pipeline; the scaler slot starts empty and is filled in by the grid search
pipeline_steps = [("scaler", None), ("model_MLP", model_MLP)]
pipeline_MLP = Pipeline(steps=pipeline_steps)

# Candidate scalers plus the MLP settings, addressed as "<step name>__<parameter>"
param_grid_MLP = {
    "scaler": [StandardScaler(), MinMaxScaler()],
    "model_MLP__number_of_layers": [1, 2, 3],
    "model_MLP__number_of_nodes": [10, 50, 100],
}

# 5-fold cross-validation over every combination in the grid
CV_results_MLP = GridSearchCV(pipeline_MLP, param_grid_MLP, cv=5, verbose=1)

In [117]:
# Run the grid search on the unscaled training split; scaling happens inside the pipeline
CV_results_MLP.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the grid search
CV_results_MLP.best_params_

In [None]:
# Final model: one hidden layer with 20 nodes, trained on the standardised features.
# NOTE(review): this rebinds `model_MLP`, which earlier held the KerasRegressor wrapper used in
# the pipeline; re-run the wrapper cell before re-running the grid search cells.
model_MLP = Sequential(name="MLP")
model_MLP.add(InputLayer((X_train_stand.shape[1],)))  # One input per feature, given as a shape tuple
# ReLU makes the hidden layer non-linear; without an activation the network is just a linear model
model_MLP.add(Dense(20, activation="relu", name="Hidden_layer_1"))
model_MLP.add(Dense(1, name="Output_layer"))  # No activation function --> linear output for regression
model_MLP.compile(loss="mean_squared_error", optimizer=SGD(learning_rate=.01))

# Keep the History object returned by fit(): it holds the per-epoch training and validation loss
history = model_MLP.fit(
    X_train_stand, y_train, epochs=50, verbose=1, validation_data=(X_val_stand, y_val)
)

# Plot training loss against validation loss per epoch
loss_curves = pd.DataFrame(history.history)
loss_ax = loss_curves.plot()
loss_ax.set(xlabel="Epoch", ylabel="MSE loss", title="Training vs validation loss");

g) Now use early stopping to tune the number of epochs. (*)

h) Train on all training data and validation data. (*)

i) Predict on test data and evaluate. (*)

j) Can you create an MLP model that beats random forest for this dataset? (**)