In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from collections import Counter

In [None]:
from datacademy.modules import Module07

module = Module07()

In [None]:
df = pd.read_parquet("data/Metro_Interstate_Traffic_Volume.parquet")

<hr>

## A. Data Understanding
First we will get acquainted with the data, for which you have to follow the steps outlined in `Easy-LMS`. In between steps we allow you to validate the shape of your data frame, which enables you to check whether you executed the previous steps correctly. To do this, simply pass the `list(df.shape)` into the checker function, for which the code will be supplied.

In [None]:
#TODO: Investigate the first rows of the data frame using .head()
...

In [None]:
#TODO: Analyse the numerical values of the data frame using .describe()
...

In [None]:
#TODO: Extract column names for the numerical and categorical columns.
numerical_columns = ...
categorical_columns = ...

In [None]:
#TODO: Generate a pairplot using the Seaborn library.
...

In [None]:
#TODO: Remove the outliers in the 'temp' and 'rain_1h' columns.
df = df[df['temp'] > ...]
df = df[df['rain_1h'] < ...]

#### A1 - Validate Data Frame
Submit the shape of the data frame in the checker function below. Make sure to pass it as a list, using `list(df.shape)`, as our checker function is built to work only with lists.

In [None]:
#TODO: Evaluate the shape of the data frame with the checker function below.
module.check("E2_A1", list(df.shape))

In [None]:
#TODO: Again generate a pairplot using the Seaborn library to see the difference.
...

In [None]:
#TODO: Print a box plot for all the numerical columns in the data frame.
for num_col in numerical_columns:
    plt.subplots()
    plt.boxplot(x=...)
    plt.title(f"Boxplot: {num_col}")
    plt.show()

In [None]:
#TODO: Print all categorical columns of the data frame.
...

In [None]:
#TODO: Print the number of unique values in all category columns.
for col in categorical_columns:
    print(f"{...} --> {...}")

In [None]:
#TODO: Print the unique values in the 'holiday' column using the .unique() function of Pandas.
...

<hr>

## B. Data Preparation
Now that we have an understanding of our data, we can continue with preparing our data. The steps to do this are outlined in `Easy-LMS`, so follow these accordingly. In between you can check your data frame shapes in a similar manner as before, to validate if you executed the steps correctly.

In [None]:
#TODO: Run the script below to transform the 'date_time' column into a useable format.
df['date_time'] = pd.to_datetime(df['date_time'], format="%Y-%m-%d %H:%M:%S")

# Calendar date and wall-clock time are kept as separate columns for inspection.
df['date'] = df['date_time'].dt.date
df['time'] = df['date_time'].dt.time

# Day of week (Monday=0 ... Sunday=6) and hour of day (0-23) are read straight from the
# vectorized .dt accessor instead of calling a Python lambda once per row.
# The explicit cast keeps the int64 dtype that the row-wise version produced (pandas 2.x
# returns int32 from these accessors), so dtype-based column selection later is unaffected.
df['DoW'] = df['date_time'].dt.weekday.astype('int64')
df['HoD'] = df['date_time'].dt.hour.astype('int64')

df.head()

#### B1 | B2 - Validate Data Frame
Run the check functions below to check whether the `Day of Week (DoW)` and `Hour of Day (HoD)` columns are constructed correctly.

In [None]:
#TODO: Use the checker function to validate the 'DoW' column.
module.check("E2_B1", dict(Counter(df['DoW'])))

In [None]:
#TODO: Use the checker function to validate the 'HoD' column.
module.check("E2_B2", dict(Counter(df['HoD'])))

In [None]:
#TODO: Create a column named 'is_holiday' which contains a 1 if the row concerns a holiday.
# notnull() is True for every non-missing entry, so any non-null 'holiday' value becomes 1.
# NOTE(review): if non-holidays are stored as a placeholder string (e.g. "None") instead of
# as missing values, they would be flagged as holidays too - confirm with the unique-values check below.
df['is_holiday'] = df['holiday'].notnull().astype(int)

#### B3 - Validate Data Frame
Run the check functions below to check whether the `is_holiday` column is constructed correctly.

In [None]:
#TODO: Use the checker function to validate the 'is_holiday' column.
module.check("E2_B3", dict(Counter(df['is_holiday'])))

In [None]:
#TODO: Print the unique holidays for both the 'is_holiday' equal to 1 and 0 separately.
print(...)
print(...)

In [None]:
#TODO: Remove the columns: 'holiday', 'date_time', 'date', 'time' and 'weather_description'.
df.drop(...)
df.head()

In [None]:
#TODO: Split the data frame in independent (X) and dependent (y) data sets.
X = ...
y = ...

In [None]:
#TODO: Create a train-test split.
X_train, X_test, y_train, y_test = ...

#### B4 - Validate Train and Test data.
Run the check functions below to check whether the `X_train`, `X_test`, `y_train` and `y_test` data sets are constructed correctly.

In [None]:
#TODO: Validate the created train and test set using the checker function.
module.check("E2_B4", [X_train.shape, X_test.shape, y_train.shape, y_test.shape])

In [None]:
#TODO: Again extract column names for the numerical and categorical columns to include changes made.
numerical_columns = ...
categorical_columns = ...

In [None]:
#TODO: Split the categorical and numerical columns in both the train and test set.
# Both feature sets are cut with the same two column lists, categorical part first.
cat_cols_train, num_cols_train = X_train[categorical_columns], X_train[numerical_columns]
cat_cols_test, num_cols_test = X_test[categorical_columns], X_test[numerical_columns]

In [None]:
#TODO: Apply Min-Max Scaling to the numerical columns, prevent data leakage by only fitting on the train data.
scl = ...

scaled_train = ...

scaled_test = ...

#### B5 - Validate Scaled Data.
Run the check functions below to check whether the `scaled_train` and `scaled_test` data sets are constructed correctly.

In [None]:
#TODO: Send the statistics of both the train and test set to evaluate their validity.
def _scaled_stats(scaled):
    """Summarise a scaled data set: overall minimum, mean of the column means, overall maximum."""
    column_minima = scaled.min(axis=0)
    column_means = np.mean(scaled, axis=0)
    column_maxima = scaled.max(axis=0)
    return {
        "min": min(column_minima),
        "avg": np.mean(column_means),
        "max": max(column_maxima),
    }

train_stats = _scaled_stats(scaled_train)
test_stats = _scaled_stats(scaled_test)

module.check("E2_B5", [train_stats, test_stats])

In [None]:
#TODO: Apply onehot-encoding to the categorical columns, again prevent data leakage by only fitting on the train data.
enc = ...

OH_train = ...

OH_test = ...

#### B6 - Validate Encoded Data.
Run the check functions below to check whether the `OH_train` and `OH_test` data sets are constructed correctly.

In [None]:
#TODO: Run the checker function below to evaluate if the encoding is done correctly.
module.check("E2_B6", [int(np.sum(OH_train)), int(np.sum(OH_test))])

In [None]:
#TODO: Recombine the numerical and categorical columns of both the train and test set.
X_train = ...
X_test = ...

In [None]:
#TODO: Retrieve the column names and create a prepared_df to visualize the data.
column_names = ...
prepared_df = pd.DataFrame(data={column_names[i]: X_train[:,i] for i in range(len(column_names))})
prepared_df.head()

<hr>

## C. Modeling and Evaluation
Enough of the data preprocessing, it is time to develop and train some models! We will use the library `Scikit-Learn` to do so. This library allows the user to easily switch between models, as all models have a `.fit()` and `.predict()` function. Please make sure that during initialisation (if possible) you set:
* `n_jobs` = -1, to increase speed through `parallel computation`. <br>
* `random_state` = 0, to fixate the end result. <br>

By following the steps outlined in `Easy-LMS` we will eventually develop three models, namely: 
* `Linear Regression`; <br>
* `Random Forest Regressor`; <br>
* `Support Vector Regressor (SVR)`.

In [None]:
#TODO: Fill the list below with all models | Make sure to add n_jobs and random_state where possible (hint: Look at documentation)
models = [
    LinearRegression(n_jobs=-1),  # only n_jobs is passed here
    SVR(),  # constructed with its defaults
    RandomForestRegressor(n_jobs=-1, random_state=0)
]

#TODO: Loop over the list of models and evaluate their performance using the r2-score.
# Each model is fitted, used for prediction and scored in turn; the `...` entries are
# exercise placeholders that still have to be filled in.
for model in models:
    model.fit(...)
    y_pred = ...
    performance = ...
    # type(model).__name__ is the class name, which matches the keys of the r2_scores
    # dictionary in the validation cell that follows.
    print(f'r2_score {type(model).__name__}: {round(performance, 2)}')

#### C1 - Validate R2 Scores.
Fill in the `r2 scores` that you printed above in the check functions below. It will validate whether the performance of the trained models correspond with our answers.

In [None]:
#TODO: Fill the dictionary below and evaluate if your models are configured and trained correctly.
r2_scores = {
    "LinearRegression": ...,
    "SVR": ...,
    "RandomForestRegressor": ...
    }

module.check("E2_C1", r2_scores)

<hr>

## D. Modeling and Evaluation (Advanced)
The process we just executed works, but is prone to potential `biases` in the data. For example, a favorable training set could increase the r2 score significantly. <br>

To prevent this from happening we will implement `k-fold Cross Validation`, which trains the model on multiple train/test splits and returns an average result. However, as our process contained a lot of manual steps, executing these different splits by hand would be a cumbersome task. <br>

For this reason we will replace our manual steps with a so-called `Pipeline`, which we construct using `Scikit-Learn`. This will enable us to evaluate the models using the `cross_validate()` function, also from Scikit-learn.

In [None]:
#TODO: Finalize the function below to return a pipeline containing preprocessing steps and a regression model.
def create_pipeline(scl, enc, reg) -> Pipeline:
    """Take the scaler, encoder and regressor and create and return a sklearn pipeline.

    Args:
        scl (transformer): Scaling module, used to scale the numerical data to a set range of values
            (the notebook passes MinMaxScaler()).
        enc (transformer): Encoding module, used to transform categorical values to a workable format
            (the notebook passes OneHotEncoder()).
        reg (regressor): Regression model, which can be any model from the sklearn regression model catalog
            (the notebook passes RandomForestRegressor(...)).

    Returns:
        Pipeline: Pipeline containing all needed preprocessing steps and Regression model.
    """
    # Exercise placeholders: one sub-pipeline per column type, to be filled in.
    numerical_pipe = ...
    categorical_pipe = ...

    # Exercise placeholder: combines both sub-pipelines into one preprocessing step.
    preprocessor = ...

    # The regressor step has to be named "reg": later cells address it as
    # "reg__<parameter>" in the grid search and as pipeline["reg"] for the feature importances.
    return Pipeline(steps=[("...", ...), ("...", ...)])

In [None]:
#TODO: Create an example pipeline and visualize it.
# NOTE(review): OneHotEncoder() is left at its defaults. With scikit-learn's default
# handle_unknown='error', a validation fold holding a category that is absent from its
# training fold would fail to transform - confirm this cannot happen for this data,
# or pass handle_unknown='ignore'.
pipe = create_pipeline(
    scl=MinMaxScaler(),
    enc=OneHotEncoder(),
    reg=RandomForestRegressor(random_state=0, n_jobs=-1)
)

#TODO: Apply 5-fold cross validation on the created pipeline.
# The `...` arguments are exercise placeholders; the result is read back through
# cv_results['test_score'] in the next cell.
cv_results = cross_validate(
    estimator=..., 
    X=..., y=..., 
    cv=..., scoring='...', 
    n_jobs=...
    )

In [None]:
#TODO: Print the model performance using the code below.
# cross_validate() stores one score per fold under the 'test_score' key.
fold_scores = cv_results['test_score']

# Rounded scores are converted to plain Python floats so the list prints as bare numbers;
# NumPy 2 would otherwise render each list element as np.float64(...).
print(f"Performance of different folds: {[float(round(score, 2)) for score in fold_scores]}")
print(f"Average performance: {round(np.mean(fold_scores), 2)}")
print(f"Standard deviation: {round(np.std(fold_scores), 2)}")

#### D1 - Validate 5-fold Cross Validation Scores.
Fill in the scores below to check whether your `cross_validate()` run returns similar values as our algorithms did.

In [None]:
#TODO: Fill the dictionary below and validate if you executed cross validation successfully.
cross_validate_scores = {
    "Fold_Performance": [..., ..., ..., ..., ...],
    "Average_Performance": ...,
    "Standard_Deviation": ...
}

module.check("E2_D1", cross_validate_scores)

<hr>

## E. Hyperparameter Optimization:
Through elaborate testing of different models on our data set we define which model best suits the underlying data set. Normally these comparisons are done with the `hyperparameters` set to the `default` values. However, when we intend to put a model into operation, we want a model that is optimized towards the application. This can be attained through `hyperparameter optimization`, which we will conduct using `Grid Search`.

In [None]:
#TODO: Develop parameter dictionary, create pipeline and apply Cross Validated Grid Search
parameters = {
    "reg__n_estimators": [...], 
    "reg__max_depth": [...]
    }

pipe = create_pipeline(
    scl=MinMaxScaler(),
    enc=OneHotEncoder(),
    reg=RandomForestRegressor(random_state=0, n_jobs=-1)
)

grid_search = GridSearchCV(
    estimator=..., 
    param_grid=..., 
    cv=..., scoring="...", 
    n_jobs=...
    )

grid_search.fit(X, y)

In [None]:
#TODO: Print the keys of the resulting dictionary using '.keys()'
grid_search.cv_results_.keys()

In [None]:
#TODO: Create a Pandas data frame with the grid search performance.
# Output column name -> key in grid_search.cv_results_; the dict order fixes the column order.
result_columns = {
    "max_depth": "param_reg__max_depth",
    "n_estimators": "param_reg__n_estimators",
    "mean_test_score": "mean_test_score",
    "std_test_scores": "std_test_score",
    "mean_fit_time": "mean_fit_time",
    "mean_score_time": "mean_score_time",
}
gridsearch_results = pd.DataFrame(
    data={column: grid_search.cv_results_[key] for column, key in result_columns.items()}
)
gridsearch_results

#### E1 - Validate Grid Search Results.
Fill in the requested values of the best performing configuration based on `mean_test_score` below.

In [None]:
#TODO: Fill in the dictionary below and evaluate if you executed grid search successfully.
best_performing_configuration = {
    "max_depth" : None,
    "n_estimators" : 1000
}

module.check("E2_E1", best_performing_configuration)

In [None]:
#TODO: Fit a new pipeline using the optimized parameter settings.
# Random forest configured with the settings entered in the validation cell above.
optimized_reg = RandomForestRegressor(
    n_estimators=1000, max_depth=None, random_state=0, n_jobs=-1
)

# Same scaler and encoder as before; the pipeline is fitted on the complete data set.
optimized_pipe = create_pipeline(
    scl=MinMaxScaler(), enc=OneHotEncoder(), reg=optimized_reg
).fit(X, y)

optimized_pipe

In [None]:
#TODO: Use the function below to print the feature importances of our model.
def plot_feature_importance(feature_importance: np.ndarray, feature_names: np.ndarray) -> None:
    """Draw a horizontal bar chart of feature importances, largest value first.

    Args:
        feature_importance (np.ndarray): One importance value per feature.
        feature_names (np.ndarray): Label of each feature, in the same order and of the
            same length as `feature_importance`.

    Returns:
        None: The chart is drawn on a newly created matplotlib figure.
    """
    # sort_values returns a new, sorted frame; no in-place mutation is needed.
    fi_df = pd.DataFrame(
        data={
            "feature_names": feature_names,
            "feature_importance": feature_importance,
        }
    ).sort_values(by=["feature_importance"], ascending=False)

    # Explicit Axes object instead of the implicit pyplot "current figure" state.
    _, ax = plt.subplots()
    sns.barplot(data=fi_df, x="feature_importance", y="feature_names", ax=ax)
    ax.set_title("Random Forest Regressor - Feature Importances")
    ax.set_xlabel("Feature Importances")
    ax.set_ylabel("Feature Name")

# Importances are read from the pipeline step named "reg"; the labels come from the
# column_names built during the manual preprocessing in section B.
# NOTE(review): assumes the pipeline's preprocessor emits features in the same order as
# column_names - confirm, otherwise bars are labelled with the wrong feature.
plot_feature_importance(
    feature_importance=optimized_pipe["reg"].feature_importances_,
    feature_names=column_names
)

In [None]:
#TODO: Print the actual numerical values for the feature importances.
feature_importances = optimized_pipe['reg'].feature_importances_
# One line per feature: its name (looked up by position) followed by its importance.
for position, importance in enumerate(feature_importances):
    print(f"{column_names[position]} --> {importance}")