In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.model_selection import cross_validate, train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from category_encoders import TargetEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imb_Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
from datacademy.modules import Module07

module = Module07()

In [None]:
df = pd.read_parquet("data/adult.parquet")

<hr>

## A. Data Understanding
First we will get acquainted with the data, for which you have to follow the steps outlined in `Easy-LMS`. In between steps we allow you to validate the shape of your data frame, which enables you to check whether you executed the previous steps correctly. To do this, simply pass the `list(df.shape)` into the checker function, for which the code will be supplied.

In [None]:
#TODO: Investigate the first rows of the data frame using .head()
...

In [None]:
#TODO: Analyse the numerical values of the data frame using .describe()
...

In [None]:
#TODO: Analyse the different columns of the data frame using .info()
...

In [None]:
#TODO: Generate a pairplot using the Seaborn library.
...

In [None]:
#TODO: Print a box plot for both the 'capital-gain' and 'capital-loss' columns.
for num_col in ['capital-gain', 'capital-loss']:
    ...

In [None]:
#TODO: Remove the outliers in the 'capital-gain' and 'capital-loss' columns.
print(df.shape)
df = df[df['capital-gain'] < ...]
df = df[df['capital-loss'] < ...]
print(df.shape)

#### A1 - Validate Data Frame
Submit the shape of the data frame in the checker function below. Make sure to pass it as a list, using `list(df.shape)`, as our checker function is built to work only with lists.

In [None]:
#TODO: Submit the data frame shape to analyse if the outlier removal step is executed correctly.
module.check("E3_A1", list(df.shape))

In [None]:
#TODO: Extract column names for the numerical and categorical columns.
numerical_columns = ...
categorical_columns = ...

In [None]:
#TODO: Loop through the categorical columns and print the number of unique categories.
for col in categorical_columns:
    print(f'{...}: {...}')

In [None]:
#TODO: Print the amount of unique values in the 'education_num' column.
...

In [None]:
#TODO: Print the first 10 rows of the 'education_num' and 'education' columns.
df[['...', '...']].head(10)

<hr>

## B. Data Preparation
Now that we have an understanding of our data, we can continue with preparing it. The steps to do this are outlined in `Easy-LMS`, so follow these accordingly. In between you can check your data frame shapes in a similar manner as before, to validate whether you executed the steps correctly.

In [None]:
#TODO: Remove the 'education_num' and 'fnlwgt' columns and print the first 5 rows.
# NOTE(review): DataFrame.drop returns a new frame by default; make sure the result
# ends up in `df` (reassign it or drop in place), otherwise df.head() below still
# shows the columns that were supposed to be removed.
df.drop(...)
df.head()

In [None]:
#TODO: Transform the categorical values of the 'sex' column into binary values (0/1).
# Number the categories in order of first appearance, then swap every value for its number.
genders = list(df['sex'].unique())
gender_codes = {gender: code for code, gender in enumerate(genders)}
df['sex'] = [gender_codes[value] for value in df['sex']]

In [None]:
#TODO: Transform the categorical values of the 'class' column into binary values (0/1).
class_labels = ...
df['class'] = ...

In [None]:
#TODO: Print the first 5 rows to investigate whether they show the correct 'sex' and 'class' columns.
df.head()

#### B1 | B2 - Validate Data Frame
Run the check functions below to check whether the `sex` and `class` columns are encoded correctly.

In [None]:
#TODO: Send the Counter of the 'sex' column to validate if it is constructed correctly.
module.check("E3_B1", dict(Counter(df['sex'])))

In [None]:
#TODO: Send the Counter of the 'class' column to validate if it is constructed correctly.
module.check("E3_B2", dict(Counter(df['class'])))

In [None]:
#TODO: Create two lists of column names that describe which encoding is applied.
target_encoding_columns = ['education', 'occupation', 'native-country']
onehot_encoding_columns = ['workclass', 'relationship', 'marital-status', 'race']

In [None]:
#TODO: Complete the function so that it returns a pipeline with a 'preprocess' and a 'clf' step.
def create_pipeline(target_enc_cols:list, onehot_enc_cols:list, scaler, clf) -> Pipeline:
    """
    Build a sklearn pipeline that preprocesses the columns and then applies a classifier.

    The column transformer below still contains exercise placeholders (`...`);
    they must be replaced with the encoder/scaler wiring before the pipeline is usable.

    Args:
        target_enc_cols (list): List of column names that need to be taken through Target Encoding.
        onehot_enc_cols (list): List of column names that need to be taken through One-Hot Encoding.
        scaler: Scaling transformer (callers in this notebook pass `MinMaxScaler()`), used to scale the data to a set range of values.
        clf: Classification model, which can be any model from the sklearn classification model catalog.

    Returns:
        Pipeline: Two-step pipeline, 'preprocess' (the column transformer) followed by 'clf' (the classifier).
    """
    preprocessor = make_column_transformer(
        (..., ...),
        (..., ...),
        remainder=...
    )
    
    return Pipeline(steps=[('preprocess', preprocessor), ('clf', clf)])

In [None]:
#TODO: Create and print a pipeline to check the architecture
# Build one example pipeline so its architecture can be inspected in the cell output.
pipe = create_pipeline(
    target_enc_cols=target_encoding_columns,
    onehot_enc_cols=onehot_encoding_columns,
    scaler=MinMaxScaler(),
    # random_state=0 follows the notebook convention of seeding every estimator that accepts it.
    clf=LogisticRegression(n_jobs=-1, random_state=0)
)

pipe

<hr>

## C|D. Modeling and Evaluation
Enough of the data preprocessing, it is time to develop and train some models! We will use the library `Scikit-Learn` to do so. This library allows the user to easily switch between models, as all models have a `.fit()` and `.predict()` function. Please make sure that during initialisation (if possible) you set:
* `n_jobs` = -1, to increase speed through `parallel computation`. <br>
* `random_state` = 0, to make the results reproducible. <br>

By following the steps outlined in `Easy-LMS` we will eventually develop three models, namely: 
* `Logistic Regression`; <br>
* `SVC`; <br>
* `RandomForestClassifier`.

In [None]:
#TODO: Split our data set into independent (X) and dependent (y) variables.
# Features are every column except the target; the target is the 'class' column.
X = df.drop(columns=['class'])
y = df['class']

In [None]:
#TODO: Print the first 5 rows of the X data set.
X.head()

In [None]:
#TODO: Print the first 5 rows of the y data set.
y.head()

#### C1 - Validate Data Frame
Run the check functions below to check whether the `independent (X)` and `dependent (y)` data sets are created correctly.

In [None]:
#TODO: Send the shapes of both the independent (X) and dependent (y) data sets to the checker function.
module.check("E3_C1", [list(X.shape), list(y.shape)])

In [None]:
#TODO: Fill the list below with all models | Make sure to add n_jobs and random_state where possible (hint: look at the documentation)
models = [
    ..., 
    ..., 
    ...
    ]

#TODO: Loop over the list of models, create a pipeline and execute 5-fold cross validation.
for model in models:
    # A fresh pipeline per model, so every classifier gets its own preprocessing steps.
    pipe = create_pipeline(
        target_enc_cols=...,
        onehot_enc_cols=...,
        scaler=...,
        clf = ...
    )

    cv_results = cross_validate(
        estimator=..., 
        X=..., y=..., 
        cv=..., scoring='...')
    
    # Report the per-fold test scores plus their mean and standard deviation, rounded to 3 decimals.
    print(f'{pipe["clf"].__class__.__name__} test scores: {[round(x,3) for x in cv_results["test_score"]]} --> Average: {round(np.mean(cv_results["test_score"]), 3)}, st.dev.: {round(np.std(cv_results["test_score"]), 3)}')

#### D1 - Validate model performances
Run the check functions below to check whether the `model performances` are similar to ours.

In [None]:
#TODO: Fill in the performance of the models and send it to the checker function.
model_performances = {
    "LogisticRegression": ...,
    "SVC": ...,
    "RandomForestClassifier": ...
}

module.check("E3_D1", model_performances)

In [None]:
#TODO: Create a stratified train/test split | Please make sure to include the random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

#### D2 - Validate train/test split
Run the check functions below to check whether the `train/test split` is created correctly.

In [None]:
#TODO: Send the counters of both y_train and y_test to the checker function.
module.check("E3_D2", [dict(Counter(y_train)), dict(Counter(y_test))])

In [None]:
#TODO: Fill the list below with all models | Make sure to add n_jobs and random_state where possible (hint: Look at documentation)
models = [
    # random_state=0 on every estimator that accepts it, as the notebook instructions require.
    LogisticRegression(n_jobs=-1, random_state=0),
    SVC(random_state=0),
    RandomForestClassifier(n_jobs=-1, random_state=0)
]

# Display names for the encoded classes; assumes 0 -> "<= 50k" and 1 -> "> 50k",
# the same mapping the classification report cell uses — verify against the 'class' encoding.
labels = ["<= 50k", "> 50k"]

#TODO: Loop over models, train and predict and visualize results in Confusion Matrices.
for model in models:
    pipe = create_pipeline(
        target_enc_cols = target_encoding_columns,
        onehot_enc_cols = onehot_encoding_columns,
        scaler = MinMaxScaler(),
        clf = model
    )

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Pin the class order explicitly so the matrix rows/columns always line up with
    # `labels`, instead of depending on which classes happen to occur in the data.
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)

    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(df_cm, annot=True, fmt='g', cbar=False, ax=ax)
    # confusion_matrix puts the true classes on the rows and the predictions on the columns.
    ax.set(
        title=f"{pipe['clf'].__class__.__name__}",
        xlabel="Predicted label",
        ylabel="True label"
    )
    plt.show()

In [None]:
#TODO: Fill the list below with all models | Make sure to add n_jobs and random_state where possible (hint: Look at documentation)
models = [
    # random_state=0 on every estimator that accepts it, as the notebook instructions require.
    LogisticRegression(n_jobs=-1, random_state=0),
    SVC(random_state=0),
    RandomForestClassifier(n_jobs=-1, random_state=0)
]

#TODO: Loop over models, train and predict and create a classification report.
for model in models:
    pipe = create_pipeline(
        target_enc_cols = target_encoding_columns,
        onehot_enc_cols = onehot_encoding_columns,
        scaler = MinMaxScaler(),
        clf = model
    )

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    print(f"-- Model: {pipe['clf'].__class__.__name__} -- \n")
    # labels pins the class order so that target_names line up with the encoded classes 0 and 1.
    print(classification_report(
        y_true = y_test,
        y_pred = y_pred,
        labels = [0, 1],
        target_names = ["<= 50k", "> 50k"]
    ))
    print('\n')

<hr>

## E. Improve Model Performance
To improve the model performance we will first deal with the fact that our data set is `imbalanced`. This means that one of the two classes occurs far more often than the other. If this difference becomes large, it might `bias` our results. For example, when a single class is present in 90% of the rows, a model could reach an accuracy of 90% by always predicting that class.

To fix the imbalanced data set we will use the `imblearn` library. We will apply the `RandomOverSampler` and `RandomUnderSampler`, which are the simplest ways to increase or decrease the number of occurrences of a given class. Please follow the steps outlined in `Easy-LMS`, which will guide you through the entire process.

In [None]:
print(f'Before applying balancing, our counts look like: {Counter(y)}')

#TODO: Apply Random Over Sampling | Please make sure random_state is set to 0.
# NOTE(review): this resamples the full data set (X, y), not only the training split.
# If the resampled data is later used for cross validation, copies of the same row can
# end up in both the training and the validation folds and inflate the scores — confirm this is intended.
ros = RandomOverSampler(random_state=0)
X_ros, y_ros = ros.fit_resample(X, y)
print(f'After applying ROS, our counts now look like: {Counter(y_ros)}')

#TODO: Apply Random Under Sampling | Please make sure random_state is set to 0.
rus = RandomUnderSampler(random_state=0)
X_rus, y_rus = rus.fit_resample(X, y)
print(f'After applying RUS, our counts now look like: {Counter(y_rus)}')

In [None]:
#TODO: Set the distributions for our Cross Validated Randomized Search.
distributions = {
    'clf__n_estimators': np.arange(start=..., stop=..., step=..., dtype=...),
    'clf__max_depth': list(np.arange(start=..., stop=..., step=..., dtype=...)) + [...],
    'clf__max_features': np.arange(start=..., stop=..., step=..., dtype=...),
    'clf__criterion': ['...','...'],
    'clf__min_samples_leaf': np.arange(start=..., stop=..., step=..., dtype=...),
    'clf__min_samples_split': np.arange(..., ..., step=...)
}

In [None]:
#TODO: Create the pipeline with Random Forest as the classifier
pipe = create_pipeline(
        target_enc_cols=...,
        onehot_enc_cols=...,
        scaler=...,
        clf = ...(n_jobs=-1, random_state=0)
    )

In [None]:
#TODO: Execute Cross Validated Randomized Search. 
# Oversampling happens inside the pipeline, so within every CV split only the training
# folds are resampled. Fitting the search on a pre-oversampled data set would place
# duplicates of the same row in both the training and the validation folds, which
# inflates the scores. The search also runs on the training split only, so the
# held-out test set stays untouched.
# The original step names ('preprocess', 'clf') are kept, so the 'clf__...' keys in
# `distributions` remain valid.
search_pipe = imb_Pipeline(
    steps=[('ros', RandomOverSampler(random_state=0))] + list(pipe.steps)
)

random_search_cv = RandomizedSearchCV(
    estimator=search_pipe,
    param_distributions=distributions,
    n_iter=100,
    scoring='f1',
    n_jobs=-1,
    cv=3,
    random_state=0
)
random_search_cv.fit(X_train, y_train)

#TODO: Transform our results into a Pandas Data Frame.
# One row per sampled parameter combination: the sampled values plus the mean and
# standard deviation of the test score over the CV folds.
cv_results = random_search_cv.cv_results_
tuned_params = [
    'max_depth',
    'n_estimators',
    'max_features',
    'criterion',
    'min_samples_leaf',
    'min_samples_split'
]

gridsearchResults = pd.DataFrame(
    data={
        **{name: cv_results[f'param_clf__{name}'] for name in tuned_params},
        'mean_test_score': cv_results['mean_test_score'],
        'std_test_scores': cv_results['std_test_score']
    })

In [None]:
gridsearchResults.sort_values(by="mean_test_score", ascending=False)