# PART 0. **Import external Packages**

In [None]:
import json
import calendar

# Math
import numpy as np
import math
import warnings

warnings.filterwarnings("ignore")

# Data Processing
import pandas as pd
import scipy.stats as stats
import geopandas as geopd
import country_converter as coco
from functools import reduce

from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer
from datetime import datetime, timedelta, date

# Data Visualization
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt

# Machine Learning Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
)
import xgboost as xgb

# Optimization
from sklearn.model_selection import KFold
from sklearn.model_selection import (
    RepeatedStratifiedKFold,
    GridSearchCV,
    cross_val_score,
)

# Evaluation
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    make_scorer,
    roc_curve,
    f1_score,
)


## **Import Datasets**

In [None]:
# Load every table of the 365 database from its CSV export.
exam_info = pd.read_csv("365_database/365_exam_info.csv")
course_info = pd.read_csv("365_database/365_course_info.csv")
student_info = pd.read_csv("365_database/365_student_info.csv")
course_ratings = pd.read_csv("365_database/365_course_ratings.csv")
student_learning = pd.read_csv("365_database/365_student_learning.csv")
student_exams = pd.read_csv("365_database/365_student_exams.csv")
quiz_info = pd.read_csv("365_database/365_quiz_info.csv")
student_quizzes = pd.read_csv("365_database/365_student_quizzes.csv")
student_hub_questions = pd.read_csv("365_database/365_student_hub_questions.csv")
student_engagement = pd.read_csv("365_database/365_student_engagement.csv")
student_purchases = pd.read_csv("365_database/365_student_purchases.csv")

# The country polygons are read twice: as a GeoDataFrame (merged with the
# registrations later) and as a plain GeoJSON dict (passed to plotly).
world_df = geopd.read_file("365_database/countries.geojson")
with open("365_database/countries.geojson", "r") as geojson_file:
    world_json = json.load(geojson_file)

# SECURITY NOTE(review): eval() executes whatever the file contains. It is
# kept because the on-disk format of state_id_map.json cannot be confirmed
# from here; if the file is strict JSON, replace this with json.load().
with open('365_database/state_id_map.json') as id_map_file:
    state_id_map = eval(id_map_file.read())

# Give every GeoJSON feature a sequential integer "id"; the choropleth calls
# in this notebook pass locations="id".
features = world_json["features"]
for position, feature in enumerate(features):
    feature["id"] = position
id_count = len(features)

### **Tables OVERVIEW**

- **exam_info:** 'exam_id', 'exam_category', 'exam_duration'
- **course_info:** 'course_id', 'course_title'
- **student_info:** 'student_id', 'student_country', 'date_registered'
- **course_ratings:** 'course_id', 'student_id', 'course_rating', 'date_rated'
- **student_learning:** 'student_id', 'course_id', 'minutes_watched', 'date_watched'
- **student_exams:** 'exam_attempt_id', 'student_id', 'exam_id', 'exam_result', 'exam_completion_time', 'date_exam_completed'
- **quiz_info:** 'quiz_id', 'question_id', 'answer_id', 'answer_correct'
- **student_quizzes:** 'student_id', 'quiz_id', 'question_id', 'answer_id'
- **student_hub_questions:** 'hub_question_id', 'student_id', 'date_question_asked'
- **student_engagement:** 'engagement_id', 'student_id', 'engagement_quizzes', 'engagement_exams', 'engagement_lessons', 'date_engaged'
- **student_purchases:** 'purchase_id', 'student_id', 'purchase_type', 'date_purchased'

## **Functions**

In [None]:
def filter_by_dates(filt_df, range_dates, date_column_name):
    """Return the rows of ``filt_df`` whose date falls inside ``range_dates``.

    Args:
        filt_df: DataFrame holding a date-like column.
        range_dates: Two-item sequence ``[start, end]``. Both bounds are
            inclusive and are parsed with ``pd.to_datetime``.
        date_column_name: Name of the column to filter and sort on.

    Returns:
        A new DataFrame with the matching rows, sorted by date ascending.

    Note:
        The date column of ``filt_df`` itself is converted to datetime in
        place, so the caller's frame is modified. This is kept on purpose:
        other cells reuse the same frames afterwards.
    """
    filt_df[date_column_name] = pd.to_datetime(filt_df[date_column_name])
    start = pd.to_datetime(range_dates[0])
    end = pd.to_datetime(range_dates[1])
    in_range = (filt_df[date_column_name] >= start) & (
        filt_df[date_column_name] <= end
    )
    # Sort into a new frame instead of sorting the filtered slice in place,
    # which is a chained assignment on a selection of the input.
    return filt_df.loc[in_range].sort_values([date_column_name], ascending=True)

In [None]:
def grouped_df(df, time_column_name, range_dates, grouper, grouper_metric):
    """Filter ``df`` to a date range and aggregate it per ``grouper``.

    Args:
        df: DataFrame holding a date-like column.
        time_column_name: Name of the date column used for filtering.
        range_dates: Two-item sequence ``[start, end]``, both inclusive.
        grouper: Column name (or list of names) passed to ``groupby``.
        grouper_metric: Aggregation to apply: ``"sum"``, ``"mean"`` or
            ``"count"``.

    Returns:
        When the two bounds differ: the aggregated frame with the group keys
        restored as columns. When both bounds are equal: the filtered rows,
        sorted by date and not aggregated.

    Raises:
        ValueError: If aggregation is requested with an unsupported
            ``grouper_metric``.

    Note:
        The date column of ``df`` is converted to datetime in place.
    """
    df[time_column_name] = pd.to_datetime(df[time_column_name])
    start = pd.to_datetime(range_dates[0])
    end = pd.to_datetime(range_dates[1])
    in_range = (df[time_column_name] >= start) & (df[time_column_name] <= end)
    filtered_df = df.loc[in_range]

    # A single-day range is returned as plain sorted rows.
    if range_dates[0] == range_dates[1]:
        return filtered_df.sort_values([time_column_name], ascending=True)

    groups = filtered_df.groupby(grouper)
    if grouper_metric == "sum":
        aggregated = groups.sum()
    elif grouper_metric == "mean":
        aggregated = groups.mean()
    elif grouper_metric == "count":
        aggregated = groups.count()
    else:
        # Previously this fell through to an unbound local variable.
        raise ValueError(
            "grouper_metric must be 'sum', 'mean' or 'count', got %r"
            % (grouper_metric,)
        )
    return aggregated.reset_index()

def plot_map(df, time_column_name, range_dates, grouper):
    """Build the registrations choropleth for a date range.

    The frame is filtered and summed with ``grouped_df``; each row then gets
    the ``id`` that ``state_id_map`` holds for its ``Country_Name`` so it can
    be matched against the features of ``world_json``.

    Args:
        df: Frame with ``Country_Name`` and ``N° of Registrations`` columns.
        time_column_name: Name of the date column used for filtering.
        range_dates: Two-item sequence ``[start, end]``.
        grouper: Column name(s) to group by before summing.

    Returns:
        The plotly figure created by ``px.choropleth_mapbox``.
    """
    totals = grouped_df(df, time_column_name, range_dates, grouper, "sum")
    totals["id"] = totals["Country_Name"].map(lambda name: state_id_map[name])

    # Other colour scales tried: aggrnyl, bluered, cividis, darkmint, deep.
    # Other map styles tried: open-street-map, carto-positron, stamen-toner.
    return px.choropleth_mapbox(
        totals,
        geojson=world_json,
        locations="id",
        color="N° of Registrations",
        hover_name="Country_Name",
        hover_data={'N° of Registrations': True, 'id': False},
        title="Alumni Registrations around the Globe",
        color_continuous_scale="thermal",
        mapbox_style="white-bg",
        zoom=0.3,
        opacity=0.9,
    )


# PART 1. **Exploratory Data Analysis (EDA)**

## **Student Info**

In [None]:
# Count registrations per (country, registration date) pair.
cc = coco.CountryConverter()
merge_1 = student_info.groupby(['student_country', 'date_registered']).count().reset_index()
# reset_index() leaves the index unnamed, so these two lines only clear the
# axis names.
merge_1.columns.name = merge_1.index.name
merge_1.index.name = None
# Convert the country values to ISO3 codes with country_converter.
merge_1['student_country'] = cc.pandas_convert(series=merge_1['student_country'], to='ISO3')
# Assign the result back instead of calling replace(inplace=True) on the
# column selection: that chained form does not update the frame when pandas
# copy-on-write is active.
# NOTE(review): "-99" is presumably the ISO_A3 placeholder the geojson uses
# for this territory -- verify against countries.geojson.
merge_1['student_country'] = merge_1['student_country'].replace(to_replace="XKX", value="-99")

In [None]:
# Join the registration counts with the country polygons on the ISO3 code.
student_info_map = pd.merge(
    merge_1, world_df, how="outer", left_on="student_country", right_on="ISO_A3"
)
# Keep only rows that have both a registration count and a polygon.
student_info_map = student_info_map.dropna(subset=["student_id", "geometry"])
student_info_map = student_info_map.drop(columns='student_country')
# Positional rename: relies on the column order produced by the merge above.
student_info_map.columns = ['Registration_Date', 'N° of Registrations', 'Country_Name', 'Country_ISO_A3', 'geometry']
student_info_map["id"] = student_info_map["Country_Name"].map(lambda name: state_id_map[name])


student_info_map.to_csv('map_df.csv')

In [None]:
id_count = 0
for feature in world_json["features"]:
    feature["id"] = id_count
    id_count += 1

In [None]:
student_info_map

In [None]:
def filter_by_dates(filt_df, range_dates, date_column_name):
    """Return the rows of ``filt_df`` whose date falls inside ``range_dates``.

    Args:
        filt_df: DataFrame holding a date-like column.
        range_dates: Two-item sequence ``[start, end]``. Both bounds are
            inclusive and are parsed with ``pd.to_datetime``.
        date_column_name: Name of the column to filter and sort on.

    Returns:
        A new DataFrame with the matching rows, sorted by date ascending.

    Note:
        The date column of ``filt_df`` itself is converted to datetime in
        place, so the caller's frame is modified. This is kept on purpose:
        other cells reuse the same frames afterwards.
    """
    filt_df[date_column_name] = pd.to_datetime(filt_df[date_column_name])
    start = pd.to_datetime(range_dates[0])
    end = pd.to_datetime(range_dates[1])
    in_range = (filt_df[date_column_name] >= start) & (
        filt_df[date_column_name] <= end
    )
    # Sort into a new frame instead of sorting the filtered slice in place,
    # which is a chained assignment on a selection of the input.
    return filt_df.loc[in_range].sort_values([date_column_name], ascending=True)

In [None]:
date_column_name = 'Registration_Date'
range_dates = ["2022-01-01", "2022-10-20"]
colors = "N° of Registrations"
hover_names =  "Country_Name"

filtered_df = filter_by_dates(student_info_map, range_dates, date_column_name)
df_map = filtered_df.groupby(["Country_Name", "Country_ISO_A3"]).sum().reset_index()
df_map["id"] = df_map["Country_Name"].apply(lambda x: state_id_map[x])
df_map

fig = px.choropleth_mapbox(
        df_map,
        locations="id",
        geojson=world_json,
        color=colors,
        hover_name=hover_names,
        hover_data={colors:True, 'id': False},
        title="Alumni Registrations around the Globe",
        color_continuous_scale="thermal",  # aggrnyl - bluered - cividis - darkmint - deep - thermal
        mapbox_style="white-bg",  # open-street-map - carto-positron - white-bg - stamen-toner
        zoom=0,
        opacity=0.9,
    )

fig

In [None]:
df_map

## **Courses DF**

In [None]:
def courses_popularity(range_dates):
    """Summarise ratings and watch time per course for a date range.

    Reads the module-level ``course_ratings``, ``student_learning`` and
    ``course_info`` frames.

    Args:
        range_dates: Two-item sequence ``[start, end]`` handed to
            ``filter_by_dates``.

    Returns:
        One row per course, sorted by ``popularity_2`` descending, with:
        ``avg_course_rating``, ``tot_minutes_watched``,
        ``avg_minutes_watched``, ``times_watched``, the ``course_info``
        columns, the title flags ``intro_word`` / ``ml`` and the derived
        ``popularity_1``, ``popularity_2``, ``popularity_3`` and
        ``popularity_normal`` (min-max scaled ``popularity_1``).
    """
    # Filter each source once; the original ran the same filter three times
    # on student_learning.
    ratings = filter_by_dates(course_ratings, range_dates, 'date_rated')
    learning = filter_by_dates(student_learning, range_dates, 'date_watched')

    # Aggregate only the columns that are needed, by name. Whole-frame
    # sum()/mean() also hit the datetime column, which raises on pandas >= 2.0.
    avg_rating = (
        ratings.groupby('course_id')['course_rating']
        .mean()
        .rename('avg_course_rating')
        .reset_index()
    )
    watch_stats = (
        learning.groupby('course_id')['minutes_watched']
        .agg(
            tot_minutes_watched='sum',
            avg_minutes_watched='mean',
            times_watched='count',
        )
        .reset_index()
    )

    course_info_names = reduce(
        lambda left, right: pd.merge(left, right, on=['course_id'], how='outer'),
        [avg_rating, watch_stats, course_info],
    )

    titles = course_info_names['course_title']
    course_info_names['intro_word'] = titles.str.contains(pat='Intro[a-z]', regex=True)
    course_info_names['ml'] = titles.str.contains(pat='Mach[a-z]', regex=True)

    times_watched = course_info_names['times_watched']
    avg_course_rating = course_info_names['avg_course_rating']
    course_info_names['popularity_1'] = times_watched / avg_course_rating
    course_info_names['popularity_2'] = times_watched * avg_course_rating
    course_info_names['popularity_3'] = (
        course_info_names['popularity_2'] / course_info_names['tot_minutes_watched']
    )
    popularity_1 = course_info_names['popularity_1']
    course_info_names['popularity_normal'] = (popularity_1 - popularity_1.min()) / (
        popularity_1.max() - popularity_1.min()
    )
    # sort_values returns a new frame, so only the returned sort matters.
    return course_info_names.sort_values(by='popularity_2', ascending=False)

In [None]:
# Date window passed to courses_popularity as its range_dates argument.
range_dates = ["2022-01-01", "2022-10-20"]
# Per-course table returned by courses_popularity for that window.
courses_rating_df = courses_popularity(range_dates)

## **Dates DF**

In [None]:
reg_dates = student_info.groupby('date_registered').count().reset_index().iloc[:,:2]
reg_dates.columns=['date', 'students_registered']
reg_dates['date'] = pd.to_datetime(reg_dates['date'])
reg_dates

In [None]:
rating_dates = course_ratings.groupby('date_rated').count().reset_index().iloc[:,:2]
rating_dates.columns=['date', 'courses_rated']
rating_dates['date'] = pd.to_datetime(rating_dates['date'])
rating_dates

In [None]:
watching_date_a = student_learning.groupby('date_watched').sum().reset_index().drop(columns=['student_id','course_id'])
watching_date_a

In [None]:
watching_date_b = student_learning.groupby('date_watched').count().reset_index().drop(columns=['student_id','minutes_watched'])
watching_date_b

In [None]:
watching_date = pd.merge(
    watching_date_a,
    watching_date_b,
    how='outer',
    left_on = 'date_watched',
    right_on  = 'date_watched'
)
watching_date.columns = ['date', 'tot_minutes_watched', 'tot_courses_watched']
watching_date['date'] = pd.to_datetime(watching_date['date'])
watching_date

In [None]:
exams_dates_a = student_exams.groupby('date_exam_completed').mean().reset_index().drop(columns=['exam_attempt_id', 'student_id', 'exam_id'])
exams_dates_a

In [None]:
exams_dates_b = student_exams.groupby('date_exam_completed').count().reset_index().iloc[:,:2]
exams_dates_b

In [None]:
exams_dates = pd.merge(
    exams_dates_a,
    exams_dates_b,
    how='outer',
    left_on = 'date_exam_completed',
    right_on  = 'date_exam_completed'
)
exams_dates.columns = ['date', 'avg_exam_results', 'avg_exam_completion_time', 'tot_exams_presented']
exams_dates['date'] = pd.to_datetime(exams_dates['date'])
exams_dates

In [None]:
questions_dates = student_hub_questions.groupby('date_question_asked').count().reset_index().iloc[:,:2]
questions_dates.columns=['date', 'questions_posted']
questions_dates['date'] = pd.to_datetime(questions_dates['date'])
questions_dates

In [None]:
engagement_dates = student_engagement.groupby('date_engaged').sum().reset_index().drop(columns=['engagement_id', 'student_id'])
engagement_dates.columns= ['date', 'engagement_quizzes', 'engagement_exams', 'engagement_lessons']
engagement_dates['date'] = pd.to_datetime(engagement_dates['date'])
engagement_dates

In [None]:
# Purchases per (day, purchase type).
purchases_dates_a = student_purchases.groupby(['date_purchased', 'purchase_type']).count().reset_index()
# Spread the purchase types into columns. DataFrame.pivot takes keyword-only
# arguments since pandas 2.0, so index/columns/values are named explicitly.
purchases_dates = purchases_dates_a.pivot(
    index='date_purchased', columns='purchase_type', values='purchase_id'
).reset_index()
# Positional rename: relies on the order of the pivoted purchase-type columns.
purchases_dates.columns = ['date', 'annual_purchases', 'monthly_purchases', 'quarterly_purchases']
purchases_dates['date'] = pd.to_datetime(purchases_dates['date'])
purchases_dates

In [None]:
def categorize_weekdays(df, cat_column):
    """Turn a column of weekday names into an ordered categorical.

    The categories run Monday through Sunday. ``df`` is modified in place
    and also returned.

    Args:
        df: DataFrame containing the weekday-name column.
        cat_column: Name of that column.

    Returns:
        The same ``df`` object with ``cat_column`` converted.
    """
    weekday_order = CategoricalDtype(
        categories=[
            'Monday',
            'Tuesday',
            'Wednesday',
            'Thursday',
            'Friday',
            'Saturday',
            'Sunday',
        ],
        ordered=True,
    )
    df[cat_column] = df[cat_column].astype(weekday_order)
    return df


# Daily tables to combine; every one of them has a "date" column.
dates_df = [
    reg_dates,
    rating_dates,
    watching_date,
    exams_dates,
    questions_dates,
    engagement_dates,
    purchases_dates,
]

# Outer-join the tables one after another on "date".
dates_dataframe = dates_df[0]
for daily_table in dates_df[1:]:
    dates_dataframe = pd.merge(dates_dataframe, daily_table, on=["date"], how="outer")

# Add the weekday name of each date and order it Monday..Sunday.
dates_dataframe['weekday'] = dates_dataframe['date'].map(
    lambda day: calendar.day_name[day.weekday()]
)
dates_dataframe = categorize_weekdays(dates_dataframe, 'weekday')

dates_dataframe.to_csv('dates_df.csv')

In [None]:
weekday_purchases = dates_dataframe.groupby('weekday')['annual_purchases', 'monthly_purchases','quarterly_purchases' ].sum().reset_index()
weekday_purchases

In [None]:
weekday_engagement = dates_dataframe.groupby('weekday')['questions_posted', 'engagement_lessons', 'engagement_exams','engagement_quizzes' ].sum().reset_index()
weekday_engagement

In [None]:
weekday_registrations = dates_dataframe.groupby('weekday')['students_registered'].sum().reset_index()
weekday_registrations

In [None]:
weekday_time_watched = dates_dataframe.groupby('weekday')['tot_minutes_watched', 'tot_courses_watched'].sum().reset_index()
weekday_time_watched

In [None]:
weekday_exams_a = dates_dataframe.groupby('weekday')['avg_exam_results', 'avg_exam_completion_time'].mean().reset_index()
weekday_exams_b = dates_dataframe.groupby('weekday')['tot_exams_presented'].sum().reset_index()
weekday_exams = pd.merge(
    weekday_exams_a,
    weekday_exams_b,
    how='outer',
    on='weekday'
)
weekday_exams

## **Students DF**

In [None]:
# Per-student counts. After count() the selected / first column holds the
# number of rows for that student.
student_courses = student_learning.groupby('student_id').count().reset_index()[["student_id", "course_id"]]
student_exams_count = student_exams.groupby('student_id').count().reset_index().iloc[:,:2]
student_exams_count.columns=['student_id', 'exams_attempts']
# Average only the two numeric result columns, selected by name. A whole-frame
# mean() also hits the string date column, which raises on pandas >= 2.0.
student_exams_avg = (
    student_exams.groupby('student_id')[['exam_result', 'exam_completion_time']]
    .mean()
    .reset_index()
)
student_exams_avg.columns=['student_id', 'avg_exam_result', 'avg_exam_completion_time']
student_quizzes_1  = student_quizzes.groupby('student_id').count().reset_index().iloc[:,:2]
student_quizzes_1.columns=['student_id', 'quizz_quesitons_answered']
student_hub_questions_1 = student_hub_questions.groupby('student_id').count().reset_index().iloc[:,:2]
student_hub_questions_1.columns=['student_id', 'questions_posted']
# Sum only the engagement counters, selected by name. A whole-frame sum()
# would concatenate the date strings on pandas >= 2.0.
student_engagement_1 = (
    student_engagement.groupby('student_id')[
        ['engagement_quizzes', 'engagement_exams', 'engagement_lessons']
    ]
    .sum()
    .reset_index()
)
student_purchases_0 = student_purchases.groupby(['student_id']).count().reset_index().iloc[:,:2]
student_purchases_0.columns = ['student_id', 'tot_purchases' ]
# The second, identical definition of student_quizzes_1 that used to follow
# here was removed.


In [None]:
# Earliest date of each kind of activity per student. The date column is
# selected before taking the minimum, then renamed to its "first_*" name.
first_watch = (
    student_learning.groupby('student_id')['date_watched']
    .min()
    .reset_index()
    .rename(columns={"date_watched": "first_watched"})
)
first_exam = (
    student_exams.groupby('student_id')['date_exam_completed']
    .min()
    .reset_index()
    .rename(columns={"date_exam_completed": "first_exam"})
)
first_question = (
    student_hub_questions.groupby('student_id')['date_question_asked']
    .min()
    .reset_index()
    .rename(columns={"date_question_asked": "first_question"})
)
first_engagement = (
    student_engagement.groupby('student_id')['date_engaged']
    .min()
    .reset_index()
    .rename(columns={"date_engaged": "first_engagement"})
)
first_purchase = (
    student_purchases.groupby('student_id')['date_purchased']
    .min()
    .reset_index()
    .rename(columns={"date_purchased": "first_purchase"})
)

In [None]:
# Per-student tables to combine; every one of them has a "student_id" column.
# student_quizzes_1 is listed once only: merging it twice duplicated its count
# column under the "_x" / "_y" suffixes, giving two identical features.
students_dfs = [
    student_info,
    student_courses,
    student_exams_count,
    student_exams_avg,
    student_quizzes_1,
    student_hub_questions_1,
    student_engagement_1,
    student_purchases_0,
    first_watch,
    first_exam,
    first_question,
    first_engagement,
    first_purchase,
]

# Outer-join everything on student_id, then drop the purchase count.
student_all_info = reduce(lambda  left,right: pd.merge(left,right,on=['student_id'], how='outer'), students_dfs).drop(columns='tot_purchases')
student_all_info

In [None]:
# Parse every date column of the combined student table as datetime.
for date_col in (
    'date_registered',
    'first_watched',
    'first_exam',
    'first_question',
    'first_engagement',
    'first_purchase',
):
    student_all_info[date_col] = pd.to_datetime(student_all_info[date_col])

In [None]:
# s_df is an alias of student_all_info, not a copy: the columns added below
# show up on both names.
s_df = student_all_info

# Did the first activity of each kind happen before the first purchase?
# A comparison involving NaT is False, so students without that activity or
# without a purchase get False. This matches the previous row-wise apply,
# whose extra `!= pd.NaT` test was always true.
for activity_col, flag_col in (
    ('first_watched', 'watched_before_purchase'),
    ('first_exam', 'exam_before_purchase'),
    ('first_question', 'questioned_before_purchase'),
    ('first_engagement', 'engaged_before_purchase'),
):
    s_df[flag_col] = s_df[activity_col] < s_df['first_purchase']

s_df['time_to_purchase'] = s_df['first_purchase'] - s_df['date_registered']
s_df['time_to_date'] = s_df['date_registered'].max() - s_df['date_registered']

# Derive the target from the missing-value mask, and assign the filled column
# back instead of calling fillna(inplace=True) on a column selection. The
# chained in-place fill does not update the frame under pandas copy-on-write,
# which would leave NaT in place and make `!= 0` True for every row.
has_purchased = s_df['first_purchase'].notna()
s_df['first_purchase'] = s_df['first_purchase'].fillna(0)
s_df['purchased_yes_no'] = has_purchased
s_df


In [None]:
def datetime_to_float(d):
    """Convert a datetime to seconds since 1970-01-01 00:00:00.

    Args:
        d: A naive datetime-like object, or a number.

    Returns:
        ``d`` unchanged when it is already an ``int`` or ``float``;
        otherwise the number of seconds (fraction included) between the
        epoch and ``d``.
    """
    # Numbers pass through untouched.
    if isinstance(d, (int, float)):
        return d
    # Naive epoch, the same value datetime.utcfromtimestamp(0) returned;
    # that function is deprecated since Python 3.12.
    epoch = datetime(1970, 1, 1)
    return (d - epoch).total_seconds()


# d = s_df['first_purchase'].values[2]
# datetime_to_float(d)

In [None]:
# Convert every first-purchase value to a number with datetime_to_float.
first_purchase = s_df['first_purchase'].values
purchases = [datetime_to_float(value) for value in first_purchase]

s_df['first_purchase'] = purchases

In [None]:
date_columns = [
    'date_registered',
    'first_watched',
    'first_exam',
    'first_question',
    'first_engagement',
    'time_to_date',
    'time_to_purchase'
]

for c in date_columns:
    s_df[c] =  s_df[c].values.astype("float")

In [None]:
count_purchases = s_df['purchased_yes_no'].value_counts()
count_purchases

# PART 2. **Feature Engineering**

In [None]:
# Columns left out of the modelling table.
columns_to_exclude = [
    "engaged_before_purchase",
    "questioned_before_purchase",
    "time_to_purchase",
    "watched_before_purchase",
    "first_purchase",
    "exam_before_purchase",
    "first_exam",
]
# drop() returns a new frame, so s_df itself is left unchanged.
s_df_1 = s_df.drop(columns=columns_to_exclude)
s_df_1.columns


In [None]:
sub_df = s_df_1[['student_id', 'student_country']]
countries_encode = pd.pivot_table(sub_df, values='student_id', columns=['student_country'], index='student_id', aggfunc=np.count_nonzero).reset_index().fillna(0)
countries_encode

In [None]:
s_df_hot = reduce(lambda  left,right: pd.merge(left,right,on=['student_id'], how='outer'), [s_df_1, countries_encode]).drop(columns='student_country')
s_df_hot

In [None]:
# Alternative encoding: replace each country by the integer label that
# LabelEncoder assigns to it.
labelencoder = LabelEncoder()
country_codes = labelencoder.fit_transform(s_df_1["student_country"])
s_df_label = s_df_1.copy()
s_df_label["student_country"] = country_codes
s_df_label

# PART 3. **Feature Selection**

In [None]:
# Threshold compared against each column's absolute correlation with
# 'purchased_yes_no' in the feature-selection cells that follow; columns
# below it are dropped.
corr_thre = 0.4

## Correlation with Encoded

In [None]:
# Let's eliminate predictors with a weak correlation with Critical Temperature (Y)
corr_matrix = s_df_hot.corr().abs()
to_drop = corr_matrix.loc[corr_matrix['purchased_yes_no'] < corr_thre]
to_drop_names = []
for row in to_drop.index:
    to_drop_names.append(row)

s_df_hot_0 = s_df_hot.drop(to_drop_names, axis=1)
print(s_df_hot_0.shape)
s_df_hot_0

In [None]:
s_df_hot_0.columns

## Correlation with Labeled

In [None]:
# Let's eliminate predictors with a weak correlation with Critical Temperature (Y)
corr_matrix = s_df_label.corr().abs()
to_drop = corr_matrix.loc[corr_matrix['purchased_yes_no'] < corr_thre]
to_drop_names = []
for row in to_drop.index:
    to_drop_names.append(row)

s_df_label_0 = s_df_label.drop(to_drop_names, axis=1)
print(s_df_label_0.shape)
s_df_label_0

In [None]:
s_df_label_0.columns

s_df_label_0.columns == s_df_hot_0.columns

# PART 4. **Data Pre-processing**

In [None]:
# Modelling table: every column as float, missing values replaced by 0.
# astype() already returns a new frame, so s_df_hot_0 is left untouched.
new_df = s_df_hot_0.astype(float).fillna(0)
new_df

## Splitting in training and evaluation sets

In [None]:
# Fixed seed so the split and the seeded models are reproducible.
seedNum = 931221
val_size = 0.3

scaler = StandardScaler()

# Separate the response from the predictors.
Y_set = new_df[['purchased_yes_no']]
X_set_0 = new_df.drop(columns=['purchased_yes_no'])

# Split BEFORE scaling and fit the scaler on the training rows only. Fitting
# it on the full data lets the evaluation rows influence the mean/std used to
# transform the training data (data leakage).
X_train_0, X_eval_0, Y_train, Y_eval = train_test_split(
    X_set_0, Y_set, test_size=val_size, random_state=seedNum
)
scaler.fit(X_train_0.values)

X_train = pd.DataFrame(
    scaler.transform(X_train_0.values), columns=X_train_0.columns, index=X_train_0.index
)
X_eval = pd.DataFrame(
    scaler.transform(X_eval_0.values), columns=X_eval_0.columns, index=X_eval_0.index
)
# All predictors scaled with the training statistics; the name is kept for
# any cell that still refers to it.
X_set = pd.DataFrame(
    scaler.transform(X_set_0.values), columns=X_set_0.columns, index=X_set_0.index
)

# PART 5. **Model construction and Evaluation**


In [None]:
# This is a classification task, so the models are scored on accuracy.
# The value is passed as the `scoring` argument of cross_val_score and
# GridSearchCV below.
scoring = 'accuracy'

## Models Comparison

In [None]:
# Candidate classifiers as [display name, estimator] pairs. Estimators that
# accept a random_state are seeded with seedNum.
models = [
    ['Linear DA', LinearDiscriminantAnalysis()],
    ['Log Regression', LogisticRegression()],
    ['KNN', KNeighborsClassifier()],
    ['Class Tree', DecisionTreeClassifier(random_state = seedNum)],
    ['SVM', SVC(random_state = seedNum)],
    ['Naive Bayes', GaussianNB()],
    ['Bagging', BaggingClassifier(random_state = seedNum)],
    ['RF', RandomForestClassifier(random_state = seedNum)],
    ['Extra Trees', ExtraTreesClassifier(random_state = seedNum)],
    ['Ada-Booster', AdaBoostClassifier(random_state = seedNum)],
    ['Grad-Booster', GradientBoostingClassifier(random_state = seedNum)],
    ['XG Booster', xgb.XGBClassifier(random_state = seedNum) ]
]
# Number of cross-validation folds.
folds = 10

# Per-model results: raw fold scores, names, means, and name -> [mean, std].
ModelsResults = []
ModelsNames = []
ModelsMeans = []
models_dict = {}

from datetime import datetime

# The splitter is seeded with an integer, so one instance yields the same
# folds for every model.
kfold = KFold(n_splits=folds, shuffle=True, random_state=seedNum)
for name, model in models:
    startTimeModule = datetime.now()
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    fold_mean = cv_results.mean()
    fold_std = cv_results.std()
    ModelsResults.append(cv_results)
    ModelsNames.append(name)
    ModelsMeans.append(fold_mean)
    models_dict[name] = ([fold_mean, fold_std])
    msg = "%s:\n  mean: %f  -  std: %f" % (name, fold_mean, fold_std)
    print(msg)
    print("Model training time:", (datetime.now() - startTimeModule))
# print('\nAverage (' + scoring + ') from all models:', np.mean(ModelsMeans))


## Model Tuning (Gradient Booster)

In [None]:
# Hyper-parameter search for the gradient boosting classifier.
startTimeModule = datetime.now()
model = GradientBoostingClassifier(random_state=seedNum)
# Every combination of these values is evaluated.
grid = {
    "n_estimators": [5, 50, 250, 500],
    "max_depth": [1, 3, 5, 7, 9],
    "learning_rate": [0.01, 0.1, 1, 10, 100],
}

# Stratified folds, repeated three times, scored with `scoring`.
cv = RepeatedStratifiedKFold(n_splits=folds, n_repeats=3, random_state=seedNum)
grid2 = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result2 = grid2.fit(X_train, Y_train)

# Report the winner first, then every evaluated combination.
print("Best: %f using %s" % (grid_result2.best_score_, grid_result2.best_params_))
search_report = grid_result2.cv_results_
means = search_report["mean_test_score"]
stds = search_report["std_test_score"]
params = search_report["params"]
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
print("Model training time:", (datetime.now() - startTimeModule))


# PART 6 - Select and Finalize Model

In [None]:
# Grad-Booster --> Best: 0.971304 using {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}

# Build the final model with the tuned hyper-parameters recorded above; the
# previous version instantiated the classifier with its defaults, so the
# tuning result was never used.
# NOTE(review): these values are copied from the recorded search output;
# switch to **grid_result2.best_params_ if the search is re-run.
model = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=5,
    n_estimators=500,
    random_state=seedNum,
)
model.fit(X_train, Y_train)
model_Eval = model.predict(X_eval)


# Hold-out evaluation: accuracy, confusion matrix and per-class report.
print(accuracy_score(Y_eval, model_Eval))
print(confusion_matrix(Y_eval, model_Eval))
print(classification_report(Y_eval, model_Eval))