# STEP 4.1: Modeling Random Forest

## Description of the methodology
> * Finalize binning of Target Variable
* Create Train and Test datasets
* Create a Random Forest pipeline
* Define key parameters
* Run the model on sub-train data set and test accuracy on the validation data set
* Select the 2 most accurate models based on the hyper-parameters, run them to get the confusion matrix
* Select Best RF model candidate and apply it to the main train/test dataset

## Import libraries

In [48]:
import numpy as np
import pandas as pd
import seaborn as sns
from collections import OrderedDict as OrderedDict

%matplotlib inline
import matplotlib.pyplot as plt

import re
from sklearn.preprocessing import Normalizer
import os
from sklearn import preprocessing
from scipy.stats import kurtosis
from scipy.stats import skew
from scipy import stats
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import ComplementNB, MultinomialNB

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterGrid

from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
import scikitplot as skplt

# t-SNE dimensionality reduction
from sklearn.manifold import TSNE

import random
from sklearn import ensemble

from sklearn.model_selection import StratifiedShuffleSplit

import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")


# Activate Seaborn style
# (called with no arguments, i.e. seaborn's default settings are applied)
sns.set()

## Load the file for analysis

In [2]:
# Importing the file and creating a dataframe.
# The CSV location can be overridden with the MASTER_MODELING_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# not set, the original path is used unchanged.
master_modeling = pd.read_csv(
    os.environ.get(
        "MASTER_MODELING_CSV",
        "C:/Users/fbaff/EPFL ML Python/5. Captsone/master_modeling.csv",
    ),
    low_memory=False,
    skipinitialspace=True,
)

In [3]:
# Lift the column cap so that wide frames are rendered with every column
pd.options.display.max_columns = None

In [4]:
# Remove the "Unnamed: 0" column.
# The frame is re-bound instead of mutated in place, and errors="ignore" makes
# the cell idempotent: running it a second time no longer raises a KeyError
# because the column is already gone.
master_modeling = master_modeling.drop(columns="Unnamed: 0", errors="ignore")
master_modeling.shape

(194484, 351)

In [5]:
# Modeling frame: master_modeling without the Title, Post_ID and Snippet
# columns (text and not relevant features)
df_modeling = master_modeling.drop(columns=["Title", "Post_ID", "Snippet"])

In [6]:
# Sanity check: (rows, columns) once Title, Post_ID and Snippet are removed
df_modeling.shape

(194484, 348)

## Definition of # of classes for the Target Variable 'All_Impact'

> * We will split the variable in 3 classes using Scikit Learn preprocessing function KBinsDiscretizer with the following parameters: number of bins 3, encode: ordinal and strategy: quantile
* Ordinal has been selected as we are trying to model a hierarchy between low and high tweet impact
* Quantile implies an even number of data points per class, which would shape the model to learn about features for each class equally (avoiding unbalanced classes)
* We may reconsider some of the parameter values depending on the modeling results

In [7]:
# Target column kept as a one-column DataFrame (2-D) for the discretizer below
ai_bin = master_modeling.loc[:, ["ALL_Impact"]]

In [8]:
# Discretize the target into 3 ordinal bins with quantile-based edges
est = KBinsDiscretizer(strategy="quantile", encode="ordinal", n_bins=3)
# fit() returns the estimator itself, so fitting and transforming can be chained
new_ai = est.fit(ai_bin).transform(ai_bin)

In [9]:
# Show the bin edges learned for the single input column (index 0):
# 3 bins are delimited by 4 edge values
est.bin_edges_[0]

array([ 0., 30., 41., 80.])

In [10]:
# Wrap the discretizer output in a DataFrame
new_ai_df = pd.DataFrame(data=new_ai)

In [11]:
# Sanity check: one binned value per row, in a single column
new_ai_df.shape

(194484, 1)

In [12]:
# Attach the binned target to the modeling frame as a new column.
# NOTE(review): assigning a DataFrame aligns rows on the index; this presumably
# relies on both frames sharing the same default integer index - confirm.
df_modeling["All_impact bin"] = new_ai_df

In [13]:
# Number of rows per target class, to check how balanced the 3 bins are
df_modeling["All_impact bin"].value_counts()

2.0    69658
1.0    63049
0.0    61777
Name: All_impact bin, dtype: int64

In [14]:
# Drop the original (un-binned) ALL_Impact target together with the
# TW_Hashtags, ALL_Author and TW_Account_Name columns
df_modeling2 = df_modeling.drop(
    columns=["ALL_Impact", "TW_Hashtags", "ALL_Author", "TW_Account_Name"]
)

In [15]:
# Store the binned target as 64-bit integers instead of floats
df_modeling2["All_impact bin"] = df_modeling2["All_impact bin"].astype("int64")

In [16]:
# Sanity check: (rows, columns) of the frame used to build X and y
df_modeling2.shape

(194484, 345)

In [17]:
# Preview the first rows.
# NOTE(review): this displays df_modeling, not df_modeling2, so the columns
# dropped for df_modeling2 (e.g. ALL_Author, TW_Account_Name) are still shown.
df_modeling.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,Sentiment,Company_Clarivate,Company_Informa,Company_Pearson,Company_RELX Group,Company_Thomson Reuters,Company_Wolters Kluwer,Country 2_Argentina,Country 2_Australia,Country 2_Belgium,Country 2_Brazil,Country 2_Canada,Country 2_Ecuador,Country 2_France,Country 2_Germany,Country 2_Hong Kong,Country 2_India,Country 2_Italy,Country 2_Japan,Country 2_Mexico,Country 2_Netherlands,Country 2_Other,Country 2_Philippines,Country 2_Russia,Country 2_Serbia,Country 2_Singapore,Country 2_South Africa,Country 2_Spain,Country 2_Switzerland,Country 2_United Arab Emirates,Country 2_United Kingdom,Country 2_United States,Country 2_Venezuela,ALL_Thread_Entry_Type_post,ALL_Thread_Entry_Type_reply,ALL_Thread_Entry_Type_share,TW_Account_Type_Not 
identified,TW_Account_Type_individual,TW_Account_Type_organisational,ALL_Impact,Log_TW_KredOutreach,Log_Nbreach,Log_TW_NbFollowers,Log_TW_NbFollowing,Log_TW_NbTweets,TW_Hashtags,ALL_Author,TW_Account_Name,All_impact bin
0,-0.043355,-0.026154,0.05633,0.096232,-0.079631,-0.052836,0.048258,-0.210815,0.129344,0.028728,-0.051351,-0.202983,-0.097575,-0.06958,0.023331,0.098674,-0.010539,0.010712,-0.056834,-0.094849,-0.026164,-0.002116,0.283895,-0.005615,-0.053263,0.018941,-0.010457,0.144206,0.044637,-0.004079,0.004364,0.031087,-0.100098,-0.137032,-0.058838,-0.054138,-0.043732,0.082092,0.078893,-0.026632,-0.063639,-0.10498,0.150564,0.134542,0.175049,-0.071198,-0.067627,-0.100016,-0.217041,0.096486,-0.092585,0.039103,0.064535,0.061666,0.087718,-0.012533,-0.117382,-0.018209,0.04542,-0.249552,0.033651,-0.04468,-0.068578,-0.04014,-0.078196,-0.103747,-0.085938,-0.124268,0.084961,0.139425,-0.037394,0.049886,-0.062134,-0.060832,-0.164551,0.026286,-0.033651,0.038818,-0.057373,0.061015,-0.099528,0.092122,0.036601,-0.060507,0.082275,-0.030721,-0.009196,0.103404,0.001943,0.038981,0.123454,0.011963,-0.078763,-0.101461,-0.139242,0.086426,-0.044942,-0.051346,0.214478,0.051982,-0.033285,0.013041,0.002767,0.002828,0.073032,0.163574,-0.208537,0.020345,0.135742,-0.006714,-0.044881,-0.06189,0.051839,-0.026937,0.04598,0.052348,-0.0236,-0.276123,0.043335,0.084157,0.105428,-0.003113,-0.206991,0.110575,0.118327,-0.052622,-0.067342,0.039185,0.058289,-0.039581,-0.196574,0.026133,0.011617,0.023051,-0.109202,0.043915,-0.01119,-0.012197,0.048096,0.011353,0.2889,-0.083293,-0.062876,-0.053426,0.06427,-0.034261,-0.005025,-0.016693,-0.088582,-0.205322,0.116984,-0.101969,-0.003977,-0.069417,0.023163,-0.142741,-0.004181,-0.02653,0.065626,-0.038544,-0.201538,0.103404,-0.099365,0.042969,-0.020793,-0.14034,0.140372,-0.044189,-0.116414,0.194603,-0.135277,0.025309,0.029427,-0.008776,0.021322,-0.011882,0.162964,-0.080648,0.194173,-0.153809,-0.049357,-0.024295,-0.039917,-0.067261,0.164591,0.001709,-0.085144,0.081889,0.068522,0.031911,0.046265,0.115234,-0.217855,-0.105306,0.009359,0.091675,-0.076823,0.013916,-0.052916,-0.074188,-0.033691,0.140869,-0.030874,-0.009588,0.016388,0.022868,0.050496,0.015767,-0.088623,0.05835,0.028239,0.
061117,-0.135228,-0.064412,0.005178,-0.021576,0.137166,-0.050954,-0.035294,0.005249,-0.059123,0.061017,0.051758,0.049927,-0.085276,-0.061554,-0.017375,0.044271,-0.014038,-0.086426,0.107076,0.040548,-0.073354,-0.019613,0.024592,-0.007266,0.072306,0.013509,0.056966,0.037781,0.123942,0.00885,0.008362,-0.000977,-0.041504,-0.034953,0.082886,0.124756,-0.12561,-0.190776,-0.022278,0.061839,0.049581,0.134206,0.148641,-0.135742,0.031158,0.124959,-0.06665,-0.042318,-0.003052,0.022003,0.155924,-0.134644,0.012227,0.106415,-0.092285,-0.039612,0.009603,0.047201,-0.021126,-0.047668,0.055562,0.008235,0.174601,0.015889,0.009638,-0.065643,-0.065084,0.138672,-0.088623,0.075439,-0.124858,0.071592,0.053894,0.044313,-0.137777,0.157715,0.072449,0.021647,0.083008,0.081746,0.038116,0.093424,0.03359,-0.137166,-0.109904,-0.055583,-0.098694,-0.025431,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,28,0.0,6.052089,4.043051,5.09375,7.409136,2017btsfesta,titina_joner,tw account not identified,0.0
1,0.062174,-0.016751,0.080119,0.042372,-0.120394,-0.103285,-0.096608,-0.03989,0.037337,-0.032766,0.028105,-0.153741,-0.09413,0.022027,-0.071482,-0.011108,-0.013699,0.116313,-0.124186,-0.075311,0.020711,-0.058485,0.194499,-0.055562,0.021705,-0.109795,-0.057237,0.024563,0.042552,-0.051704,-0.015205,-0.016233,-0.065274,-0.036608,-0.034709,-0.091634,-0.148071,0.003933,-0.090956,0.09709,-0.032322,-0.100132,0.176303,-0.04168,0.032715,-0.021864,-0.086222,0.038954,-0.102376,-0.015225,-0.069756,0.013082,0.105984,0.00885,-0.024489,-0.117432,-0.140402,0.083211,-0.08983,-0.174052,0.065776,-0.00765,-0.035333,-0.020222,-0.063972,-0.043835,-0.057292,0.025688,-0.014404,0.032939,-0.044005,0.163601,0.068502,0.020111,-0.07921,-0.042082,-0.054253,0.024417,0.028049,-0.008803,-0.091295,0.044915,0.014499,0.04444,0.064602,0.002992,-0.083442,0.126394,0.094445,0.018419,0.009772,0.035136,-0.071571,-0.076823,-0.079915,0.030599,0.074232,-0.063477,0.212023,0.019463,-0.023383,-0.095744,0.061964,0.021362,0.0503,-0.029961,-0.145989,0.000188,0.200928,-0.01633,-0.053699,-0.075765,-0.044156,0.034451,-0.007209,0.007704,0.083679,-0.012804,0.059699,0.046251,-0.054959,0.059998,-0.216166,0.022366,0.051676,-0.096489,-0.057265,-0.034846,0.028727,-0.019931,-0.085795,0.005164,0.017819,-0.031386,-0.010615,-0.002547,-0.056986,0.045627,0.051439,0.078362,0.110548,-0.084496,0.07761,0.091607,-0.032996,0.047377,-0.034858,-0.051805,-0.079793,-0.083378,0.150011,-0.073107,-0.121585,-0.012478,0.027656,-0.062822,-0.024272,-0.102905,-0.051975,-0.023164,-0.190809,0.129924,0.046678,-0.003092,-0.100179,-0.093669,0.057292,-0.047797,-0.021681,0.064168,-0.090251,0.151048,-0.054837,0.012478,-0.058757,0.054199,0.029473,-0.028904,-0.003152,-0.108188,-0.082987,-0.154588,-0.089762,-0.082418,0.111226,0.024061,-0.054118,0.05405,0.014988,0.085395,0.050117,0.02889,-0.066603,-0.016398,0.091539,0.080098,-0.09762,0.049981,-0.094672,-0.032235,0.027303,0.08846,0.05759,-0.003974,0.018053,-0.060452,0.029351,0.082648,-0.045213,-0.01052,0.071452,
0.032315,-0.012261,-0.031019,-0.12736,0.060023,0.043121,0.030952,-0.147651,-0.020718,-0.153849,0.042013,0.033189,0.031711,0.058556,-0.107727,0.099962,0.140815,0.003347,0.050496,0.038591,-0.003052,-0.030599,0.005941,0.086928,-0.151598,0.137112,-0.047292,0.124308,-0.011943,0.043047,0.029912,-0.056634,-0.078559,0.008586,0.158098,0.031715,0.10536,-0.048394,-0.185371,0.088294,0.030467,-0.041006,0.192561,0.13267,-0.038194,0.107279,0.047092,-0.034917,-0.058675,0.056722,-0.017042,0.149902,0.065286,0.030535,-0.014648,-0.105659,-0.004008,0.031223,-0.032511,0.034912,0.082954,0.102146,-0.021088,0.057034,0.029705,-0.089681,-0.048299,-0.004015,0.097629,-0.077728,0.071031,-0.184252,0.097553,0.007589,-0.013075,-0.156467,0.078722,0.075358,0.077284,0.047272,0.132711,0.009962,0.031087,-0.015971,-0.003174,-0.017293,-0.091109,0.025137,-0.040622,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,25,0.0,5.940171,3.295837,4.94876,8.307706,No TW hashtag,dasCameo1,tw account not identified,0.0
2,-0.005493,-0.046224,-0.041031,0.039266,-0.156698,0.102946,0.091634,-0.114136,0.035787,0.029237,-0.07019,-0.110189,-0.139811,0.046875,-0.137965,0.136637,0.13265,0.043569,-0.008219,-0.111776,-0.036621,-0.010579,0.226237,-0.059814,0.108195,-0.10907,-0.124105,0.11613,-0.040324,0.021962,0.047302,0.032003,-0.023468,-0.054362,-0.01945,-0.021037,-0.046631,0.149658,-0.017619,0.03658,0.000163,-0.017944,-0.031158,0.001811,0.051025,-0.060048,-0.051961,-0.07723,-0.097722,0.016937,-0.066325,-0.012329,0.069517,0.035116,0.03478,-0.036535,-0.018424,-0.049011,-0.004659,-0.108618,-0.092896,0.052488,-0.055745,-0.026245,-0.115885,-0.088867,-0.083206,-0.038005,-0.043213,0.082052,0.007853,0.028137,0.06842,-0.018188,-0.125366,0.02889,-0.047038,0.097249,0.049438,0.085449,-0.035116,0.043783,0.017253,-0.03353,0.036051,0.019368,-0.002686,0.045741,-0.099609,-0.028239,0.037282,-0.048869,-0.063853,0.007222,0.030436,-0.004842,-0.002604,0.083659,0.107605,-0.021207,-0.061615,0.017141,0.03418,0.029683,0.012526,0.042427,-0.126465,0.021342,0.04128,-0.003894,-0.093953,-0.065308,0.026367,0.046895,0.025065,0.051971,-0.033732,-0.051707,0.129374,0.061035,0.02596,-0.017832,-0.104167,-0.005188,-0.082194,-0.022909,-0.034485,0.019012,0.137858,0.030426,-0.060527,-0.097707,0.058024,-0.041321,-0.021011,-0.081706,0.082845,-0.010966,0.043193,0.052958,0.082397,-0.08256,-0.038488,0.058573,-0.042765,0.000732,-0.061991,-0.068532,0.015747,-0.024292,0.074849,0.03182,-0.09225,-0.020162,0.084524,-0.147868,-0.062541,0.020198,-0.021729,-0.030426,-0.085693,0.051463,-0.010905,0.03068,0.065308,-0.189209,0.003206,-0.046076,-0.103394,0.090983,-0.017095,0.004924,0.033162,-0.115377,-0.037476,0.015483,0.017873,-0.157227,0.124105,-0.126567,-0.146362,-0.134094,0.007182,-0.084229,0.07548,-0.031128,-0.122884,0.020467,0.027629,0.117645,0.052917,0.036031,-0.062907,-0.020345,0.006104,0.075867,0.072021,-0.011475,0.079956,-0.119954,0.015035,0.131185,-0.037038,-0.102656,0.012527,-2e-05,-0.048421,0.088994,-0.032928,-0.013875,0.068014,-0.04992
7,-0.080343,-0.006144,-0.068746,0.037667,0.116577,-0.071228,-0.02681,0.022125,-0.042013,-0.005857,-0.082031,0.038574,-0.032837,0.036977,0.109833,-0.005208,0.034871,-0.023722,0.002177,-0.00765,-0.021139,-0.093669,0.015549,0.043538,0.039879,-0.027364,0.059896,-0.011678,0.000992,-0.040649,-0.019389,0.065816,-0.001719,-0.015747,-0.013428,0.040873,-0.040639,-0.069377,0.081657,0.058044,0.005249,0.117432,0.001017,-0.000264,-0.014323,0.067871,-0.020671,-0.075439,0.097026,-0.049601,0.070302,0.055786,-0.001801,0.117442,-0.045333,-0.012736,0.039469,0.001149,-0.018555,0.013855,0.096029,0.020218,0.075846,-0.077474,-0.045939,-0.027669,-0.0071,0.13859,-0.054138,0.027913,-0.041967,0.01254,-0.050761,-0.002299,-0.036357,-0.02711,0.082621,0.015381,0.002625,0.026367,0.099396,0.053792,0.04601,-0.034454,0.017446,-0.03007,0.043457,0.081278,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,25,0.0,5.921578,3.135494,0.0,12.286371,"indeed, jobs",lumpyspace_tst2,tw account not identified,0.0
3,0.034058,-0.047404,0.014119,-0.056559,-0.054515,-0.00649,-0.003326,-0.080485,0.138346,-0.030263,-0.06012,-0.129862,0.015977,0.050049,-0.12542,-0.010376,0.087402,-0.001668,-0.061971,-0.094401,-0.003983,-0.011739,0.118225,0.113057,-0.011241,-0.026782,-0.072286,0.068011,-0.054321,-0.101929,-0.060425,0.097453,-0.058421,-0.060181,-0.028402,-0.018962,0.000183,-0.024658,0.182556,0.014852,-0.015625,-0.155111,0.067444,0.13501,-0.018148,-0.145508,-0.141538,-0.074992,-0.04071,0.1297,-0.04155,-0.003866,-0.134155,-0.006185,0.074931,0.120809,0.090902,-0.119643,0.101644,-0.278809,-0.02327,-0.145426,-0.127767,0.014847,-0.141357,0.116096,-0.040685,0.163086,0.05131,-0.020833,0.000651,0.038656,0.10909,0.056274,0.036662,-0.13501,0.173177,-0.001648,0.04777,0.042867,-0.000376,0.179932,0.028971,-0.080485,0.22111,-0.10053,-0.083669,0.044678,0.072825,0.065857,0.100016,-0.232422,-0.104762,-0.052999,-0.110026,-0.049316,-0.028941,-0.026886,0.08667,-0.050975,-0.037699,-0.076853,0.039302,-0.080729,-0.055893,-0.058716,-0.089254,0.033854,0.128062,-0.006266,-0.094218,-0.065043,-0.105754,0.087321,0.066366,0.071757,-0.018575,0.010579,0.08433,0.060598,-0.108067,-0.050863,-0.081533,0.03776,0.151693,-0.134115,-0.038045,0.126099,-0.011058,0.009421,-0.18219,0.015696,-0.063599,-0.038005,-0.198873,-0.048197,0.177531,0.087321,-0.027832,0.027425,0.062907,-0.031504,0.108826,-0.110575,-0.008031,-0.043432,-0.001465,-0.092855,0.064677,-0.066732,-0.019257,0.163859,-0.047719,-0.007812,0.065715,-0.017843,-0.023214,-0.034251,-0.024292,-0.148397,-0.087591,0.003743,-0.013021,0.099426,-0.027995,-0.118225,0.046631,-0.015864,-0.100952,0.055786,0.002808,-0.030111,0.000854,-0.0261,0.104655,-0.000992,0.109594,-0.123479,-0.149984,0.008138,-0.202881,-0.073079,0.032227,-0.105367,0.04126,-0.039968,-0.138021,0.154226,0.035797,-0.044647,-0.149801,-0.012451,-0.048991,-0.069743,-0.08225,0.074097,0.025508,0.002035,-0.090495,-0.193359,-0.03359,0.076864,0.04188,-0.001863,0.02356,0.022263,0.033488,0.060735,0.156667,-0.03007,0.041056,0
.186727,-0.068985,-0.060099,-0.156148,0.000122,0.069865,0.035238,-0.099202,0.013509,-0.220357,0.09318,-0.028727,-0.059021,-0.006734,-0.141429,0.065959,-0.03393,0.06897,-0.030887,-0.142741,0.033223,-0.004842,-0.099325,0.047994,-0.082382,-0.091553,-0.07373,0.033061,0.07589,0.04776,-0.030111,-0.116648,-0.058116,-0.03713,-0.162435,-0.027201,0.134768,0.03361,-0.294027,0.055562,0.036784,-0.072001,0.126592,0.063838,-0.075948,-0.034203,-0.064128,-0.140625,-0.131999,-0.01416,-0.016856,0.046305,0.146535,-0.031982,-0.058634,0.040497,-0.056701,-0.164917,-0.027507,0.091838,0.16745,0.049459,-0.087626,0.018893,-0.041168,-0.044678,-0.025584,0.062459,0.109413,-0.176514,0.069519,0.017263,0.059255,-0.066406,0.060918,-0.163411,0.153905,0.163147,0.088826,0.003337,0.073853,-0.075114,0.035563,0.015666,0.000285,0.156169,-0.040385,-0.123149,-0.069814,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,64,0.0,7.832808,6.736967,5.043425,4.564348,"MiFIDII, TRRisk",mifidii,tw account not identified,2.0
4,0.015451,0.072963,-0.063298,0.152832,0.010882,-0.013,0.053467,-0.033273,0.140869,0.007952,0.001726,-0.156294,-0.061733,0.000837,-0.119629,0.020996,-0.040771,0.069789,-0.106934,-0.054827,0.025112,0.067383,0.080845,0.037702,-0.060117,0.17341,-0.004307,-0.040806,-0.151821,-0.083279,-0.164403,-0.061192,0.078081,0.013227,0.080741,-0.009417,0.115635,0.035217,0.112374,0.042201,-0.042376,0.06917,0.004743,0.20806,0.014474,-0.224932,-0.008684,-0.067732,-0.012835,0.010725,-0.074463,-0.055219,-0.107753,-0.013218,0.060181,-0.017857,-0.035505,-0.026005,-0.01505,-0.164568,-0.022893,-0.100307,-0.093009,-0.043648,-0.115108,0.088215,-0.058594,0.083182,-0.031023,0.036656,-0.252093,0.015939,0.057425,-0.040771,0.018973,-0.101493,0.013463,0.112584,-0.076538,0.080444,-0.059518,0.022269,-0.071045,0.0096,0.101998,-0.048429,-0.178327,0.152344,0.002829,0.005332,0.140695,-0.120571,0.003701,0.034738,-0.142273,0.101038,-0.035359,-0.086966,0.190186,0.037598,0.054199,-0.043039,0.047259,0.042759,0.041024,-0.114572,-0.101214,-0.070565,0.161482,-0.078108,-0.028477,-0.098319,-0.035052,-0.018206,0.108965,0.065918,0.005214,-0.074986,0.216518,0.094064,-0.088867,0.030814,-0.11465,0.086496,0.052891,0.030518,-0.088191,0.08102,-0.030252,0.023577,-0.175293,-0.041794,-0.13344,-0.080314,-0.114816,0.110439,-0.003209,-0.002853,-0.061157,0.063041,0.066333,0.031791,-0.063198,0.000349,0.088797,-0.110897,0.039873,-0.124176,-0.042899,-0.087123,-0.062901,0.036656,-0.020194,0.068593,0.078413,-0.077096,-0.024309,-0.07945,-0.037545,-0.13344,-0.023856,0.125244,-0.049037,0.053502,-0.112549,0.016881,0.063267,-0.008292,-0.042524,0.05242,0.039008,0.071472,0.006378,-0.009972,0.055612,3.5e-05,0.138939,-0.071307,-0.028948,0.031738,-0.279088,-0.097377,-0.038574,0.036447,-0.005057,0.015128,-0.153721,0.149937,-0.082419,0.012626,-0.013951,0.057329,-0.043588,-0.100307,-0.117083,0.140224,-0.054478,-0.083705,0.082014,-0.260812,-0.080963,0.151672,-0.012748,-0.0113,-0.022757,0.037545,-0.067191,-0.104614,-0.008057,0.092006,-0.001378,0.15
6948,-0.057055,-0.050502,-0.141741,0.073382,0.109767,0.082275,-0.014391,-0.02594,-0.014805,0.055934,-0.062047,0.025705,-0.02805,-0.090768,0.048671,-0.003732,-0.025142,-0.002267,0.000419,0.131958,0.014988,-0.085118,0.112462,-0.132882,0.061558,-0.120219,0.115653,-0.071987,-0.088632,0.068566,-0.002302,-0.085798,-0.021868,-0.117746,-0.038783,0.058184,-0.040423,-0.178545,-0.037197,0.107509,-0.017661,0.093157,0.087437,-0.090663,0.080645,0.064453,-0.106759,-0.033343,-0.038156,-0.006701,0.027335,0.049127,-0.015246,0.19308,-0.068952,-0.006553,-0.152274,-0.011509,-0.079381,0.107561,0.004373,-0.004211,-0.046247,0.041643,0.037737,0.025225,0.056946,-0.010141,-0.023019,-0.053563,-0.034999,0.099792,-0.096793,-0.012102,-0.139858,0.112514,0.174438,0.099505,-0.081822,-0.012748,-0.0512,-0.071638,-0.083104,0.02117,0.153721,-0.019165,-0.055054,-0.116455,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,41,0.0,6.408529,7.094235,7.222566,8.83215,"infographic, iot",daviddefelipe,david de felipe,2.0


## Create a train, validation and test datasets (from the main Train set of data)
> * I am facing a lack of computing resources (laptop with i7 Intel chip and 16 GB RAM, no GPU) which implies a very long time for training models, especially with the tuning of hyper-parameters. As a consequence, I have combined my computing resources with Google Colaboratory in order to tune several parameters in parallel.
* **The overall dataset is divided in 3 buckets:**
* Bucket 1 (train/test): split for training the Best Selected model (in case of more important computing resources)
* Bucket 2 (train1/valid1): split for training the Best model candidate of a given class (no cross-validation)
* Bucket 3 (train2/valid2): split for hyper-parameter tuning leading to select the Best model candidate (cross-validation maybe considered in some cases)
* We could limit the risk of overfitting by using a cross-validation approach. However, we may run the risk of very demanding computing resources as we will combine hyper-parameter optimization (GridSearch) and large dataset (194484 rows x 344 variables).
* A compromise approach would be to use the standard train/test dataset split and leverage cross-validation for the validation phase in the process for selecting the best model.

### Create X and y arrays

In [18]:
# Feature matrix: every column of df_modeling2 except the target
# "All_impact bin", converted to a NumPy array
X = np.array(df_modeling2.drop(columns=["All_impact bin"]))
X.shape

(194484, 344)

In [19]:
# Target vector: the "All_impact bin" class of every row, as a NumPy array
y = np.array(df_modeling2["All_impact bin"])
y.shape

(194484,)

In [20]:
# Convert the type of the input matrix to float.
# np.float64 is used instead of the np.float alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (np.float was simply the builtin float,
# i.e. 64-bit, so the resulting dtype is unchanged).
X = X.astype(np.float64)

# Bucket 1: main train / test split (20% of the rows held out for testing)
X_tr_main, X_test, y_tr_main, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Bucket 2: 15 000 train / 4 000 validation rows drawn from the main train set,
# used to evaluate the best model selected for a given class
X_tr_2nd, X_valid1, y_tr_2nd, y_valid1 = train_test_split(
    X_tr_main, y_tr_main, test_size=4000, train_size=15000, random_state=0
)

# Bucket 3: 5 000 train / 1 500 validation rows drawn from bucket 2's train set,
# used for hyper-parameter tuning and selection of the best model candidate
X_tr_3rd, X_valid2, y_tr_3rd, y_valid2 = train_test_split(
    X_tr_2nd, y_tr_2nd, test_size=1500, train_size=5000, random_state=0
)

# Report the shape of every split
print("Train:", X_tr_main.shape, y_tr_main.shape)
print("Test:", X_test.shape, y_test.shape)
print("Train1:", X_tr_2nd.shape, y_tr_2nd.shape)
print("Valid1:", X_valid1.shape, y_valid1.shape)
print("Train2:", X_tr_3rd.shape, y_tr_3rd.shape)
print("Valid2:", X_valid2.shape, y_valid2.shape)

Train: (155587, 344) (155587,)
Test: (38897, 344) (38897,)
Train1: (15000, 344) (15000,)
Valid1: (4000, 344) (4000,)
Train2: (5000, 344) (5000,)
Valid2: (1500, 344) (1500,)


In [21]:
# Class proportions in the tuning validation set.
# The Series method replaces the top-level pd.value_counts function, which is
# deprecated in recent pandas releases.
pd.Series(y_valid2).value_counts(normalize=True)

2    0.364667
0    0.324000
1    0.311333
dtype: float64

### Create a Random Forest pipeline

In [22]:
# Create Random Forest pipeline
# Pipeline used for hyper-parameter tuning: standardize -> PCA -> random forest
pipe_rf = Pipeline(
    [
        ("scaler", StandardScaler()),  # with standardization StandardScaler()
        (
            "PCA",
            # 200 components to explain 95% of the variance (see first part of this notebook).
            # random_state is fixed because PCA's automatic solver selection can pick a
            # randomized SVD for inputs of this size, which would otherwise make the
            # scores differ from one run to the next.
            PCA(n_components=200, random_state=0),
        ),
        ("rf", RandomForestClassifier(n_jobs=-1, random_state=0)),
    ]
)

In [23]:
# Get parameters
# pipe_rf.get_params()

### Define the grid of parameters
> * I have tested 2 sets of hyper-parameters: one with 6 parameters (the version below) and one with 4 parameters (indicated in the section where the confusion matrix is computed)

In [24]:
# Hyper-parameter grid for the "rf" step of the pipeline
grid_rf = ParameterGrid(
    {
        # Method of selecting samples for training each tree
        # ('True' tested but do not improve the performance)
        "rf__bootstrap": [False],
        # Quality-of-split criterion
        "rf__criterion": ["gini"],
        # Depth of trees
        "rf__max_depth": [5, 10],
        # Share of features (vs total number of features) considered
        # when looking for the best split
        "rf__max_features": [0.5, 0.8],
        # Minimum number of samples required at each leaf node
        "rf__min_samples_leaf": [2, 4],
        # Number of decision trees
        "rf__n_estimators": [2000],
    }
)

# Print the number of combinations
print("Number of combinations:", len(grid_rf))

Number of combinations: 8


### Run the model on the sub-train data set (5 000 tweets) and test accuracy on the validation data set (1 500 tweets)

In [25]:
# One record per hyper-parameter combination. Both lists receive the very
# same dict objects; valid_scores is the one read by the next cell.
train_scores = []
valid_scores = []

# Walk through the grid, numbering the combinations from 1
for i, params_dict in enumerate(grid_rf, 1):
    # Progress: current combination / total number of combinations
    print("Combination %d/%d" % (i, len(grid_rf)))

    # Apply this combination, then fit the whole pipeline
    # (scaler -> PCA -> random forest) on the sub-train set
    pipe_rf.set_params(**params_dict).fit(X_tr_3rd, y_tr_3rd)

    # Record the accuracy on the sub-train and validation sets
    # next to the parameters that produced it
    params_dict.update(
        accuracy_train=pipe_rf.score(X_tr_3rd, y_tr_3rd),
        accuracy_valid=pipe_rf.score(X_valid2, y_valid2),
    )

    # Save result
    train_scores.append(params_dict)
    valid_scores.append(params_dict)

print("done")

Combination 1/8
Combination 2/8
Combination 3/8
Combination 4/8
Combination 5/8
Combination 6/8
Combination 7/8
Combination 8/8
done


In [26]:
# One row per combination: parameters plus train / validation accuracy
scores_df = pd.DataFrame(valid_scores)
# Best validation accuracy first
scores_df.sort_values("accuracy_valid", ascending=False)

Unnamed: 0,rf__bootstrap,rf__criterion,rf__max_depth,rf__max_features,rf__min_samples_leaf,rf__n_estimators,accuracy_train,accuracy_valid
5,False,gini,10,0.5,4,2000,0.9692,0.732667
4,False,gini,10,0.5,2,2000,0.9812,0.731333
0,False,gini,5,0.5,2,2000,0.7348,0.716667
1,False,gini,5,0.5,4,2000,0.7336,0.716
6,False,gini,10,0.8,2,2000,0.9676,0.708
7,False,gini,10,0.8,4,2000,0.9612,0.708
2,False,gini,5,0.8,2,2000,0.7278,0.706
3,False,gini,5,0.8,4,2000,0.7268,0.706


### Side notes
> * I have tested several combinations of hyper-parameters separately due to limited computing resources (with different sizes of train/valid datasets, from 5000 to 15000 for the train dataset)
* It appears that the accuracy rates do not vary much and there is in any case some overfitting

In [27]:
# accuracy_train	accuracy_valid	rf__bootstrap	rf__criterion	rf__max_depth	rf__max_features	rf__min_samples_leaf	rf__n_estimators
# 2	0.9810	0.672000	True	gini	10	0.8	2	500
# 0	0.9826	0.669333	True	gini	10	0.5	2	500
# 1	0.9716	0.669333	True	gini	10	0.5	4	500
# 3	0.9718	0.658000	True	gini	10	0.8	4	500
# 4	0.9890	0.656000	False	gini	10	0.5	2	500
# 5	0.9862	0.656000	False	gini	10	0.5	4	500
# 6	0.9768	0.624667	False	gini	10	0.8	2	500
# 7	0.9704	0.622000	False	gini	10	0.8	4	500

In [28]:
# accuracy_train	accuracy_valid	rf__bootstrap	rf__criterion	rf__max_depth	rf__max_features	rf__min_samples_leaf	rf__n_estimators
# 2	0.9810	0.672000	True	gini	10	0.8	2	500
# 0	0.9826	0.669333	True	gini	10	0.5	2	500
# 1	0.9716	0.669333	True	gini	10	0.5	4	500
# 3	0.9718	0.658000	True	gini	10	0.8	4	500
# 4	0.9890	0.656000	False	gini	10	0.5	2	500
# 5	0.9862	0.656000	False	gini	10	0.5	4	500
# 6	0.9768	0.624667	False	gini	10	0.8	2	500
# 7	0.9704	0.622000	False	gini	10	0.8	4	500

In [29]:
# accuracy_train	accuracy_valid	rf__criterion	rf__max_depth	rf__max_features	rf__n_estimators
# 1	0.9672	0.698000	gini	10	0.5	2000
# 3	0.9614	0.695333	gini	10	0.8	2000
# 2	0.9622	0.693333	gini	10	0.8	500
# 0	0.9668	0.692000	gini	10	0.5	500

### Evaluation confusion matrix on Top 2 models (based on accuracy) with depth of 10 and 5

### Model 1: (4 hyper-parameters tuned: n_estimators = 2000, criterion = gini , max_features = 0.5, max_depth = 5), depth of 5 to limit overfitting

In [30]:
# Model 1 pipeline: standardize -> PCA -> random forest (max_depth=5)
pipe_rf1 = Pipeline(
    [
        ("scaler", StandardScaler()),  # with standardization StandardScaler()
        (
            "PCA",
            # 200 components to explain 95% of the variance (see first part of this notebook).
            # Seeded because PCA's automatic solver selection can pick a randomized SVD
            # for inputs of this size, which would make results differ between runs.
            PCA(n_components=200, random_state=0),
        ),
        (
            "rf",
            # warm_start is left at its default (False): with warm_start=True and an
            # unchanged n_estimators, fitting this pipeline a second time keeps the
            # previously grown trees while the scaler and PCA are refit.
            RandomForestClassifier(
                n_jobs=-1,
                n_estimators=2000,
                criterion="gini",
                max_features=0.5,
                max_depth=5,
                random_state=0,
            ),
        ),
    ]
)

In [31]:
# Fit the Model 1 pipeline (scaler -> PCA -> random forest) on the 5 000-row
# sub-train set; fit() returns the pipeline itself, so model_rf1 and pipe_rf1
# refer to the same fitted object
model_rf1 = pipe_rf1.fit(X_tr_3rd, y_tr_3rd)

In [32]:
# Predict the class of every row of the X_valid2 validation set
y_pred_rf1 = pipe_rf1.predict(X_valid2)

In [33]:
# Classification report: per-class precision, recall and F1-score of Model 1
# on the validation set (the three labels are named class 0 / 1 / 2)
target_names = ["class 0", "class 1", "class 2"]
print(classification_report(y_valid2, y_pred_rf1, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.78      0.81      0.80       486
     class 1       0.57      0.48      0.52       467
     class 2       0.73      0.80      0.76       547

    accuracy                           0.70      1500
   macro avg       0.69      0.70      0.69      1500
weighted avg       0.70      0.70      0.70      1500



### Model 1 based on larger set of data (train 15 000, test 4 000) - (4 hyper-parameters tuned: n_estimators = 2000, criterion = gini, max_features = 0.5, max_depth = 5), depth of 5 to limit overfitting

In [34]:
# Model 1 pipeline for the larger split: standardize -> PCA -> random forest (max_depth=5)
pipe_rf1b = Pipeline(
    [
        ("scaler", StandardScaler()),  # with standardization StandardScaler()
        (
            "PCA",
            # 200 components to explain 95% of the variance (see first part of this notebook).
            # Seeded because PCA's automatic solver selection can pick a randomized SVD
            # for inputs of this size, which would make results differ between runs.
            PCA(n_components=200, random_state=0),
        ),
        (
            "rf",
            # warm_start is left at its default (False): with warm_start=True and an
            # unchanged n_estimators, fitting this pipeline a second time keeps the
            # previously grown trees while the scaler and PCA are refit.
            RandomForestClassifier(
                n_jobs=-1,
                n_estimators=2000,
                criterion="gini",
                max_features=0.5,
                max_depth=5,
                random_state=0,
            ),
        ),
    ]
)

In [35]:
# Fit the pipeline (scaler -> PCA -> random forest) on the 15 000-row train
# set; fit() returns the pipeline itself, so model_rf1b and pipe_rf1b refer to
# the same fitted object
model_rf1b = pipe_rf1b.fit(X_tr_2nd, y_tr_2nd)

In [36]:
# Predict the class of every row of the X_valid1 validation set
y_pred_rf1b = pipe_rf1b.predict(X_valid1)

In [37]:
# Classification report: per-class precision, recall and F1-score on the
# 4 000-row validation set (the three labels are named class 0 / 1 / 2)
target_names = ["class 0", "class 1", "class 2"]
print(classification_report(y_valid1, y_pred_rf1b, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.77      0.79      0.78      1240
     class 1       0.59      0.52      0.55      1328
     class 2       0.74      0.80      0.77      1432

    accuracy                           0.70      4000
   macro avg       0.70      0.70      0.70      4000
weighted avg       0.70      0.70      0.70      4000



### Model 5 (6 hyper-parameters tuned: n_estimators = 2000, min_samples_leaf = 4, bootstrap = False, criterion = gini, max_features = 0.5, max_depth = 10), model with the highest accuracy (but with some overfit)

In [38]:
# Create Random Forest pipeline: standardization -> PCA -> Random Forest
pipe_rf2 = Pipeline(
    [
        ("scaler", StandardScaler()),  # with standardization StandardScaler()
        (
            "PCA",
            # 200 components to explain 95% of the variance (see first part of this notebook).
            # random_state seeds the randomized SVD solver (which PCA may select for large
            # inputs) so that a re-run yields the same projection; the forest is seeded too.
            PCA(n_components=200, random_state=0),
        ),
        (
            "rf",
            # warm_start is left at its default (False): the forest is fitted in one go, and
            # with warm_start=True a re-run of the fit cell would keep the already grown trees
            # (n_estimators unchanged -> no new trees) instead of refitting them on the
            # freshly recomputed scaler/PCA features.
            RandomForestClassifier(
                n_jobs=-1,
                n_estimators=2000,
                criterion="gini",
                min_samples_leaf=4,
                bootstrap=False,  # every tree is grown on the whole training set
                max_features=0.5,
                max_depth=10,
                random_state=0,
            ),
        ),
    ]
)

In [39]:
# Fit the Random Forest pipeline pipe_rf2 on X_tr_3rd / y_tr_3rd.
# Pipeline.fit returns the pipeline itself, so model_rf2 refers to the same fitted object as pipe_rf2.
model_rf2 = pipe_rf2.fit(X_tr_3rd, y_tr_3rd)

In [40]:
# Predict the class labels of the X_valid2 dataset with the fitted pipe_rf2 pipeline
y_pred_rf2 = pipe_rf2.predict(X_valid2)

In [41]:
# Classification report: per-class precision, recall, F1-score and support
# of the pipe_rf2 predictions against the y_valid2 labels
target_names = ["class 0", "class 1", "class 2"]
print(classification_report(y_valid2, y_pred_rf2, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.83      0.78      0.81       486
     class 1       0.60      0.61      0.61       467
     class 2       0.77      0.81      0.79       547

    accuracy                           0.74      1500
   macro avg       0.74      0.73      0.73      1500
weighted avg       0.74      0.74      0.74      1500



### Conclusions
> * The best candidate for the Random Forest classifier is **model 4** (n_estimators = 2 000, max_features = 0.5, max_depth = 5)
* This model reaches an acceptable level of accuracy (0.99 on train and 0.71 on validation). However, the gap between the two accuracies indicates substantial overfitting
* We will perform a 5-fold cross-validation in order to evaluate whether the overfitting can be limited despite the small number of data points (kept small because of limited computing resources)

## Run the selected Random Forest model on the Train/Test dataset using 5-fold cross-validation
> * It appears that (probably because the training dataset is small) the models tend to overfit and do not generalize well. We will try a 5-fold cross-validation approach to see whether the overfitting can be reduced despite the small number of data points (model 5)

### Create a cross validation object

In [42]:
# Create the Random Forest pipeline used as base estimator of the grid search.
# The number of PCA components and the forest hyper-parameters are left at their
# defaults here and are set through the parameter grid of GridSearchCV.
pipe_rf_grid = Pipeline(
    [
        ("scaler", StandardScaler()),  # with standardization StandardScaler()
        # random_state seeds the randomized SVD solver (which PCA may select for large
        # inputs) so that the cross-validation scores are reproducible, like the forest's.
        ("PCA", PCA(random_state=0)),
        ("rf", RandomForestClassifier(n_jobs=-1, random_state=0)),
    ]
)

In [43]:
# Create the cross-validation object: a grid search over a single hyper-parameter
# combination, scored with 5-fold cross-validation.
# NOTE(review): the grid fixes max_depth=10 without min_samples_leaf=4 / bootstrap=False,
# so it matches neither the max_depth=5 model selected in the conclusions above nor
# "Model 5" exactly — confirm which model is meant to be cross-validated.
grid_rf = GridSearchCV(
    estimator=pipe_rf_grid,
    param_grid=[
        {
            # nb of components explaining 95% of the variance
            "PCA__n_components": [200],
            # nb of decision trees
            "rf__n_estimators": [2000],
            # quality of split
            "rf__criterion": ["gini"],
            # % of features vs total number of features for looking at the best split
            "rf__max_features": [0.5],
            # depth of trees
            "rf__max_depth": [10],
        }
    ],
    cv=5,
)

In [44]:
# Fit estimator: run the cross-validated grid search on X_tr_3rd / y_tr_3rd
grid_rf.fit(X_tr_3rd, y_tr_3rd)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('PCA',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                      

In [45]:
# Get the results with "cv_results_": list the available keys
# (fit/score timings, parameter values, per-split and aggregated test scores)
grid_rf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_PCA__n_components', 'param_rf__criterion', 'param_rf__max_depth', 'param_rf__max_features', 'param_rf__n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [46]:
# Pull the hyper-parameter and score arrays out of "cv_results_";
# they become the columns of the summary dataframe
pca, n_estimators, criterion, mean_te, std_test_score = [
    grid_rf.cv_results_[key]
    for key in (
        "param_PCA__n_components",
        "param_rf__n_estimators",
        "param_rf__criterion",
        "mean_test_score",
        "std_test_score",
    )
]

In [50]:
# Build the summary dataframe with from_dict (from_items is deprecated);
# the OrderedDict keeps the columns in the order listed below
df_rf_grid = pd.DataFrame.from_dict(
    OrderedDict(
        [
            ("pca", pca),
            ("n_estimators", n_estimators),
            ("criterion", criterion),
            ("mean_te", mean_te),
            ("std_test_score", std_test_score),
        ]
    )
)
# Display the combinations from the best to the worst mean test score
df_rf_grid.sort_values(by="mean_te", ascending=False)

Unnamed: 0,pca,n_estimators,criterion,mean_te,std_test_score
0,200,2000,gini,0.7012,0.012698


### Conclusions on Random Forest with cross-validation
> * The result is not really conclusive, as the 2nd best model (with the same set of tuned hyper-parameters) performs worse with cross-validation (accuracy on test 0.8262) than the same model with the train/valid dataset (acc. 0.85)
* As a consequence, we will select the standard train/valid approach, which provides slightly better results and is less demanding in computing resources

### Save results for later visualization and overall selection - Best Random Forest Model with 4 hyper-parameters (n_estimators = 2000, criterion = 'gini', max_features = 0.5, max_depth = 5)

In [None]:
# Metrics of the selected Random Forest model, persisted with %store so that they
# can be reloaded later (%store -r) for visualization and overall model selection.
# NOTE(review): these values are typed in by hand rather than computed from the
# predictions, and they do not exactly match the classification reports shown in this
# part of the notebook (e.g. accuracy 0.70 and class 1 F1-score 0.52 for pipe_rf1) —
# confirm which run they come from.
rf_acc = 0.71  # overall accuracy
c1_rf_f1 = 0.80  # presumably the F1-score of "class 0" — TODO confirm
c2_rf_f1 = 0.53  # presumably the F1-score of "class 1" — TODO confirm
c3_rf_f1 = 0.76  # presumably the F1-score of "class 2" — TODO confirm

%store rf_acc
%store c1_rf_f1
%store c2_rf_f1
%store c3_rf_f1