In [3]:

# librairies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from random import shuffle
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, precision_score, recall_score, mean_squared_error

# import des données

pd.set_option('display.max_columns', None)
yt = pd.read_csv('challenge_youtube_toxic.csv', encoding = 'latin1', engine = 'python', sep=';')

# Préparation des données (input et output)
# On utilise LabelEncoder pour remplacer les colonnes contenant du texte, par des numéros.
# Pour chaque colonne, on attribue un numéro unique aux valeurs. S'il y a deux valeurs identiques dans la colonne
# contenant des chaînes de caractères, le numéro sera le même. Par exemple, 'LeHuffPost' aura un numéro unique dans
# la colonne channel_name_n.

X = yt
y = yt['nbrMotInsulte'] # variable de sortie
X.drop(['nbrMotInsulte'], axis = 1)

le_channel_name = LabelEncoder()
le_video_id_court = LabelEncoder()
le_video_id = LabelEncoder()
# le_channel_id = LabelEncoder() # on n'utilise pas channel_id_n car channel_name_n contient déjà toutes les informations.
le_categorie_new = LabelEncoder()
le_categ_inst = LabelEncoder()

# Dans ce qui suit, on rajoute les colonnes numériques, et supprimons les colonnes chaînes de caractère.
# nous n'avons pas besoin des colonnes video_id_n et video_id_court_n car toutes les valeurs sont différentes
# et n'aideront pas à la séparation au niveau d'un noeud dans l'arbre de décision.

X['channel_name_n'] = le_channel_name.fit_transform(X['channel_name'])
# X['video_id_court_n'] = le_video_id_court.fit_transform(X['video_id_court'])
# X['video_id_n'] = le_video_id.fit_transform(X['video_id'])
# X['channel_id_n'] = le_channel_id.fit_transform(X['channel_id'])
X['categorie_new_n'] = le_categorie_new.fit_transform(X['categorie_new'])
X['categ_inst_n'] = le_categ_inst.fit_transform(X['categ_inst'])

X = X.drop(['video_id_court', 'video_id', 'channel_id', 'channel_name',\
                  'categorie_new', 'categ_inst'], axis = 'columns')

# Les données d'entrainement et de test.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

# On construit le modèle d'arbre de décision + fit sur le training set.

model = tree.DecisionTreeRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test) # y_prediction

# Validating the model ?
# print(y_pred, y_test)

print(f'Mean absolute error is{mean_absolute_error(y_test, y_pred) : .2f}')

# Other metrics for validation ?
# print(f'Accuracy score is{accuracy_score(y_test, y_pred) : .2f}')
# print(f'F1-score is{f1_score(y_test, y_pred, average="macro") : .2f}')
# print(f'Precision is{precision_score(y_test, y_pred, average="macro"): .2f}')
# print(f'Recall is{recall_score(y_test, y_pred, average="macro"): .2f}')

# on essaye différents paramètres afin d'optimiser l'arbre de décision

# criteria = ['mse', 'friedman_mse', 'mae']
# splitting_criteria = ['best', 'random']
tree_depth = [i for i in range(4, 25)] # le mse est trop grand si la profondeur de 
                                        # l'arbre est trop faible et il faut faire attention à ne pas overffiter
sample_split = [i for i in range(2,10)]
sample_leaf = [i for i in range(1,6)]

# on garde les mêmes train set et test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)
best_depth = 0
best_min_samples_split = 0
best_min_samples_leaf = 0
best_mse = 10**4
for depth in tree_depth :
    for split_sample in sample_split :
        for leaf_sample in sample_leaf :
            model = tree.DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=depth, min_samples_split=split_sample, min_samples_leaf=leaf_sample)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            if mean_squared_error(y_test, y_pred) < best_mse :
                best_mse = mean_squared_error(y_test, y_pred)
                best_depth = depth
                best_min_samples_split = split_sample
                best_min_samples_leaf = leaf_sample
            else :
                best_mse = best_mse
            print('( Tree depth :', depth, ', min_samples_split :', split_sample,\
                  ', min_samples_leaf :', leaf_sample, ')', 'Mean squared error :',\
                  mean_squared_error(y_test, y_pred))
print('( Best depth :', best_depth, ', best_min_samples_split :', best_min_samples_split,\
      ', best_min_samples_leaf :', best_min_samples_leaf, ')', 'Best mean squared error :',\
      best_mse)

# K-fold pou le cross-validation, avec les meilleurs paramètres qu'on a trouvé ci-dessus

scores = []
from sklearn.model_selection import KFold
kf = KFold(n_splits = 6)
for train_index, test_index in kf.split(X) :
    # print(train_index, test_index)
    # print(type(train_index))
    X_train, X_test, y_train, y_test = X.to_numpy()[train_index, :], X.to_numpy()[test_index, :], y.to_numpy()[train_index], y.to_numpy()[test_index]
    model = tree.DecisionTreeRegressor(max_depth=10, min_samples_split=2, min_samples_leaf=1)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print('Score du modèle :', np.mean(scores))

Avec plusieurs run du modèle DT, on s'aperçoit que le score varie entre 0.70 et 0.94.

RANDOM FOREST REGRESSOR

# RandomForestRegressor with cross-validation

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
random_forest_score = []
kf = KFold(n_splits = 6)
for train_index, test_index in kf.split(X) :
    X_train, X_test, y_train, y_test = X.to_numpy()[train_index, :], X.to_numpy()[test_index, :], y.to_numpy()[train_index], y.to_numpy()[test_index]
    rd = RandomForestRegressor(n_estimators=40, max_depth=10, min_samples_split=2, min_samples_leaf=1)
    rd.fit(X_train, y_train)
    random_forest_score.append(rd.score(X_test, y_test))
print(np.mean(random_forest_score))


SyntaxError: invalid syntax (<ipython-input-3-b8bcda159987>, line 120)