In [1]:
# Importing useful libraries for data pre-processing and for running the machine-learning model

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Reading dataframe

In [4]:
# Load the raw movies table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/movies_data.csv')

In [5]:
# Displaying columns

In [6]:
# Show the column labels of the freshly loaded frame (rendered as the cell output).
df.columns

Index(['Titre', 'Réalisateur', 'Scénariste', 'Compositeur', 'Directeur photo',
       'Directeur montage', 'Acteur 1', 'Acteur 2', 'Acteur 3', 'Genre',
       'Popularité genre', 'Popularité thème', 'Rareté émotion', 'Référence',
       'Budget (M$)', 'Box Office (M$)', 'Rentabilité (%)', 'Succès'],
      dtype='object')

In [7]:
######################################################################################################################

In [8]:
# Defining the unique ID column and the target feature we want to predict

In [9]:
# ID: column holding the movie title, used as the row identifier.
# target: column holding the label the classifier is trained to predict.
ID, target = "Titre", "Succès"

In [10]:
# Listing the categorical features, to be transformed later into dummy variables

In [11]:
# Categorical columns (crew, cast and genre) that are one-hot encoded further down.
features_categ = [
    'Réalisateur',
    'Scénariste',
    'Compositeur',
    'Directeur photo',
    'Directeur montage',
    'Acteur 1',
    'Acteur 2',
    'Acteur 3',
    'Genre',
]

In [12]:
# Getting numerical features

In [13]:
# Numerical columns that are fed to the model as-is (no encoding step).
features_numerical = [
    'Popularité genre',
    'Popularité thème',
    'Rareté émotion',
    'Référence',
    'Budget (M$)',
]

In [14]:
# Defining which column we want to keep

In [15]:
# Column order of the working frame: identifier, numerical features,
# categorical features, then the target.
to_keep = [ID]
to_keep.extend(features_numerical)
to_keep.extend(features_categ)
to_keep.append(target)

In [16]:
# Keeping in our dataframe only the columns we want

In [17]:
# Restrict the frame to the selected columns, in the order given by to_keep.
# Columns not listed there (e.g. 'Box Office (M$)', 'Rentabilité (%)') are dropped.
df = df.loc[:, to_keep]

In [18]:
# Getting dummies for the categorical features

In [19]:
# One-hot encode every categorical column; each distinct value becomes its own
# '<column>_<value>' indicator column, while the other columns pass through unchanged.
df = pd.get_dummies(data=df, columns=features_categ)

In [20]:
# Creating train and test dataframes by dividing df into two different dataframes 

In [21]:
# Number of leading rows used for training; every remaining row is held out for
# testing. Kept in one place so the two slices below cannot drift apart.
N_TRAIN = 258

# Positional, unshuffled split: rows [0, N_TRAIN) train the model and rows
# [N_TRAIN, end) evaluate it.
# NOTE(review): this relies on the row order of the CSV being meaningful
# (e.g. chronological) -- confirm against the data file.
df_train = df.iloc[:N_TRAIN]
df_test = df.iloc[N_TRAIN:]

In [22]:
# Dropping ID and target from the inputs: X_train and X_test

In [23]:
# Feature matrices: every column except the identifier and the label.
X_train = df_train.drop(columns=[ID, target])
X_test = df_test.drop(columns=[ID, target])

In [24]:
# Getting outputs as y_train and y_test

In [25]:
# Label vectors matching X_train / X_test row for row.
y_train, y_test = df_train[target], df_test[target]

In [26]:
# Creating Random Forest Classifier

In [27]:
# Random forest of 500 depth-limited trees. class_weight='balanced' re-weights the
# classes by their frequency in y, and random_state pins the seed so that the fit
# is reproducible. verbose=1 makes fit/predict print joblib progress lines.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=6,
    class_weight='balanced',
    random_state=46,
    verbose=1,
)

In [28]:
# Training the model using the training dataset

In [29]:
# Fit the forest on the training features and labels; fit() returns the estimator
# itself, which is why its repr shows up as the cell output.
clf.fit(X=X_train, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=6, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=None, oob_score=False,
                       random_state=46, verbose=1, warm_start=False)

In [30]:
# Creating y_pred variable representing the predictions made by the model on "Succès" feature
# for each row we wanted to predict

In [31]:
# Predicted class label for every held-out row.
y_pred = clf.predict(X=X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished


In [32]:
# Getting the predicted class probabilities

In [33]:
# Per-class probabilities for every held-out row: one row per sample and one
# column per class, as shown in the displayed array.
y_pred_proba = clf.predict_proba(X=X_test)
y_pred_proba

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished


array([[0.32015453, 0.29553332, 0.38431215],
       [0.30938733, 0.2699008 , 0.42071187],
       [0.31684666, 0.2561062 , 0.42704714],
       [0.3272293 , 0.25748274, 0.41528796],
       [0.30685592, 0.26351186, 0.42963222],
       [0.30456167, 0.26490977, 0.43052856],
       [0.33438465, 0.2554182 , 0.41019716],
       [0.3384105 , 0.31060434, 0.35098516],
       [0.32331219, 0.26047805, 0.41620976],
       [0.34019462, 0.24002571, 0.41977967],
       [0.34889994, 0.29017517, 0.3609249 ],
       [0.33270525, 0.27458281, 0.39271194],
       [0.36705392, 0.28958486, 0.34336122],
       [0.36138931, 0.28804283, 0.35056785],
       [0.33425885, 0.24847854, 0.41726261],
       [0.33176354, 0.25630144, 0.41193502],
       [0.32662034, 0.28871075, 0.38466891],
       [0.31508146, 0.32197014, 0.3629484 ],
       [0.30301943, 0.26093926, 0.43604131],
       [0.33456588, 0.28029336, 0.38514077]])

In [34]:
# Adding the values we need for the analysis to the X_test dataframe

In [35]:
# Re-attach the movie title so that each prediction can be read next to its film.
# NOTE(review): from here on X_test carries non-feature columns, so re-running the
# predict cells without first rebuilding X_test would pass them to the model.
X_test['Titre'] = df_test['Titre']

In [36]:
# Store the model's predicted label for each held-out row next to its features.
X_test['y_pred'] = y_pred

In [37]:
# Store the true label next to the prediction so the two can be compared per row.
X_test['y_test'] = y_test

In [38]:
# Display the held-out rows together with their title, predicted label and true label.
X_test

Unnamed: 0,Popularité genre,Popularité thème,Rareté émotion,Référence,Budget (M$),Réalisateur_Adam McKay,Réalisateur_Adam Robitel,Réalisateur_Alejandro González Iñárritu,Réalisateur_Alex Garland,Réalisateur_Alex Proyas,...,Genre_Horreur,Genre_Policier,Genre_Romance,Genre_SF,Genre_Super-héros,Genre_Thriller,Genre_Western,Titre,y_pred,y_test
258,2,2,1,1,90.0,0,0,0,0,0,...,0,0,0,0,0,0,0,Once upon a time in Hollywood,2,2
259,2,2,2,3,60.0,1,0,0,0,0,...,0,0,0,0,0,0,0,Vice,2,1
260,2,2,2,3,15.0,0,0,0,0,0,...,0,0,0,0,0,0,0,Blackklansman,2,2
261,2,2,2,2,15.0,0,0,0,0,0,...,0,0,0,0,0,0,0,The Favourite,2,2
262,3,2,2,3,23.0,0,0,0,0,0,...,0,0,0,0,0,0,0,Green book,2,2
263,3,3,3,2,200.0,0,0,0,0,0,...,0,0,0,1,0,0,0,Alita,2,2
264,3,1,2,0,10.0,0,0,0,0,0,...,0,0,0,0,0,0,0,The Dead don't die,2,1
265,2,1,1,1,150.0,0,0,0,0,0,...,0,0,0,0,0,0,0,Detective Pikachu,2,2
266,2,2,3,3,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,Rocketman,2,2
267,0,2,2,1,9.0,0,1,0,0,0,...,1,0,0,0,0,0,0,Escape room,2,2


In [39]:
################################
#### performance indicators ####

In [40]:
# Getting the micro-averaged precision of the model (for single-label multiclass predictions this equals the accuracy)

In [41]:
# Micro-averaged precision: true positives summed over all classes, divided by
# the total number of predictions. For single-label multiclass predictions this
# is the same number as the overall accuracy.
# No pos_label is passed: it only applies to average='binary', and 'positive'
# is not one of the labels of this multiclass target.
print("Precision Score : ", precision_score(y_test, y_pred, average='micro'))

Precision Score :  0.85




In [42]:
# Confusion matrix

In [43]:
# Confusion matrix of the held-out rows: row i / column j counts the samples whose
# true class is i and whose predicted class is j.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 2,  0,  1],
       [ 0,  0,  2],
       [ 0,  0, 15]])

In [44]:
#### performance indicators ####
################################

In [45]:
# Getting the feature importances

In [46]:
# Wrap the fitted importances in a Series. No index is supplied, so each entry is
# labelled by the integer position of its feature in the training matrix.
importance = pd.Series(data=clf.feature_importances_)

In [47]:
# Getting the most important features by sorting it

In [48]:
# Rank the features from most to least important; this returns a new sorted
# Series and leaves `importance` itself in its original order.
importance.sort_values(ascending=False)

2       0.032653
4       0.030472
3       0.015895
747     0.014512
640     0.011719
545     0.011297
1       0.010805
976     0.010494
1551    0.009569
1560    0.009428
0       0.009140
673     0.009119
413     0.008563
361     0.007756
244     0.007723
690     0.007217
375     0.006023
1193    0.006012
36      0.006002
63      0.005943
150     0.005837
170     0.005795
1565    0.005389
443     0.005260
1569    0.005122
1567    0.005096
9       0.004691
344     0.004416
56      0.004226
1027    0.004153
          ...   
622     0.000000
1346    0.000000
1345    0.000000
705     0.000000
686     0.000000
702     0.000000
701     0.000000
700     0.000000
699     0.000000
698     0.000000
697     0.000000
1333    0.000000
692     0.000000
691     0.000000
1334    0.000000
687     0.000000
1336    0.000000
1344    0.000000
684     0.000000
683     0.000000
681     0.000000
677     0.000000
1341    0.000000
675     0.000000
1342    0.000000
1343    0.000000
670     0.000000
669     0.0000

In [49]:
# Displaying names of the most important columns of the feature importance

In [50]:
# Pair every importance with the name of the column it was computed for, sorted
# from most to least important.
# The names are taken from X_train -- the exact matrix passed to clf.fit -- and not
# from X_test: by this point X_test has had 'Titre', 'y_pred' and 'y_test' appended,
# so its columns only lined up with the importances because zip() silently
# truncates to the shorter input.
sorted(zip(clf.feature_importances_, X_train.columns), reverse=True)

[(0.03265275467905801, 'Rareté émotion'),
 (0.030472105390347132, 'Budget (M$)'),
 (0.015895117033611107, 'Référence'),
 (0.01451153040123768, 'Directeur montage_Hervé de Luze'),
 (0.011718708557561875, 'Directeur photo_PaPaweł Edelman'),
 (0.011297013734105158, 'Directeur photo_Benoît Debie'),
 (0.01080503512465339, 'Popularité thème'),
 (0.010494199690997165, 'Acteur 1_Joaquin Phoenix'),
 (0.009569109690723469, 'Genre_Action'),
 (0.00942750705407894, 'Genre_Fantastique'),
 (0.009139766980914654, 'Popularité genre'),
 (0.009119096925991527, 'Directeur photo_Stéphane Fontaine'),
 (0.008563148573047215, 'Compositeur_Daniel Pemberton'),
 (0.0077560927035606846, 'Scénariste_Steven Knight'),
 (0.007722508136893168, 'Scénariste_Gaspar Noé'),
 (0.007217208619263871, 'Directeur montage_Alexander Berner'),
 (0.006022614503006194, 'Scénariste_Wachowskis'),
 (0.006012231703372192, 'Acteur 2_Kate Winslet'),
 (0.00600246246501806, 'Réalisateur_Danny Boyle'),
 (0.005943283868568664, 'Réalisateur_Gu