In [100]:
import pandas as pd
import numpy as np

In [101]:
# Load the prepared dataset, using the 'global_rank' column as the row index.
df = pd.read_csv(
    filepath_or_buffer=r'dataset_final.csv',
    index_col='global_rank',
)

In [102]:
# Discard the 'Unnamed: 0' column; rebinding the name avoids in-place mutation.
df = df.drop(columns=['Unnamed: 0'])

In [103]:
# Parse the release date column into datetime values.
df['release_date'] = pd.to_datetime(df['release_date'])

In [104]:
# Display the column labels of the loaded frame.
df.columns

Index(['title', 'id', 'release_date', 'day', 'month', 'year',
       'max_theaters_used', 'first_day_sales', 'first_weekend_sales',
       'first_week_sales', 'total_sales', 'viewers_by_theaters', 'tmdb_id',
       'budget', 'imdb_id', 'original_language', 'original_title', 'tagline',
       'cast1', 'cast1_popularity', 'cast2', 'cast2_popularity', 'cast3',
       'cast3_popularity', 'cast_popularity', 'genre1'],
      dtype='object')

In [105]:
# Keep only rows with a strictly positive theater count.
# The explicit .copy() makes the result an independent frame: later cells add
# columns to `df`, and writing into a boolean-mask slice of the original frame
# is the classic source of SettingWithCopyWarning.
df = df.loc[df['max_theaters_used'] > 0].copy()

In [106]:
# Remove the 'original_title' column; rebinding the name avoids in-place mutation.
df = df.drop(columns=['original_title'])

In [107]:
# Summary statistics for the theater count and the two sales-related measures.
df.loc[:, ['max_theaters_used', 'total_sales', 'viewers_by_theaters']].describe()

Unnamed: 0,max_theaters_used,total_sales,viewers_by_theaters
count,3958.0,3958.0,3958.0
mean,239.103588,491818.9,1981.812026
std,213.32996,974978.0,11812.947421
min,1.0,35.0,11.0
25%,65.0,48293.75,608.25
50%,178.0,145014.5,1019.0
75%,353.75,479181.0,1770.5
max,1205.0,19490690.0,425480.0


In [108]:
# Grade commercial success on two axes, each split into quintiles labelled 1-5:
#   success_abs -> quintile of total sales
#   success_rel -> quintile of viewers per theater
# The two grades are added together in a later cell.

bin_labels_5 = list(range(1, 6))
for grade_col, measure_col in (
    ('success_abs', 'total_sales'),
    ('success_rel', 'viewers_by_theaters'),
):
    df[grade_col] = pd.qcut(df[measure_col], q=5, labels=bin_labels_5)

In [109]:
# Turn the categorical quintile grades into plain integers so they can be added.
# The previous identity replace([1..5], [1..5], inplace=True) only produced ints
# as a side effect of older pandas releases: on current versions the columns stay
# categorical (and the chained in-place call may not write back to `df` at all),
# which makes the later `success_abs + success_rel` fail with a TypeError.
# astype(int) performs the conversion explicitly; it raises if a grade is missing.
df['success_rel'] = df['success_rel'].astype(int)
df['success_abs'] = df['success_abs'].astype(int)

In [110]:
# Combined score: absolute grade plus relative grade.
df['success_total'] = df['success_abs'].add(df['success_rel'])

In [111]:
# Print the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 3958 entries, 1.0 to 3959.0
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   title                3958 non-null   object        
 1   id                   3958 non-null   float64       
 2   release_date         3958 non-null   datetime64[ns]
 3   day                  3958 non-null   float64       
 4   month                3958 non-null   float64       
 5   year                 3958 non-null   float64       
 6   max_theaters_used    3958 non-null   float64       
 7   first_day_sales      3811 non-null   float64       
 8   first_weekend_sales  3791 non-null   float64       
 9   first_week_sales     3958 non-null   float64       
 10  total_sales          3958 non-null   float64       
 11  viewers_by_theaters  3958 non-null   float64       
 12  tmdb_id              2351 non-null   float64       
 13  budget               3958 n

In [112]:
# Frequency of each combined success score.
df['success_total'].value_counts()

10    569
7     487
5     469
6     462
2     456
4     430
3     413
8     389
9     283
Name: success_total, dtype: int64

In [113]:
# Bucket the combined score into quartiles labelled 1 to 4.
bin_labels_4 = list(range(1, 5))
df['hit_or_fail'] = pd.qcut(df['success_total'], 4, labels=bin_labels_4)

In [114]:
# Inspect the resulting quartile label per row.
df.hit_or_fail

global_rank
1.0       4
2.0       4
3.0       4
4.0       4
5.0       4
         ..
3954.0    1
3955.0    1
3956.0    1
3957.0    1
3959.0    1
Name: hit_or_fail, Length: 3958, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]

In [115]:
# Keep 'en' and 'fr' as-is; every other value (including missing ones) becomes 'others'.
main_languages = ['en', 'fr']
is_main_language = df['original_language'].isin(main_languages)
df['original_language'] = df['original_language'].where(is_main_language, 'others')

In [116]:
# Frequency of each primary genre (missing values are not counted).
df.genre1.value_counts()

Drama              796
Comedy             561
Action             191
Thriller           108
Animation          108
Documentary         82
Adventure           80
Horror              70
Crime               69
Romance             58
Family              42
Science Fiction     40
Fantasy             26
Music               20
War                 17
Mystery             14
History             14
Western              9
TV Movie             1
Name: genre1, dtype: int64

In [117]:
# First 20 rows whose primary genre is missing, showing title and genre only.
df.loc[df['genre1'].isna(), ['title', 'genre1']].head(20)

Unnamed: 0_level_0,title,genre1
global_rank,Unnamed: 1_level_1,Unnamed: 2_level_1
23.0,Les Petits mouchoirs,
24.0,Les Aventures de Tintin : Le secret de la Licorne,
31.0,Rogue One: A Star Wars Story,
51.0,Les Nouvelles aventures d'Aladin,
88.0,L'Age de glace : les lois de l'Univers,
145.0,Les Schtroumpfs,
157.0,Transformers 3 - La Face cachée de la Lune,
193.0,Les Croods,
194.0,Transformers : l'âge de l'extinction,
208.0,Suicide Squad,


In [118]:
# Hand-assigned primary genres, keyed by global_rank (the frame's index).
manual_genres = {
    23: 'Drama',
    24: 'Action',
    51: 'Comedy',
    31: 'Science Fiction',
    88: 'Animation',
    145: 'Animation',
    193: 'Animation',
    194: 'Action',
    208: 'Action',
    211: 'Animation',
    212: 'Comedy',
    217: 'Drama',
    218: 'Action',
    226: 'Action',
    229: 'Comedy',
    233: 'Comedy',
    234: 'Romance',
    239: 'Romance',
    255: 'Thriller',
    282: 'Western',
    266: 'Romance',
    157: 'Action',
    240: 'Action',
    276: 'Animation',
    277: 'Comedy',
    292: 'Action',
}

# Assigning through .loc with a label that is absent from the index does not
# fail: it silently appends a brand-new row filled with NaN. Only patch ranks
# that actually exist and report the ones that were skipped.
missing_ranks = []
for rank, genre in manual_genres.items():
    if rank in df.index:
        df.loc[rank, 'genre1'] = genre
    else:
        missing_ranks.append(rank)

if missing_ranks:
    print('Skipped genre fixes for ranks not present in the index:', missing_ranks)

In [119]:
# Preview titles containing a '2', '3' or '4' character or the text 'Star Wars'.
# One alternation pattern replaces the four separate substring searches.
sequel_pattern = '2|3|4|Star Wars'
df.loc[df['title'].str.contains(sequel_pattern).eq(True)].head()

Unnamed: 0_level_0,title,id,release_date,day,month,year,max_theaters_used,first_day_sales,first_weekend_sales,first_week_sales,...,cast2,cast2_popularity,cast3,cast3_popularity,cast_popularity,genre1,success_abs,success_rel,success_total,hit_or_fail
global_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3.0,Star Wars: Le Réveil de la Force,13345.0,2015-12-16,16.0,12.0,2015.0,1093.0,619020.0,2705096.0,3801235.0,...,,,,,,Action,5,5,10,4
4.0,Le Roi Lion (2019),17528.0,2019-07-17,17.0,7.0,2019.0,820.0,630478.0,2559370.0,3252896.0,...,,,,,,Adventure,5,5,10,4
7.0,La Reine des neiges 2,17530.0,2019-11-20,20.0,11.0,2019.0,1020.0,361818.0,1979782.0,2137267.0,...,,,,,,Family,5,5,10,4
8.0,Star Wars: Les derniers Jedi,15113.0,2017-12-13,13.0,12.0,2017.0,1027.0,503727.0,2188853.0,2510462.0,...,,,,,,Science Fiction,5,5,10,4
14.0,Harry Potter et les reliques de la mort - part...,10193.0,2011-07-13,13.0,7.0,2011.0,833.0,732206.0,2532629.0,3129485.0,...,,,,,,Adventure,5,5,10,4


In [120]:
# Flag (1/0) titles containing a '2', '3' or '4' character or the text 'Star Wars'.
# NOTE(review): any digit occurrence matches, including years such as "(2019)" —
# confirm that this is the intended definition of a franchise title.
licence_mask = df['title'].str.contains('2|3|4|Star Wars').eq(True)
df['licence'] = np.where(licence_mask, 1, 0)

In [121]:
#Encode language

In [122]:
# Inspect the bucketed language column before encoding it.
df.original_language

global_rank
1.0           fr
2.0           en
3.0           fr
4.0           en
5.0           fr
           ...  
3954.0        en
3955.0        en
3956.0        en
3957.0        fr
3959.0    others
Name: original_language, Length: 3958, dtype: object

In [123]:
# One-hot encode the language column into 'lang_*' indicator columns.
df_dummies = pd.get_dummies(
    data=df,
    prefix='lang',
    columns=['original_language'],
)

In [124]:
# Display the column labels after the language encoding.
df_dummies.columns

Index(['title', 'id', 'release_date', 'day', 'month', 'year',
       'max_theaters_used', 'first_day_sales', 'first_weekend_sales',
       'first_week_sales', 'total_sales', 'viewers_by_theaters', 'tmdb_id',
       'budget', 'imdb_id', 'tagline', 'cast1', 'cast1_popularity', 'cast2',
       'cast2_popularity', 'cast3', 'cast3_popularity', 'cast_popularity',
       'genre1', 'success_abs', 'success_rel', 'success_total', 'hit_or_fail',
       'licence', 'lang_en', 'lang_fr', 'lang_others'],
      dtype='object')

In [125]:
# Prepare genres for the same one-hot treatment: keep the five major genres
# and fold every other value (including missing ones) into 'others'.
major_genres = ['Comedy', 'Drama', 'Action', 'Animation', 'Thriller']
is_major_genre = df_dummies['genre1'].isin(major_genres)
df_dummies['genre1'] = df_dummies['genre1'].where(is_major_genre, 'others')

In [126]:
# One-hot encode the bucketed genre into 'genre_*' indicator columns.
df_dummies = pd.get_dummies(
    data=df_dummies,
    prefix='genre',
    columns=['genre1'],
)

In [127]:
# Encode the release month as a quarter feature:
# first label each row with its quarter (1 to 4), then one-hot encode it with get_dummies.

In [128]:
# Map the release month (1-12) onto calendar quarters using fixed bin edges.
# pd.qcut derives its edges from the quantiles of the observed months, so the
# boundaries would follow the release distribution instead of sitting at the
# end of March, June and September.
bin_labels_q = [1, 2, 3, 4]
quarter_edges = [0, 3, 6, 9, 12]  # right-inclusive bins: (0,3], (3,6], (6,9], (9,12]

df_dummies['quarter'] = pd.cut(df_dummies['month'], bins=quarter_edges, labels=bin_labels_q)

In [129]:
# One-hot encode the quarter label into 'Q_*' indicator columns.
df_dummies = pd.get_dummies(
    data=df_dummies,
    prefix='Q',
    columns=['quarter'],
)

In [130]:
#FEATURES SELECTION

In [131]:
# Display the column labels available after all encodings.
df_dummies.columns

Index(['title', 'id', 'release_date', 'day', 'month', 'year',
       'max_theaters_used', 'first_day_sales', 'first_weekend_sales',
       'first_week_sales', 'total_sales', 'viewers_by_theaters', 'tmdb_id',
       'budget', 'imdb_id', 'tagline', 'cast1', 'cast1_popularity', 'cast2',
       'cast2_popularity', 'cast3', 'cast3_popularity', 'cast_popularity',
       'success_abs', 'success_rel', 'success_total', 'hit_or_fail', 'licence',
       'lang_en', 'lang_fr', 'lang_others', 'genre_Action', 'genre_Animation',
       'genre_Comedy', 'genre_Drama', 'genre_Thriller', 'genre_others', 'Q_1',
       'Q_2', 'Q_3', 'Q_4'],
      dtype='object')

In [132]:
# Write the encoded frame (index included) to CSV.
df_dummies.to_csv(path_or_buf='dataset_dummies2.csv')