# Imports

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import boxcox, chi2_contingency, f, f_oneway, probplot
import seaborn as sns

# Loading

In [30]:
# importation du modèle
df = pd.read_csv('films_db2.csv')

# Overview

In [21]:
df.shape

(2366, 21)

In [33]:
df.columns

Index(['title', 'year', 'id_jp', 'url_allo', 'director', 'country', 'duration',
       'genre', 'first_day', 'first_week', 'first_weekend', 'hebdo_rank',
       'total_spectator', 'copies', 'rating_press', 'rating_public', 'casting',
       'budget', 'lang', 'visa', 'award'],
      dtype='object')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2366 entries, 0 to 2365
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            2366 non-null   object 
 1   year             2366 non-null   int64  
 2   id_jp            1645 non-null   object 
 3   url_allo         1645 non-null   object 
 4   director         2366 non-null   object 
 5   country          2366 non-null   object 
 6   duration         2366 non-null   int64  
 7   genre            2366 non-null   object 
 8   first_day        2366 non-null   int64  
 9   first_week       2366 non-null   int64  
 10  first_weekend    2366 non-null   int64  
 11  hebdo_rank       2366 non-null   int64  
 12  total_spectator  2366 non-null   int64  
 13  copies           2366 non-null   int64  
 14  rating_press     1645 non-null   float64
 15  rating_public    1645 non-null   float64
 16  casting          1645 non-null   object 
 17  budget        

In [13]:
df.head

<bound method NDFrame.head of                        title  year                             id_jp  \
0           7h58 ce samedila  2007  0010478f4e067fedf4634d0a06f0f0ad   
1            menteur menteur  1997  00cc02716a6d42f38dd2efaa773b6900   
2                dream house  2011  02acc86450b3f7c7eb34ecc6c4638bea   
3                      paris  2008  02d8e008f9b224bc32c31ff34d4b90b8   
4                 two lovers  2008  03101f932285a244f3887c89e43f519d   
...                      ...   ...                               ...   
2361                paycheck  2003  fc217cad46530edea35d642d6876e87e   
2362          licorice pizza  2021  fcc1c34a7cc6b061a61cc2b20b088497   
2363             after earth  2013  fd06145978961e804a966255aa2aa595   
2364        un jour sans fin  1993  fd2184dca7908ce9669fc991ec829b38   
2365  mon pere est ingenieur  2004                               NaN   

                                               url_allo              director  \
0     https://www.alloci

In [23]:
df.sample(5)

Unnamed: 0,title,year,id_jp,url_allo,director,country,duration,genre,first_day,first_week,...,hebdo_rank,total_spectator,copies,rating_press,rating_public,casting,budget,lang,visa,award
32,meurtre en suspens,1995,139a67541f36505d95e764afa46e0026,https://www.allocine.fr/film/fichefilm_gen_cfi...,john badham,etatsunis,5400,thriller,-1,261958,...,5,437221,113,-1.0,2.7,"[""Johnny Depp"", ""Christopher Walken"", ""Peter S...",-1.0,"[""anglais""]",-1.0,0.0
755,yesterday,2019,8a1c9587c99763bda546b60708e00007,https://www.allocine.fr/film/fichefilm_gen_cfi...,danny boyle,grandebretagne,6720,comedie,-1,215264,...,4,678617,398,3.5,3.7,"[""Himesh Patel"", ""Lily James"", ""Ed Sheeran"", ""...",-1.0,"[""anglais""]",151170.0,2.0
1508,le cousin,1997,,,alain corneau,france,6720,thriller,-1,224237,...,4,856606,410,,,,,,,
1413,mensonges et trahisons et plus si affinites,2004,,,laurent tirard,france,5400,comedie,40378,251378,...,3,743201,265,,,,,,,
208,soyez sympas rembobinez,2008,68c6505e04cd095f8c708c36a0393455,https://www.allocine.fr/film/fichefilm_gen_cfi...,michel gondry,etatsunis,6060,comedie,20821,146496,...,6,505993,190,3.8,3.6,"[""Jack Black"", ""Yasiin Bey"", ""Danny Glover"", ""...",20000000.0,"[""anglais""]",119181.0,0.0


# Double Check 

In [24]:
df.isna().sum()

title                0
year                 0
id_jp              721
url_allo           721
director             0
country              0
duration             0
genre                0
first_day            0
first_week           0
first_weekend        0
hebdo_rank           0
total_spectator      0
copies               0
rating_press       721
rating_public      721
casting            721
budget             721
lang               721
visa               721
award              721
dtype: int64

# Duplicates

In [25]:
df.duplicated().sum()

0

# Outliers 

In [27]:
df.describe()

Unnamed: 0,year,duration,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,rating_press,rating_public,budget,visa,award
count,2366.0,2366.0,2366.0,2366.0,2366.0,2366.0,2366.0,2366.0,1645.0,1645.0,1645.0,1645.0,1645.0
mean,2006.919696,6806.246407,59769.836433,397269.3,271466.8,5.365173,1173218.0,345.66399,2.591489,3.167052,119440900.0,2553577.0,1.071733
std,8.240485,1189.719189,99650.571608,498231.6,422361.3,4.39712,1694421.0,215.939483,1.665822,0.708522,1902005000.0,70339900.0,2.964463
min,1986.0,-1.0,-1.0,6774.0,-1.0,1.0,8403.0,-1.0,-1.0,1.0,-1.0,-1.0,0.0
25%,2000.0,6000.0,5591.0,108599.5,-1.0,2.0,252642.2,179.25,2.4,2.7,-1.0,89612.0,0.0
50%,2007.0,6600.0,24156.0,223482.0,122160.5,4.0,594794.0,307.0,3.1,3.3,12000000.0,110571.0,0.0
75%,2013.0,7440.0,62103.5,465275.5,328591.8,8.0,1362377.0,488.0,3.7,3.7,55000000.0,133345.0,1.0
max,2024.0,12360.0,850297.0,4378720.0,3586497.0,20.0,20634790.0,1093.0,5.0,4.6,65000000000.0,2023007000.0,54.0


# Binary Encoding 

In [38]:
df.director.unique()

array(['sidney lumet', 'tom shadyac', 'jim sheridan', 'cedric klapisch',
       'james gray', 'andre techine', 'pascal thomas', 'patrice leconte',
       'stephen frears', 'ridley scott', 'woody allen', 'james cameron',
       'harold ramis', 'peter hyams', 'joel schumacher', 'thomas gilou',
       'claude lelouch', 'pierre salvadori', 'pj hogan', 'renny harlin',
       'claude miller', 'brett ratner', 'mathieu kassovitz', 'jj abrams',
       'kenneth branagh', 'brian levant', 'eric lartigau', 'claude berri',
       'alan j pakula', 'jon favreau', 'roger spottiswoode',
       'michael mann', 'john badham', 'rob minkoff', 'barbet schroeder',
       'dennis dugan', 'edgar wright', 'francis veber', 'richard donner',
       'barry levinson', 'robert rodriguez', 'roland emmerich',
       'bertrand tavernier', 'wong karwai', 'shawn levy', 'george miller',
       'jonathan mostow', 'andrew stanton', 'todd phillips',
       'neil jordan', 'jean becker', 'ang lee', 'simon west',
       'oliver 

In [39]:
df.country.unique()


array(['etatsunis', 'france', 'grandebretagne', 'australie', 'irlande',
       'danemark', 'chine', 'canada', 'italie', 'espagne', 'japon',
       'allemagne', 'nouvellezelande', 'belgique', 'mexique', 'suisse',
       'serbie', 'autriche', 'taiwan', 'argentine', 'bresil'],
      dtype=object)

In [40]:
df.genre.unique()

array(['thriller', 'comedie', 'drame', 'aventure action', 'romance',
       'science fiction', 'comicbook', 'film familial', 'guerre',
       'animation', 'comedie dramatique', 'western', 'fantasy',
       'documentaire', 'horreur', 'musical', 'catastrophe', 'peplum',
       'x erotique', 'courtmetrage'], dtype=object)

In [43]:
df.lang.unique()

array(['["anglais"]', '["anglais", "allemand"]', '["francais"]',
       '["danois", "anglais", "espagnol"]',
       '["arabic", "anglais", "francais", "allemand"]',
       '["cantonais", "anglais", "japonais"]',
       '["anglais", "francais", "russe"]', '["cantonais", "anglais"]',
       '[""]', '["anglais", "espagnol"]',
       '["anglais", "francais", "grec", "italien"]',
       '["francais", "allemand"]', '["anglais", "mandarin"]',
       '["anglais", "hebreu"]', '["anglais", "francais"]',
       '["arabic", "anglais", "allemand", "turc"]',
       '["anglais", "francais", "allemand"]',
       '["anglais", "francais", "hindi", "portugais", "russe"]',
       '["cantonais", "anglais", "mandarin"]',
       '["cantonais", "anglais", "japonais", "mandarin"]',
       '["anglais", "russe", "turc", "ourdou"]', '["arabic", "francais"]',
       '["anglais", "francais", "islandais", "russe"]',
       '["cantonais", "japonais", "mandarin"]',
       '["anglais", "francais", "allemand", "italien"

In [50]:
df.award.nlargest(10)

1433    54.0
205     26.0
504     25.0
1067    23.0
622     22.0
2110    21.0
1752    19.0
2245    19.0
114     18.0
447     17.0
Name: award, dtype: float64

In [49]:
df.budget.nlargest(10)

1371    6.500000e+10
1410    4.090300e+10
223     4.187000e+09
1799    3.043000e+09
509     2.415000e+09
2017    2.384000e+09
519     2.175000e+09
1639    1.674000e+09
22      1.606000e+09
1310    1.575000e+09
Name: budget, dtype: float64

In [51]:
df.award.nsmallest(10)

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
7     0.0
8     0.0
9     0.0
10    0.0
Name: award, dtype: float64

In [52]:
df.budget.nsmallest(10)

5    -1.0
6    -1.0
7    -1.0
12   -1.0
15   -1.0
19   -1.0
20   -1.0
26   -1.0
27   -1.0
32   -1.0
Name: budget, dtype: float64