In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from vacances_scolaires_france import SchoolHolidayDates
import datetime
pd.set_option('display.max_columns', 500)



# df = pd.read_csv('../films_db2.csv')
# df.head(10)

In [2]:
from db.database_mysql import engine

df = pd.read_sql_query('''SELECT jp.title, jp.date, jp.`year`, jp.director, jp.country, jp.duration, jp.genre, jp.first_day, jp.first_week, jp.first_weekend, jp.hebdo_rank, jp.total_spectator, jp.copies, fa1.distributor,fa1.rating_press, fa1.rating_public, fa1.casting, fa1.budget, fa1.lang, fa1.visa, fa1.award 
FROM films_jp as jp
LEFT JOIN films_allo fa1 ON fa1.id_jp = jp.id                     
  where fa1.year_allo is not null and fa1.year_allo != -1
  group by fa1.id_jp, jp.`year`, jp.director''', engine)

df.head()

Unnamed: 0,title,date,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award
0,rien ne va plus,1997-10-15,1997,claude chabrol,france,6300,thriller,-1,172230,-1,3,458125,234,Carlotta Films,-1.0,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",-1,"[""francais"", ""hongrois""]",90016,0
1,la fille sur le pont,1999-03-31,1999,patrice leconte,france,5400,drame,-1,154881,-1,5,663390,198,,3.4,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",-1,"[""francais""]",90876,1
2,grosse fatigue,1994-05-18,1994,michel blanc,france,5220,comedie,-1,417021,-1,2,2015230,189,,-1.0,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",-1,"[""francais""]",82266,1
3,braveheart,1995-10-04,1995,mel gibson,etatsunis,9900,aventure action,-1,355642,-1,3,1231534,302,ufd,2.6,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",-1,"[""anglais""]",88091,9
4,soleil trompeur,1994-08-31,1994,,russie,9120,comedie dramatique,-1,101953,-1,7,529790,129,,-1.0,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",-1,"[""francais"", ""russe""]",-1,0


In [3]:
df.isnull().sum()

title                 0
date                  0
year                  0
director           1923
country               0
duration              0
genre                 0
first_day             0
first_week            0
first_weekend         0
hebdo_rank            0
total_spectator       0
copies                0
distributor           0
rating_press          0
rating_public         0
casting               0
budget                0
lang                  0
visa                  0
award                 0
dtype: int64

In [4]:
# df = df.dropna()

In [5]:
df.nunique()

title              3630
date               1482
year                 34
director            351
country              33
duration            119
genre                18
first_day          3031
first_week         3611
first_weekend      2856
hebdo_rank           15
total_spectator    3624
copies              757
distributor         155
rating_press         41
rating_public        40
casting            3290
budget              212
lang                223
visa               2964
award                27
dtype: int64

In [6]:
realisateurs_counts = df['director'].value_counts()

realisateurs_counts_df = pd.DataFrame(realisateurs_counts)
realisateurs_counts_df.reset_index(inplace=True)
realisateurs_counts_df.columns = ['director', 'Nombre de films']

realisateurs_counts_df


Unnamed: 0,director,Nombre de films
0,woody allen,26
1,clint eastwood,21
2,steven spielberg,21
3,ridley scott,15
4,patrice leconte,14
...,...,...
346,norman jewison,1
347,frank darabont,1
348,edouard molinaro,1
349,john badham,1


In [7]:
spectateurs_par_realisateur = df.groupby('director')['total_spectator'].sum().reset_index()


spectateurs_par_realisateur


Unnamed: 0,director,total_spectator
0,adrian lyne,1945672
1,agnes jaoui,7470111
2,alain berberian,9493936
3,alain chabat,26460119
4,alain corneau,3274536
...,...,...
346,yvan attal,3757895
347,yves robert,298266
348,zabou breitman,1934361
349,zack snyder,10537603


In [8]:
merged_df = pd.merge(realisateurs_counts_df, spectateurs_par_realisateur, left_on='director', right_on='director', how='left')

merged_df['director_score'] = df['first_week'] / realisateurs_counts_df['Nombre de films']

merged_df


Unnamed: 0,director,Nombre de films,total_spectator,director_score
0,woody allen,26,25511713,6.624231e+03
1,clint eastwood,21,31794304,7.375286e+03
2,steven spielberg,21,52226817,1.985814e+04
3,ridley scott,15,16621524,2.370947e+04
4,patrice leconte,14,19739165,7.282357e+03
...,...,...,...,...
346,norman jewison,1,353216,4.631110e+05
347,frank darabont,1,1740897,2.537450e+06
348,edouard molinaro,1,1935708,2.763380e+05
349,john badham,1,437221,1.862230e+05


In [9]:
df = pd.merge(df, merged_df[['director', 'director_score']], on='director', how='left')

# Afficher les premières lignes du DataFrame fusionné pour vérification
df

Unnamed: 0,title,date,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score
0,rien ne va plus,1997-10-15,1997,claude chabrol,france,6300,thriller,-1,172230,-1,3,458125,234,Carlotta Films,-1.0,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",-1,"[""francais"", ""hongrois""]",90016,0,16565.500000
1,la fille sur le pont,1999-03-31,1999,patrice leconte,france,5400,drame,-1,154881,-1,5,663390,198,,3.4,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",-1,"[""francais""]",90876,1,7282.357143
2,grosse fatigue,1994-05-18,1994,michel blanc,france,5220,comedie,-1,417021,-1,2,2015230,189,,-1.0,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",-1,"[""francais""]",82266,1,32631.750000
3,braveheart,1995-10-04,1995,mel gibson,etatsunis,9900,aventure action,-1,355642,-1,3,1231534,302,ufd,2.6,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",-1,"[""anglais""]",88091,9,158501.500000
4,soleil trompeur,1994-08-31,1994,,russie,9120,comedie dramatique,-1,101953,-1,7,529790,129,,-1.0,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",-1,"[""francais"", ""russe""]",-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3627,george de la jungle,1997-10-15,1997,,etatsunis,5520,film familial,-1,129374,-1,4,587914,248,,-1.0,2.1,"[""Brendan Fraser"", ""Leslie Mann"", ""Richard Rou...",-1,"[""anglais"", ""espagnol""]",93126,0,
3628,terrain mine,1994-04-06,1994,,etatsunis,6060,aventure action,-1,186581,-1,4,382000,264,warner bros france,-1.0,2.1,"[""Fran Monegan"", ""Gabriel L. Muktoyuk"", ""Helen...",-1,"[""anglais""]",85264,0,
3629,rasta rockett,1994-04-13,1993,jon turteltaub,etatsunis,6480,film familial,-1,227393,-1,3,2523167,123,gaumont buena vista international gbvi,3.0,3.8,"[""Leon Robinson"", ""Doug E. Doug"", ""John Candy""...",-1,"[""anglais"", ""allemand"", ""russe""]",84550,0,49496.111111
3630,be happy,2008-08-27,2008,mike leigh,grandebretagne,7080,comedie,18126,110442,90372,9,362884,128,mk2 diffusion,3.9,2.7,"[""Sally Hawkins"", ""Alexis Zegerman"", ""Andrea R...",-1,"[""anglais""]",121057,1,123983.750000


In [10]:
df['entree_annee'] = df['year']


def classify_entrees_year(entrees):
    if entrees == 1986:
        return 168.1
    elif entrees == 1992:
        return 116.0
    elif entrees == 1993:
        return 132.7
    elif entrees == 1994:
        return 124.4
    elif entrees == 1995:
        return 130.2
    elif entrees == 1996:
        return 136.7
    elif entrees == 1997:
        return 149.3
    elif entrees == 1998:
        return 170.6
    elif entrees == 1999:
        return 153.6
    elif entrees == 2000:
        return 165.8
    elif entrees == 2001:
        return 187.5
    elif entrees == 2002:
        return 184.4
    elif entrees == 2003:
        return 173.5
    elif entrees == 2004:
        return 195.8
    elif entrees == 2005:
        return 175.6
    elif entrees == 2006:
        return 188.8
    elif entrees == 2007:
        return 178.5
    elif entrees == 2008:
        return 190.3
    elif entrees == 2009:
        return 201.6
    elif entrees == 2010:
        return 207.1
    elif entrees == 2011:
        return 217.2
    elif entrees == 2012:
        return 203.6
    elif entrees == 2013:
        return 193.7
    elif entrees == 2014:
        return 209.1
    elif entrees == 2015:
        return 205.4
    elif entrees == 2016:
        return 213.2
    elif entrees == 2017:
        return 209.4
    elif entrees == 2018:
        return 201.2
    elif entrees == 2019:
        return 213.2
    elif entrees == 2020:
        return 65.3
    elif entrees == 2021:
        return 95.5
    elif entrees == 2022:
        return 	152.0
    elif entrees == 2023 or 2024:
        return 180.8



df['entree_annee'] = df['entree_annee'].apply(classify_entrees_year)

df

Unnamed: 0,title,date,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score,entree_annee
0,rien ne va plus,1997-10-15,1997,claude chabrol,france,6300,thriller,-1,172230,-1,3,458125,234,Carlotta Films,-1.0,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",-1,"[""francais"", ""hongrois""]",90016,0,16565.500000,149.3
1,la fille sur le pont,1999-03-31,1999,patrice leconte,france,5400,drame,-1,154881,-1,5,663390,198,,3.4,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",-1,"[""francais""]",90876,1,7282.357143,153.6
2,grosse fatigue,1994-05-18,1994,michel blanc,france,5220,comedie,-1,417021,-1,2,2015230,189,,-1.0,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",-1,"[""francais""]",82266,1,32631.750000,124.4
3,braveheart,1995-10-04,1995,mel gibson,etatsunis,9900,aventure action,-1,355642,-1,3,1231534,302,ufd,2.6,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",-1,"[""anglais""]",88091,9,158501.500000,130.2
4,soleil trompeur,1994-08-31,1994,,russie,9120,comedie dramatique,-1,101953,-1,7,529790,129,,-1.0,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",-1,"[""francais"", ""russe""]",-1,0,,124.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3627,george de la jungle,1997-10-15,1997,,etatsunis,5520,film familial,-1,129374,-1,4,587914,248,,-1.0,2.1,"[""Brendan Fraser"", ""Leslie Mann"", ""Richard Rou...",-1,"[""anglais"", ""espagnol""]",93126,0,,149.3
3628,terrain mine,1994-04-06,1994,,etatsunis,6060,aventure action,-1,186581,-1,4,382000,264,warner bros france,-1.0,2.1,"[""Fran Monegan"", ""Gabriel L. Muktoyuk"", ""Helen...",-1,"[""anglais""]",85264,0,,124.4
3629,rasta rockett,1994-04-13,1993,jon turteltaub,etatsunis,6480,film familial,-1,227393,-1,3,2523167,123,gaumont buena vista international gbvi,3.0,3.8,"[""Leon Robinson"", ""Doug E. Doug"", ""John Candy""...",-1,"[""anglais"", ""allemand"", ""russe""]",84550,0,49496.111111,132.7
3630,be happy,2008-08-27,2008,mike leigh,grandebretagne,7080,comedie,18126,110442,90372,9,362884,128,mk2 diffusion,3.9,2.7,"[""Sally Hawkins"", ""Alexis Zegerman"", ""Andrea R...",-1,"[""anglais""]",121057,1,123983.750000,190.3


In [11]:
# import json

# test = df['casting'].unique()[0]

# json.loads(test)

# df["casting"] = df["casting"].apply(json.loads)

In [12]:
acteurs_tres_connus = [
    "Tom Hanks", "Meryl Streep", "Leonardo DiCaprio", "Jennifer Lawrence", "Denzel Washington",
    "Cate Blanchett", "Brad Pitt", "Angelina Jolie", "Johnny Depp", "Julia Roberts",
    "Robert Downey Jr.", "Scarlett Johansson", "Will Smith", "Natalie Portman", "George Clooney",
    "Charlize Theron", "Matt Damon", "Emma Stone", "Nicole Kidman", "Christian Bale",
    "Tom Cruise", "Helen Mirren", "Daniel Day-Lewis", "Sandra Bullock", "Jake Gyllenhaal",
    "Kate Winslet", "Morgan Freeman", "Anne Hathaway", "Sean Penn", "Kate Hudson",
    "Liam Neeson", "Viola Davis", "Mark Wahlberg", "Halle Berry", "Hugh Jackman",
    "Jessica Chastain", "Chris Hemsworth", "Judi Dench", "Ryan Gosling", "Emily Blunt",
    "Eddie Redmayne", "Marion Cotillard", "James Franco", "Amy Adams", "Colin Firth",
    "Reese Witherspoon", "Jude Law", "Gwyneth Paltrow", "Ryan Reynolds", "Julianne Moore",
    "Christoph Waltz", "Rachel McAdams", "Michael Fassbender", "Anne Bancroft", "Javier Bardem",
    "Robin Williams", "Penélope Cruz", "Joaquin Phoenix", "Michelle Williams", "Mark Ruffalo",
    "Naomi Watts", "Steve Carell", "Carey Mulligan", "Kevin Spacey", "Emma Watson",
    "Woody Harrelson", "Diane Keaton", "Anthony Hopkins", "Kristen Stewart", "Jamie Foxx",
    "Alicia Vikander", "Jeremy Renner", "Maggie Smith", "Seth Rogen", "Jodie Foster",
    "Daniel Radcliffe", "Saoirse Ronan", "Ralph Fiennes", "Michelle Pfeiffer", "Jeff Bridges",
    "Salma Hayek", "Antonio Banderas", "Uma Thurman", "Donald Sutherland", "Tom Hardy",
    "Kirsten Dunst", "John Travolta", "Anne Hathaway", "Gary Oldman", "Sigourney Weaver",
    "Joaquin Phoenix", "Catherine Zeta-Jones", "Emma Thompson", "Paul Rudd", "Diane Lane",
    "John Malkovich", "Eva Green", "Colin Farrell", "Angelina Jolie", "Bradley Cooper"
]

acteurs_moins_connus = [
    "Bill Skarsgård", "Florence Pugh", "Timothée Chalamet", "Elle Fanning", "Robert Pattinson",
    "Amanda Seyfried", "Dev Patel", "Eiza González", "John David Washington", "Lily James",
    "Domhnall Gleeson", "Gemma Chan", "Nicholas Hoult", "Ana de Armas", "Timothee Chalamet",
    "Saoirse Ronan", "Anya Taylor-Joy", "Shia LaBeouf", "Mackenzie Davis", "Jessie Buckley",
    "Callum Turner", "Thomasin McKenzie", "Kelvin Harrison Jr.", "Henry Golding", "Kaitlyn Dever",
    "Billy Magnussen", "Amandla Stenberg", "Tessa Thompson", "Beanie Feldstein", "Lakeith Stanfield",
    "Olivia Cooke", "Billie Lourd", "Noah Centineo", "Yara Shahidi", "Margaret Qualley", "Riz Ahmed",
    "Letitia Wright", "Jack Lowden", "Kelvin Harrison Jr.", "Naomi Scott", "Barry Keoghan",
    "Taylor Russell", "Timothée Chalamet", "Elsie Fisher", "Joey King", "Awkwafina", "Olivia Cooke",
    "Caleb Landry Jones", "Thomasin McKenzie", "Winston Duke", "Aja Naomi King", "Noah Jupe",
    "Sophia Lillis", "Lucas Hedges", "Zendaya", "Billie Eilish", "Harris Dickinson", "Katherine Langford",
    "Nick Robinson", "Millie Bobby Brown", "Beanie Feldstein", "Noah Centineo", "Finn Wolfhard",
    "Zoey Deutch", "Anthony Ramos", "Sydney Sweeney", "Keeley Hawes", "Ross Lynch", "Euphoria",
    "Sarah Gadon", "Noah Schnapp", "Jacob Elordi", "Liana Liberato", "Shameik Moore", "Elle Fanning",
    "Kaitlyn Dever", "Sadie Sink", "Lia Marie Johnson", "Fionn Whitehead", "Katherine Waterston",
    "Nicholas Hoult", "Odeya Rush", "Kiernan Shipka", "Lucas Till", "Halston Sage", "Tom Holland",
    "Maude Apatow", "Thomas Mann", "Julia Garner", "Charlie Plummer", "Bella Thorne", "Cameron Monaghan",
    "Elle Fanning", "Jack Dylan Grazer", "Beulah Koale", "Harris Dickinson", "Katherine Langford",
    "Thomasin McKenzie", "Eddie Redmayne", "Gaten Matarazzo"
]

acteurs_fr = [
    "Jean Dujardin",
    "Marion Cotillard",
    "Vincent Cassel",
    "Audrey Tautou",
    "Omar Sy",
    "Juliette Binoche",
    "Guillaume Canet",
    "Catherine Deneuve",
    "Gérard Depardieu",
    "Isabelle Huppert",
    "Mélanie Laurent",
    "Louis de Funès",
    "Emmanuelle Béart",
    "François Cluzet",
    "Léa Seydoux",
    "Jean Reno",
    "Sophie Marceau",
    "Mathieu Amalric",
    "Vincent Lindon",
    "Bérénice Bejo",
    "Kad Merad",
    "Monica Bellucci",
    "Romain Duris",
    "Sandrine Kiberlain",
    "André Dussollier",
    "Audrey Fleurot",
    "Dany Boon",
    "Kristin Scott Thomas",
    "Fabrice Luchini",
    "Charlotte Gainsbourg",
    "Jean-Paul Belmondo",
    "Mélanie Doutey",
    "François Berléand",
    "Emmanuelle Devos",
    "Jean-Pierre Bacri",
    "Emmanuelle Seigner",
    "Lambert Wilson",
    "Valérie Lemercier",
    "Karin Viard",
    "Michel Blanc",
    "Isabelle Carré",
    "Vincent Perez",
    "Ludivine Sagnier",
    "Jean-Hugues Anglade",
    "Marina Foïs",
    "Gilles Lellouche",
    "Sara Forestier",
    "Daniel Auteuil",
    "Chiara Mastroianni",
    "Josiane Balasko"
]


In [13]:
# def classifier_acteurs(acteurs):
#     # print(type(acteurs))
#     result = 0
#     for acteur in acteurs:
#         # print(acteur,acteurs)
#         if acteur in acteurs_tres_connus:
#             result += 10
#         elif acteur in acteurs_fr:
#             result += 7
#         elif acteur in acteurs_moins_connus:
#             result += 5
#         else :
#             result += 0
#     return result
def classifier_acteurs(acteurs):
    for acteur in acteurs:
        if acteur in acteurs_tres_connus:
            return 3
        elif acteur in acteurs_fr:
            return 2
        elif acteur in acteurs_moins_connus:
            return 1
        else :
            return 0

df['classification_acteurs'] = df['casting'].apply(lambda x: classifier_acteurs(eval(x)))

df


Unnamed: 0,title,date,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score,entree_annee,classification_acteurs
0,rien ne va plus,1997-10-15,1997,claude chabrol,france,6300,thriller,-1,172230,-1,3,458125,234,Carlotta Films,-1.0,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",-1,"[""francais"", ""hongrois""]",90016,0,16565.500000,149.3,0.0
1,la fille sur le pont,1999-03-31,1999,patrice leconte,france,5400,drame,-1,154881,-1,5,663390,198,,3.4,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",-1,"[""francais""]",90876,1,7282.357143,153.6,0.0
2,grosse fatigue,1994-05-18,1994,michel blanc,france,5220,comedie,-1,417021,-1,2,2015230,189,,-1.0,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",-1,"[""francais""]",82266,1,32631.750000,124.4,2.0
3,braveheart,1995-10-04,1995,mel gibson,etatsunis,9900,aventure action,-1,355642,-1,3,1231534,302,ufd,2.6,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",-1,"[""anglais""]",88091,9,158501.500000,130.2,0.0
4,soleil trompeur,1994-08-31,1994,,russie,9120,comedie dramatique,-1,101953,-1,7,529790,129,,-1.0,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",-1,"[""francais"", ""russe""]",-1,0,,124.4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3627,george de la jungle,1997-10-15,1997,,etatsunis,5520,film familial,-1,129374,-1,4,587914,248,,-1.0,2.1,"[""Brendan Fraser"", ""Leslie Mann"", ""Richard Rou...",-1,"[""anglais"", ""espagnol""]",93126,0,,149.3,0.0
3628,terrain mine,1994-04-06,1994,,etatsunis,6060,aventure action,-1,186581,-1,4,382000,264,warner bros france,-1.0,2.1,"[""Fran Monegan"", ""Gabriel L. Muktoyuk"", ""Helen...",-1,"[""anglais""]",85264,0,,124.4,0.0
3629,rasta rockett,1994-04-13,1993,jon turteltaub,etatsunis,6480,film familial,-1,227393,-1,3,2523167,123,gaumont buena vista international gbvi,3.0,3.8,"[""Leon Robinson"", ""Doug E. Doug"", ""John Candy""...",-1,"[""anglais"", ""allemand"", ""russe""]",84550,0,49496.111111,132.7,0.0
3630,be happy,2008-08-27,2008,mike leigh,grandebretagne,7080,comedie,18126,110442,90372,9,362884,128,mk2 diffusion,3.9,2.7,"[""Sally Hawkins"", ""Alexis Zegerman"", ""Andrea R...",-1,"[""anglais""]",121057,1,123983.750000,190.3,0.0


In [14]:
def classifier_pays(pays):
    if pays == 'etatsunis':
        return 3
    elif pays == 'france':
        return 2
    else :
        return 0

df['classification_country'] = df['country'].apply(lambda x: classifier_pays(x))

df

Unnamed: 0,title,date,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score,entree_annee,classification_acteurs,classification_country
0,rien ne va plus,1997-10-15,1997,claude chabrol,france,6300,thriller,-1,172230,-1,3,458125,234,Carlotta Films,-1.0,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",-1,"[""francais"", ""hongrois""]",90016,0,16565.500000,149.3,0.0,2
1,la fille sur le pont,1999-03-31,1999,patrice leconte,france,5400,drame,-1,154881,-1,5,663390,198,,3.4,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",-1,"[""francais""]",90876,1,7282.357143,153.6,0.0,2
2,grosse fatigue,1994-05-18,1994,michel blanc,france,5220,comedie,-1,417021,-1,2,2015230,189,,-1.0,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",-1,"[""francais""]",82266,1,32631.750000,124.4,2.0,2
3,braveheart,1995-10-04,1995,mel gibson,etatsunis,9900,aventure action,-1,355642,-1,3,1231534,302,ufd,2.6,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",-1,"[""anglais""]",88091,9,158501.500000,130.2,0.0,3
4,soleil trompeur,1994-08-31,1994,,russie,9120,comedie dramatique,-1,101953,-1,7,529790,129,,-1.0,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",-1,"[""francais"", ""russe""]",-1,0,,124.4,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3627,george de la jungle,1997-10-15,1997,,etatsunis,5520,film familial,-1,129374,-1,4,587914,248,,-1.0,2.1,"[""Brendan Fraser"", ""Leslie Mann"", ""Richard Rou...",-1,"[""anglais"", ""espagnol""]",93126,0,,149.3,0.0,3
3628,terrain mine,1994-04-06,1994,,etatsunis,6060,aventure action,-1,186581,-1,4,382000,264,warner bros france,-1.0,2.1,"[""Fran Monegan"", ""Gabriel L. Muktoyuk"", ""Helen...",-1,"[""anglais""]",85264,0,,124.4,0.0,3
3629,rasta rockett,1994-04-13,1993,jon turteltaub,etatsunis,6480,film familial,-1,227393,-1,3,2523167,123,gaumont buena vista international gbvi,3.0,3.8,"[""Leon Robinson"", ""Doug E. Doug"", ""John Candy""...",-1,"[""anglais"", ""allemand"", ""russe""]",84550,0,49496.111111,132.7,0.0,3
3630,be happy,2008-08-27,2008,mike leigh,grandebretagne,7080,comedie,18126,110442,90372,9,362884,128,mk2 diffusion,3.9,2.7,"[""Sally Hawkins"", ""Alexis Zegerman"", ""Andrea R...",-1,"[""anglais""]",121057,1,123983.750000,190.3,0.0,0


In [15]:
# Studios très connus
distributeurs_tres_connus = [
    'United International Pictures (UIP)',
    'Gaumont Buena Vista International (GBVI)',
    'Sony Pictures Releasing France',
    'Warner Bros. France',
    'The Walt Disney Company France',
    'Twentieth Century Fox France',
    'Columbia TriStar Films',
    '20th Century Studios',
    'Universal Pictures International France',
    'Pathé',
    'StudioCanal',
    'EuropaCorp Distribution',
    'Paramount Pictures France',
    'Universal Pictures France (UPF)'
]

# Studios moins connus
distributeurs_moins_connus = [
    'MK2 Diffusion',
    'Gaumont Distribution',
    'PolyGram Film Distribution',
    'Sony Pictures Home Entertainment',
    'Carlotta Films',
    'Bac Films',
    'Tamasa Distribution',
    'Mars Distribution',
    'Diaphana Films',
    'Wild Bunch Distribution',
    'Metropolitan FilmExport',
    'ARP Sélection',
    'SND',
    'Haut et Court',
    'Memento Distribution',
    'Le Pacte',
    'Ad Vitam',
    'Happiness Distribution',
    'Les Films du Losange',
    'Rezo Films',
    'Gebeka Films',
    'Les Films Number One',
    'Mary-X Distribution',
    'ARP / UGC',
    'Colifilms Diffusion',
    'Splendor Films',
    'AMLF',
    'UFD'
]




In [16]:
def classifier_distributeurs(distributeur):
    if distributeur in distributeurs_tres_connus:
        return 1
    elif distributeur in distributeurs_moins_connus:
        return 2
    else:
        return 3

df['distributor'] = df['distributor'].apply(classifier_distributeurs)

df

Unnamed: 0,title,date,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score,entree_annee,classification_acteurs,classification_country
0,rien ne va plus,1997-10-15,1997,claude chabrol,france,6300,thriller,-1,172230,-1,3,458125,234,2,-1.0,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",-1,"[""francais"", ""hongrois""]",90016,0,16565.500000,149.3,0.0,2
1,la fille sur le pont,1999-03-31,1999,patrice leconte,france,5400,drame,-1,154881,-1,5,663390,198,3,3.4,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",-1,"[""francais""]",90876,1,7282.357143,153.6,0.0,2
2,grosse fatigue,1994-05-18,1994,michel blanc,france,5220,comedie,-1,417021,-1,2,2015230,189,3,-1.0,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",-1,"[""francais""]",82266,1,32631.750000,124.4,2.0,2
3,braveheart,1995-10-04,1995,mel gibson,etatsunis,9900,aventure action,-1,355642,-1,3,1231534,302,3,2.6,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",-1,"[""anglais""]",88091,9,158501.500000,130.2,0.0,3
4,soleil trompeur,1994-08-31,1994,,russie,9120,comedie dramatique,-1,101953,-1,7,529790,129,3,-1.0,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",-1,"[""francais"", ""russe""]",-1,0,,124.4,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3627,george de la jungle,1997-10-15,1997,,etatsunis,5520,film familial,-1,129374,-1,4,587914,248,3,-1.0,2.1,"[""Brendan Fraser"", ""Leslie Mann"", ""Richard Rou...",-1,"[""anglais"", ""espagnol""]",93126,0,,149.3,0.0,3
3628,terrain mine,1994-04-06,1994,,etatsunis,6060,aventure action,-1,186581,-1,4,382000,264,3,-1.0,2.1,"[""Fran Monegan"", ""Gabriel L. Muktoyuk"", ""Helen...",-1,"[""anglais""]",85264,0,,124.4,0.0,3
3629,rasta rockett,1994-04-13,1993,jon turteltaub,etatsunis,6480,film familial,-1,227393,-1,3,2523167,123,3,3.0,3.8,"[""Leon Robinson"", ""Doug E. Doug"", ""John Candy""...",-1,"[""anglais"", ""allemand"", ""russe""]",84550,0,49496.111111,132.7,0.0,3
3630,be happy,2008-08-27,2008,mike leigh,grandebretagne,7080,comedie,18126,110442,90372,9,362884,128,3,3.9,2.7,"[""Sally Hawkins"", ""Alexis Zegerman"", ""Andrea R...",-1,"[""anglais""]",121057,1,123983.750000,190.3,0.0,0


In [17]:
df["classification_acteurs"].nunique()

4

In [18]:
df.isnull().sum()

title                        0
date                         0
year                         0
director                  1923
country                      0
duration                     0
genre                        0
first_day                    0
first_week                   0
first_weekend                0
hebdo_rank                   0
total_spectator              0
copies                       0
distributor                  0
rating_press                 0
rating_public                0
casting                      0
budget                       0
lang                         0
visa                         0
award                        0
director_score            1923
entree_annee                 0
classification_acteurs     320
classification_country       0
dtype: int64

In [19]:
colonnes_numeriques = ["first_day", 'entree_annee',"rating_press", "rating_public", "first_weekend",'budget']

df[colonnes_numeriques] = df[colonnes_numeriques].replace(-1, np.nan)

imputer = KNNImputer(n_neighbors=7)

df_imputed = imputer.fit_transform(df[colonnes_numeriques])

df[colonnes_numeriques] = df_imputed

df.head()

Unnamed: 0,title,date,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score,entree_annee,classification_acteurs,classification_country
0,rien ne va plus,1997-10-15,1997,claude chabrol,france,6300,thriller,124740.0,172230,854213.428571,3,458125,234,2,2.8,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",806714300.0,"[""francais"", ""hongrois""]",90016,0,16565.5,149.3,0.0,2
1,la fille sur le pont,1999-03-31,1999,patrice leconte,france,5400,drame,119588.571429,154881,839429.857143,5,663390,198,3,3.4,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",1416571000.0,"[""francais""]",90876,1,7282.357143,153.6,0.0,2
2,grosse fatigue,1994-05-18,1994,michel blanc,france,5220,comedie,102043.714286,417021,324750.428571,2,2015230,189,3,2.871429,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",32071430.0,"[""francais""]",82266,1,32631.75,124.4,2.0,2
3,braveheart,1995-10-04,1995,mel gibson,etatsunis,9900,aventure action,100565.0,355642,482844.714286,3,1231534,302,3,2.6,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",217214300.0,"[""anglais""]",88091,9,158501.5,130.2,0.0,3
4,soleil trompeur,1994-08-31,1994,,russie,9120,comedie dramatique,113030.857143,101953,452696.142857,7,529790,129,3,3.228571,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",32785710.0,"[""francais"", ""russe""]",-1,0,,124.4,0.0,0


In [20]:
df.isnull().sum()

title                        0
date                         0
year                         0
director                  1923
country                      0
duration                     0
genre                        0
first_day                    0
first_week                   0
first_weekend                0
hebdo_rank                   0
total_spectator              0
copies                       0
distributor                  0
rating_press                 0
rating_public                0
casting                      0
budget                       0
lang                         0
visa                         0
award                        0
director_score            1923
entree_annee                 0
classification_acteurs     320
classification_country       0
dtype: int64

In [21]:
df.nunique()

title                     3630
date                      1482
year                        34
director                   351
country                     33
duration                   119
genre                       18
first_day                 3249
first_week                3611
first_weekend             3086
hebdo_rank                  15
total_spectator           3624
copies                     757
distributor                  3
rating_press               159
rating_public               63
casting                   3290
budget                     624
lang                       223
visa                      2964
award                       27
director_score             351
entree_annee                32
classification_acteurs       4
classification_country       3
dtype: int64

In [22]:
df['date'] = pd.to_datetime(df['date'])

df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

df.drop("date", axis=1, inplace=True)
df

Unnamed: 0,title,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score,entree_annee,classification_acteurs,classification_country,day,month
0,rien ne va plus,1997,claude chabrol,france,6300,thriller,124740.000000,172230,854213.428571,3,458125,234,2,2.800000,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",8.067143e+08,"[""francais"", ""hongrois""]",90016,0,16565.500000,149.3,0.0,2,15,10
1,la fille sur le pont,1999,patrice leconte,france,5400,drame,119588.571429,154881,839429.857143,5,663390,198,3,3.400000,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",1.416571e+09,"[""francais""]",90876,1,7282.357143,153.6,0.0,2,31,3
2,grosse fatigue,1994,michel blanc,france,5220,comedie,102043.714286,417021,324750.428571,2,2015230,189,3,2.871429,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",3.207143e+07,"[""francais""]",82266,1,32631.750000,124.4,2.0,2,18,5
3,braveheart,1995,mel gibson,etatsunis,9900,aventure action,100565.000000,355642,482844.714286,3,1231534,302,3,2.600000,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",2.172143e+08,"[""anglais""]",88091,9,158501.500000,130.2,0.0,3,4,10
4,soleil trompeur,1994,,russie,9120,comedie dramatique,113030.857143,101953,452696.142857,7,529790,129,3,3.228571,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",3.278571e+07,"[""francais"", ""russe""]",-1,0,,124.4,0.0,0,31,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3627,george de la jungle,1997,,etatsunis,5520,film familial,133909.000000,129374,654218.142857,4,587914,248,3,3.142857,2.1,"[""Brendan Fraser"", ""Leslie Mann"", ""Richard Rou...",1.167714e+09,"[""anglais"", ""espagnol""]",93126,0,,149.3,0.0,3,15,10
3628,terrain mine,1994,,etatsunis,6060,aventure action,112384.285714,186581,332572.857143,4,382000,264,3,2.871429,2.1,"[""Fran Monegan"", ""Gabriel L. Muktoyuk"", ""Helen...",3.850000e+07,"[""anglais""]",85264,0,,124.4,0.0,3,6,4
3629,rasta rockett,1994,jon turteltaub,etatsunis,6480,film familial,105138.571429,227393,683052.428571,3,2523167,123,3,3.000000,3.8,"[""Leon Robinson"", ""Doug E. Doug"", ""John Candy""...",2.013571e+08,"[""anglais"", ""allemand"", ""russe""]",84550,0,49496.111111,132.7,0.0,3,13,4
3630,be happy,2008,mike leigh,grandebretagne,7080,comedie,18126.000000,110442,90372.000000,9,362884,128,3,3.900000,2.7,"[""Sally Hawkins"", ""Alexis Zegerman"", ""Andrea R...",4.137286e+07,"[""anglais""]",121057,1,123983.750000,190.3,0.0,0,27,8


In [23]:
def classify_director(month):
    if month == 12 or 1 or 2:
        return 'winter'
    elif month == 3 or 4 or 5:
        return 'spring'
    elif month == 6 or 7 or 8:
        return 'summer'
    elif month == 9 or 10 or 11:
        return 'autumn'


df['season'] = df['month'].apply(classify_director)


In [24]:
holiday_dates = SchoolHolidayDates()

df['is_holiday'] = df.apply(lambda row: holiday_dates.is_holiday_for_zone(datetime.date(row['year'], row['month'], row['day']), 'B'), axis=1)
df['is_holiday'] = df['is_holiday'].astype(int)
df

Unnamed: 0,title,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score,entree_annee,classification_acteurs,classification_country,day,month,season,is_holiday
0,rien ne va plus,1997,claude chabrol,france,6300,thriller,124740.000000,172230,854213.428571,3,458125,234,2,2.800000,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",8.067143e+08,"[""francais"", ""hongrois""]",90016,0,16565.500000,149.3,0.0,2,15,10,winter,0
1,la fille sur le pont,1999,patrice leconte,france,5400,drame,119588.571429,154881,839429.857143,5,663390,198,3,3.400000,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",1.416571e+09,"[""francais""]",90876,1,7282.357143,153.6,0.0,2,31,3,winter,0
2,grosse fatigue,1994,michel blanc,france,5220,comedie,102043.714286,417021,324750.428571,2,2015230,189,3,2.871429,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",3.207143e+07,"[""francais""]",82266,1,32631.750000,124.4,2.0,2,18,5,winter,0
3,braveheart,1995,mel gibson,etatsunis,9900,aventure action,100565.000000,355642,482844.714286,3,1231534,302,3,2.600000,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",2.172143e+08,"[""anglais""]",88091,9,158501.500000,130.2,0.0,3,4,10,winter,0
4,soleil trompeur,1994,,russie,9120,comedie dramatique,113030.857143,101953,452696.142857,7,529790,129,3,3.228571,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",3.278571e+07,"[""francais"", ""russe""]",-1,0,,124.4,0.0,0,31,8,winter,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3627,george de la jungle,1997,,etatsunis,5520,film familial,133909.000000,129374,654218.142857,4,587914,248,3,3.142857,2.1,"[""Brendan Fraser"", ""Leslie Mann"", ""Richard Rou...",1.167714e+09,"[""anglais"", ""espagnol""]",93126,0,,149.3,0.0,3,15,10,winter,0
3628,terrain mine,1994,,etatsunis,6060,aventure action,112384.285714,186581,332572.857143,4,382000,264,3,2.871429,2.1,"[""Fran Monegan"", ""Gabriel L. Muktoyuk"", ""Helen...",3.850000e+07,"[""anglais""]",85264,0,,124.4,0.0,3,6,4,winter,0
3629,rasta rockett,1994,jon turteltaub,etatsunis,6480,film familial,105138.571429,227393,683052.428571,3,2523167,123,3,3.000000,3.8,"[""Leon Robinson"", ""Doug E. Doug"", ""John Candy""...",2.013571e+08,"[""anglais"", ""allemand"", ""russe""]",84550,0,49496.111111,132.7,0.0,3,13,4,winter,0
3630,be happy,2008,mike leigh,grandebretagne,7080,comedie,18126.000000,110442,90372.000000,9,362884,128,3,3.900000,2.7,"[""Sally Hawkins"", ""Alexis Zegerman"", ""Andrea R...",4.137286e+07,"[""anglais""]",121057,1,123983.750000,190.3,0.0,0,27,8,winter,1


In [25]:
df.to_csv('Dataset_analyse.csv', index=False)
df

Unnamed: 0,title,year,director,country,duration,genre,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,distributor,rating_press,rating_public,casting,budget,lang,visa,award,director_score,entree_annee,classification_acteurs,classification_country,day,month,season,is_holiday
0,rien ne va plus,1997,claude chabrol,france,6300,thriller,124740.000000,172230,854213.428571,3,458125,234,2,2.800000,3.0,"[""Michel Serrault"", ""Isabelle Huppert"", ""Franç...",8.067143e+08,"[""francais"", ""hongrois""]",90016,0,16565.500000,149.3,0.0,2,15,10,winter,0
1,la fille sur le pont,1999,patrice leconte,france,5400,drame,119588.571429,154881,839429.857143,5,663390,198,3,3.400000,3.6,"[""Vanessa Paradis"", ""Daniel Auteuil"", ""Claude ...",1.416571e+09,"[""francais""]",90876,1,7282.357143,153.6,0.0,2,31,3,winter,0
2,grosse fatigue,1994,michel blanc,france,5220,comedie,102043.714286,417021,324750.428571,2,2015230,189,3,2.871429,2.7,"[""Michel Blanc"", ""Philippe Noiret"", ""Marie-Ann...",3.207143e+07,"[""francais""]",82266,1,32631.750000,124.4,2.0,2,18,5,winter,0
3,braveheart,1995,mel gibson,etatsunis,9900,aventure action,100565.000000,355642,482844.714286,3,1231534,302,3,2.600000,4.2,"[""Mel Gibson"", ""Sophie Marceau"", ""Catherine Mc...",2.172143e+08,"[""anglais""]",88091,9,158501.500000,130.2,0.0,3,4,10,winter,0
4,soleil trompeur,1994,,russie,9120,comedie dramatique,113030.857143,101953,452696.142857,7,529790,129,3,3.228571,3.8,"[""Nikita Mikhalkov"", ""Oleg Menshikov"", ""Ingebo...",3.278571e+07,"[""francais"", ""russe""]",-1,0,,124.4,0.0,0,31,8,winter,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3627,george de la jungle,1997,,etatsunis,5520,film familial,133909.000000,129374,654218.142857,4,587914,248,3,3.142857,2.1,"[""Brendan Fraser"", ""Leslie Mann"", ""Richard Rou...",1.167714e+09,"[""anglais"", ""espagnol""]",93126,0,,149.3,0.0,3,15,10,winter,0
3628,terrain mine,1994,,etatsunis,6060,aventure action,112384.285714,186581,332572.857143,4,382000,264,3,2.871429,2.1,"[""Fran Monegan"", ""Gabriel L. Muktoyuk"", ""Helen...",3.850000e+07,"[""anglais""]",85264,0,,124.4,0.0,3,6,4,winter,0
3629,rasta rockett,1994,jon turteltaub,etatsunis,6480,film familial,105138.571429,227393,683052.428571,3,2523167,123,3,3.000000,3.8,"[""Leon Robinson"", ""Doug E. Doug"", ""John Candy""...",2.013571e+08,"[""anglais"", ""allemand"", ""russe""]",84550,0,49496.111111,132.7,0.0,3,13,4,winter,0
3630,be happy,2008,mike leigh,grandebretagne,7080,comedie,18126.000000,110442,90372.000000,9,362884,128,3,3.900000,2.7,"[""Sally Hawkins"", ""Alexis Zegerman"", ""Andrea R...",4.137286e+07,"[""anglais""]",121057,1,123983.750000,190.3,0.0,0,27,8,winter,1


In [26]:
df.isnull().sum()

title                        0
year                         0
director                  1923
country                      0
duration                     0
genre                        0
first_day                    0
first_week                   0
first_weekend                0
hebdo_rank                   0
total_spectator              0
copies                       0
distributor                  0
rating_press                 0
rating_public                0
casting                      0
budget                       0
lang                         0
visa                         0
award                        0
director_score            1923
entree_annee                 0
classification_acteurs     320
classification_country       0
day                          0
month                        0
season                       0
is_holiday                   0
dtype: int64