In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import xgboost 
from sklearn import set_config
set_config(transform_output="pandas")
pd.set_option('display.max_columns', 500)

In [2]:
from db.database_mysql import engine

df = pd.read_sql_query('''SELECT jp.raw_title, 
        YEAR(jp.date) AS year, 
        MONTH(jp.date) AS month, 
        DAY(jp.date) AS day, 
        im.director, im.date,im.casting, im.distributor, im.genre, jp.country, jp.duration, jp.first_day, jp.first_week, jp.first_weekend, jp.hebdo_rank, 
jp.total_spectator, jp.copies, im.rating_press, im.budget, im.lang, im.award 
FROM films_jp as jp
LEFT JOIN films_imdb im ON im.id_jp = jp.id 
where im.id_jp is not null and im.date = jp.date
order by jp.first_week desc''', engine)

# CREATION DES SCORES

In [3]:
from modelisation.functions import *

calculate_director_scores(df)
calculate_distributor_scores(df)
calculate_actor_scores(df)
calculate_year_scores(df)
calculate_country_scores(df)
print()




In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from modelisation.functions import *

entree_transformer = FunctionTransformer(classify_entrees_year, kw_args={'column' : 'year'})
season_transformer = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
month_transformer = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
holiday_transformer = FunctionTransformer(is_holiday)
drop_transformer = FunctionTransformer(drop_temp)

pipe_scores = Pipeline([
  ('season_dict' , make_pipeline(season_transformer)),
  ('entree_dict' , make_pipeline(entree_transformer)),
  ('month_dict' , make_pipeline(month_transformer)),
  ('holiday_dict' , make_pipeline(holiday_transformer)),
  ('drop_dict' , make_pipeline(drop_transformer))
])


scores = pipe_scores.fit_transform(df)
scores.head(1)

Unnamed: 0,year,date,genre,country,duration,first_week,copies,season,entree_annee,month_name,is_holiday
0,2008,2008-02-27,"[""comedie"", ""romantique""]",france,6360,4378720,793,winter,190.3,february,0


# MODELE

In [5]:
film = pd.read_sql_query('''SELECT 
                              YEAR(date) AS year, 
                              MONTH(date) AS month, 
                              DAY(date) AS day, 
                              director, distributor, casting, copies, duration
                            FROM functionalities_filmscrap
                            LIMIT 1
                         ''', engine)
film

Unnamed: 0,year,month,day,director,distributor,casting,copies,duration
0,2024,4,24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[""paul eyam nzie okpokam"", ""mike slye"", ""elain...",14,4380


In [6]:
load_file('year_scores').head(2)

Unnamed: 0,year,year_combined_score
0,2008,0.83092
1,2006,0.85577


In [7]:
import json


def convert_entrees_year(df, column):
  scores = load_file('year_scores')
  found = scores.loc[scores[column] == df.iloc[0][column]]
  df['year_combined_score'] = found.iloc[0]['year_combined_score']
  return df
  
def convert_director(df, column):
  scores = load_file('director_scores')

  try:
    found = scores.loc[scores[column] == df.iloc[0][column]]
    df['director_combined_score'] = found.iloc[0]['director']
  except Exception:
    df['director_combined_score'] = 0

  return df

def convert_actor(df, column):
  scores = load_file('actor_scores')

  df_actors = json.loads(df.iloc[0][column])
  df['actor_combined_score'] = 0
  
  for actor in df_actors:
    found = scores.loc[scores['actor'] == actor]
    if found.shape[0] != 0:
      df['actor_combined_score'] += found.iloc[0]['actor_combined_score']

  return df

def convert_distributor(df, column):
  scores = load_file('distributor_scores')
  df_distributors = json.loads(df.iloc[0][column])
  df['distributor_combined_score'] = 0
  
  for distributor in df_distributors:
    found = scores.loc[scores['distributor'] == distributor]
    if found.shape[0] != 0:
      df['distributor_combined_score'] += found.iloc[0]['distributor_combined_score']

  return df

entree_converter = FunctionTransformer(convert_entrees_year, kw_args={'column' : 'year'})
season_converter = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
month_converter = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
holiday_converter = FunctionTransformer(is_holiday)
director_converter = FunctionTransformer(convert_director, kw_args={'column' : 'director'})
actor_converter = FunctionTransformer(convert_actor, kw_args={'column' : 'casting'})
distributor_converter = FunctionTransformer(convert_distributor, kw_args={'column' : 'distributor'})

pipe = Pipeline([
  ('season_converter' , make_pipeline(season_converter)),
  ('entree_converter' , make_pipeline(entree_converter)),
  ('month_converter' , make_pipeline(month_converter)),
  ('holiday_converter' , make_pipeline(holiday_converter)),
  ('director_converter' , make_pipeline(director_converter)),
  ('actor_converter' , make_pipeline(actor_converter)),
  ('distributor_converter' , make_pipeline(distributor_converter)),
])

#pipe.fit_transform(film)


In [8]:
#df_clean = drop_temp(df)

X = df.drop(['first_week'], axis=1)
y = df.first_week

display(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42)

num_col = list(X.select_dtypes(include=[float,int]).columns)
cat_col = list(X.select_dtypes(include=[object]).columns)

onehotscale_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse_output=False), RobustScaler(with_centering=False))
scale_pipeline = make_pipeline(RobustScaler(with_centering=False))

preprocessing = ColumnTransformer(
    transformers=[
        ('categorical', onehotscale_pipeline, cat_col),
        ('numerical', scale_pipeline, num_col)]
)

polyscale_pipeline = make_pipeline(PolynomialFeatures(2))

model = make_pipeline(
      preprocessing,
      pipe,
      #xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
    )

model.fit_transform(X_train)#, y_train)

#xgboost.plot_importance(model[-1], max_num_features=20)


Unnamed: 0,raw_title,year,month,day,director,date,casting,distributor,genre,country,duration,first_day,first_weekend,hebdo_rank,total_spectator,copies,rating_press,budget,lang,award,actor_list,season,entree_annee,month_name,is_holiday
0,Bienvenue chez les Ch'tis,2008,2,27,"""dany boon""",2008-02-27,"[""kad merad"", ""dany boon"", ""zoe felix""]","[""pathe renn productions"", ""hirsch"", ""les prod...","[""comedie"", ""romantique""]",france,6360,558359,3586497,1,20489303,793,7.1,11000000,"[""francais""]",5,"[""kad merad"", ""dany boon"", ""zoe felix""]",winter,190.3,february,0
1,Les Bronzés 3: Amis pour la vie,2006,2,1,"""patrice leconte""",2006-02-01,"[""josiane balasko"", ""michel blanc"", ""marieanne...","[""les films christian fechner"", ""tf1 films pro...","[""comedie""]",france,5820,537882,3235559,1,10355930,950,-1.0,35000000,"[""anglais"", ""italien"", ""francais""]",1,"[""josiane balasko"", ""michel blanc"", ""mariean...",winter,188.8,february,0
2,Taxi 2,2000,3,29,"""gerard krawczyk""",2000-03-29,"[""samy naceri"", ""frederic diefenthal"", ""emma w...","[""arp selection"", ""canal"", ""leeloo productions""]","[""action"", ""comedie"", ""policier""]",france,5400,801922,2951255,1,10345901,831,6.5,70000000,"[""francais"", ""japonais"", ""allemand""]",1,"[""samy naceri"", ""frederic diefenthal"", ""emma...",spring,165.8,march,0
3,La Revanche des Sith,2005,5,18,"""george lucas""",2005-05-18,"[""hayden christensen"", ""natalie portman"", ""ewa...","[""lucasfilm"", ""mestiere cinema"", ""pandora films""]","[""action"", ""aventure"", ""fantastique""]",etatsunis,8760,641799,2878764,1,7247809,938,7.6,113000000,"[""anglais""]",29,"[""hayden christensen"", ""natalie portman"", ""e...",spring,175.6,may,0
4,Le Roi Lion (2019),2019,7,17,"""jon favreau""",2019-07-17,"[""donald glover"", ""beyonce"", ""seth rogen""]","[""walt disney pictures"", ""fairview entertainme...","[""animation"", ""aventure"", ""drame""]",etatsunis,7080,630478,2559370,1,10017995,680,6.8,260000000,"[""anglais"", ""espagnol""]",21,"[""donald glover"", ""beyonce"", ""seth rogen""]",summer,213.2,july,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2668,Simon Werner a disparu,2010,9,22,"""fabrice gobert""",2010-09-22,"[""jules pelissier"", ""ana girardot"", ""audrey ba...","[""247 films"", ""canal"", ""cinecinema""]",,france,5580,6351,40282,15,79945,93,6.4,-1,"[""francais""]",3,"[""jules pelissier"", ""ana girardot"", ""audrey ...",autumn,207.1,september,0
2669,Innocents The Dreamers,2003,12,10,"""bernardo bertolucci""",2003-12-10,"[""michael pitt"", ""louis garrel"", ""eva green""]","[""recorded picture company rpc"", ""fiction"", ""p...",,france,6960,7577,-1,13,79849,-1,7.1,15000000,"[""anglais"", ""francais""]",2,"[""michael pitt"", ""louis garrel"", ""eva green""]",winter,173.5,december,0
2670,La Grande Bellezza,2013,5,22,"""paolo sorrentino""",2013-05-22,"[""toni servillo"", ""carlo verdone"", ""sabrina fe...","[""indigo film"", ""medusa film"", ""babe film""]","[""drame""]",italie,8460,4654,36844,13,198373,81,7.7,9200000,"[""italien"", ""japonais"", ""espagnol"", ""chinois""]",60,"[""toni servillo"", ""carlo verdone"", ""sabrina ...",spring,193.7,may,0
2671,Un baiser papillon,2011,6,1,"""karine silla""",2011-06-01,"[""valeria golino"", ""elsa zylberstein"", ""vincen...","[""europacorp"", ""grive productions"", ""france 2 ...",,france,6060,-1,38189,14,47715,170,5.4,3500000,"[""francais""]",2,"[""valeria golino"", ""elsa zylberstein"", ""vinc...",summer,217.2,june,0


TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['list']