In [83]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import xgboost 
from sklearn import set_config
set_config(transform_output="pandas")
pd.set_option('display.max_columns', 500)

In [84]:
from db.database_mysql import engine

df = pd.read_sql_query('''SELECT jp.raw_title, 
        YEAR(jp.date) AS year, 
        MONTH(jp.date) AS month, 
        DAY(jp.date) AS day, 
        im.director, im.date,im.casting, im.distributor, im.genre, jp.country, jp.duration, jp.first_day, jp.first_week, jp.first_weekend, jp.hebdo_rank, 
jp.total_spectator, jp.copies, im.rating_press, im.budget, im.lang, im.award 
FROM films_jp as jp
LEFT JOIN films_imdb im ON im.id_jp = jp.id 
where im.id_jp is not null and im.date = jp.date
order by jp.first_week desc''', engine)

# CREATION DES SCORES

In [85]:
from modelisation.functions import *

clone = df.copy()
calculate_director_scores(clone)
calculate_distributor_scores(clone)
calculate_actor_scores(clone)
calculate_year_scores(clone)
calculate_country_scores(clone)
print()




In [86]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from modelisation.functions import *

entree_transformer = FunctionTransformer(classify_entrees_year, kw_args={'column' : 'year'})
season_transformer = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
month_transformer = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
holiday_transformer = FunctionTransformer(is_holiday)
drop_transformer = FunctionTransformer(drop_temp)

pipe_scores = Pipeline([
  ('season_dict' , make_pipeline(season_transformer)),
  ('entree_dict' , make_pipeline(entree_transformer)),
  ('month_dict' , make_pipeline(month_transformer)),
  ('holiday_dict' , make_pipeline(holiday_transformer)),
  #('drop_dict' , make_pipeline(drop_transformer))
])

scores = df.copy()
scores = pipe_scores.fit_transform(scores)
scores.head(1)

Unnamed: 0,raw_title,year,month,day,director,date,casting,distributor,genre,country,duration,first_day,first_week,first_weekend,hebdo_rank,total_spectator,copies,rating_press,budget,lang,award,season,entree_annee,month_name,is_holiday
0,Bienvenue chez les Ch'tis,2008,2,27,"""dany boon""",2008-02-27,"[""kad merad"", ""dany boon"", ""zoe felix""]","[""pathe renn productions"", ""hirsch"", ""les prod...","[""comedie"", ""romantique""]",france,6360,558359,4378720,3586497,1,20489303,793,7.1,11000000,"[""francais""]",5,winter,190.3,february,0


# MODELE

In [87]:
film = pd.read_sql_query('''SELECT 
                              YEAR(date) AS year, 
                              MONTH(date) AS month, 
                              DAY(date) AS day, 
                              director, distributor, casting, copies, duration
                            FROM functionalities_filmscrap
                            LIMIT 1
                         ''', engine)
film

Unnamed: 0,year,month,day,director,distributor,casting,copies,duration
0,2024,4,24,"""david schickele""","[""american film institute afi"", ""bushman co""]","[""paul eyam nzie okpokam"", ""mike slye"", ""elain...",14,4380


In [88]:
import json


def convert_entrees_year(df, column):
  print('convert_entrees_year', df.iloc[0][column])
  scores = load_file('year_scores')
  found = scores.loc[scores[column] == df.iloc[0][column]]
  df['year_combined_score'] = found.iloc[0]['year_combined_score']
  return df
  
def convert_director(df, column):
  print('convert_director',)
  scores = load_file('director_scores')
  try:
    val = []
    for index, row in df.iterrows():
      found = scores.loc[scores[column] == df.iloc[index][column]]
      val.append(found.iloc[0]['director_combined_score'])
  except Exception:
    df['director_combined_score'] = 0
  
  df['director_combined_score'] = pd.Series(val)
  return df

def convert_actor(df, column):
  print('convert_actor')
  scores = load_file('actor_scores')

  
  df['actor_combined_score'] = 0
  
  val = []
  for index, row in df.iterrows():
    df_actors = json.loads(df.iloc[index][column])
    sum = 0
    for actor in df_actors:      
      found = scores.loc[scores['actor'] == actor]
      if found.shape[0] != 0:
        sum += found.iloc[0]['actor_combined_score']

    val.append(sum)

  df['actor_combined_score'] = pd.Series(val)
  return df

def convert_distributor(df, column):
  print('convert_distributor')
  scores = load_file('distributor_scores')

  
  df['distributor_combined_score'] = 0
  
  val = []
  for index, row in df.iterrows():
    df_distributors = json.loads(df.iloc[index][column])
    sum = 0
    for distributor in df_distributors:      
      found = scores.loc[scores['distributor'] == distributor]
      if found.shape[0] != 0:
        sum += found.iloc[0]['distributor_combined_score']

    val.append(sum)

  df['distributor_combined_score'] = pd.Series(val)
  return df


entree_converter = FunctionTransformer(convert_entrees_year, kw_args={'column' : 'year'})
season_converter = FunctionTransformer(classify_season,  kw_args={'column' : 'month'})
month_converter = FunctionTransformer(classify_month_name,  kw_args={'column' : 'month'})
holiday_converter = FunctionTransformer(is_holiday)
director_converter = FunctionTransformer(convert_director, kw_args={'column' : 'director'})
actor_converter = FunctionTransformer(convert_actor, kw_args={'column' : 'casting'})
distributor_converter = FunctionTransformer(convert_distributor, kw_args={'column' : 'distributor'})
drop = FunctionTransformer(drop_temp)

pipe = Pipeline([
  ('season_converter' , make_pipeline(season_converter)),
  ('entree_converter' , make_pipeline(entree_converter)),
  ('month_converter' , make_pipeline(month_converter)),
  ('holiday_converter' , make_pipeline(holiday_converter)),
  ('director_converter' , make_pipeline(director_converter)),
  ('actor_converter' , make_pipeline(actor_converter)),
  ('distributor_converter' , make_pipeline(distributor_converter)),
  #('drop' , make_pipeline(drop)),
])

# t = df.copy()
# pipe.fit_transform(t)['distributor_combined_score']


In [89]:
df_clean = df.copy() #df.drop(columns=['raw_title', 'total_spectator', 'first_day', 'first_weekend', 'hebdo_rank', 'total_spectator', 'rating_press', 'award', 'lang'])

X = df_clean.drop(['first_week'], axis=1)
y = df_clean.first_week

#display(X.info())

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42)

num_col = list(X.select_dtypes(include=[float,int]).columns)
cat_col = list(X.select_dtypes(include=[object]).columns)

onehotscale_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse_output=False))
scale_pipeline = make_pipeline(RobustScaler(with_centering=False))

preprocessing = ColumnTransformer(
    transformers=[
        #('categorical', onehotscale_pipeline, ['country', 'season', 'month_name']),
        ('numerical', scale_pipeline, ['year_combined_score', 'director_combined_score', 'actor_combined_score', 'distributor_combined_score'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

polyscale_pipeline = make_pipeline(PolynomialFeatures(2))

model = make_pipeline(
            pipe,
            make_pipeline(drop),
            #preprocessing,
            #xgboost.XGBRegressor(enable_categorical=True, n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
        )

#model.fit(X_train, y_train)
p = model.transform(X_train)
p.head(5)

#xgboost.plot_importance(model[-1], max_num_features=20)


convert_entrees_year 1995
convert_director
convert_actor


IndexError: single positional indexer is out-of-bounds