In [65]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from modelisation.functions import load_file, save_files
from db.database_mysql import engine

import xgboost 
from sklearn import set_config
set_config(transform_output="pandas")
pd.set_option('display.max_columns', 500)

In [66]:
def set_score(val, q):
  s = q.loc[val >= q.total].tail(1).index
  return s[0] if s.shape[0] > 0 else 0.25

def calculate(df, col_name, use_col_spectator = 'total'):
  q = df.quantile([0.01, .1, .25, .5, .75, .9, .95], numeric_only=True)

  score = df.copy()
  score[f'{col_name}_combined_score'] = df[use_col_spectator].apply(set_score, q=q)

  save_files(
          score[[col_name, f"{col_name}_combined_score"]],
          f"{col_name}_scores",
      )

# DIRECTOR

In [67]:
df = pd.read_sql_query('''SELECT im.director as director, sum(jp.first_week) as week, sum(jp.total_spectator) as total, count(im.id) as nb_film
FROM films_imdb as im
LEFT JOIN films_jp jp ON jp.id = im.id_jp 
where im.id_jp is not null and im.date = jp.date
group by im.director
order by total desc''', engine)

calculate(df, 'director')

# ACTOR

In [68]:
df = pd.read_sql_query('''SELECT actor, sum(jp.first_week) as week, sum(jp.total_spectator) as total, count(im.id) as nb_film
FROM films_imdb as im
join
   JSON_TABLE(
     im.casting,
     "$[*]"
     COLUMNS(
       actor VARCHAR(255) PATH "$"
     )
   ) as aa
LEFT JOIN films_jp jp ON jp.id = im.id_jp 
where im.id_jp is not null and im.date = jp.date

group by actor
order by total desc''', engine)

calculate(df, 'actor')

# DISTRIBUTOR

In [69]:
df = pd.read_sql_query('''SELECT dist, sum(jp.first_week) as week, sum(jp.total_spectator) as total, count(im.id) as nb_film
FROM films_imdb as im
join
   JSON_TABLE(
     im.distributor,
     "$[*]"
     COLUMNS(
       dist VARCHAR(255) PATH "$"
     )
   ) as aa
LEFT JOIN films_jp jp ON jp.id = im.id_jp 
where im.id_jp is not null and im.date = jp.date

group by dist
order by total desc''', engine)

df.rename(columns={"dist": "distributor"}, inplace=True)
calculate(df, 'distributor')

In [70]:
q = load_file('director_scores')

q.director.str.find("florent")

0      -1
1      -1
2      -1
3      -1
4      -1
       ..
1599   -1
1600   -1
1601   -1
1602   -1
1603   -1
Name: director, Length: 1604, dtype: int64