## Ideias

- Discretizar as colunas `orcamento`, `popularidade`, `receita` e `duracao`
- Criar coluna de ano (e, talvez, de mês) a partir de `data_de_estreia`
- Normalizar os valores

In [1]:
import pandas as pd

In [2]:
# Load the sampled movies dataset (path is relative to the notebook's working directory).
df_movies = pd.read_csv('datasets/movies_amostra.csv')
# Preview two rows; the fixed random_state keeps the preview reproducible across runs.
df_movies.sample(2, random_state=1)

Unnamed: 0,id,titulo,adulto,orcamento,idioma_original,popularidade,data_de_estreia,resumo,receita,duracao,genero,ator_1,ator_2,ator_3,ator_4,ator_5,dirigido_por,escrito_por_1,escrito_por_2,historia_original
1957,40688,Meet the Deedles,False,24000000,en,13.0,1998-03-27,Two surfers end up as Yellowstone park rangers...,4562146.0,93.0,Comedy,Paul Walker,John Ashton,Dennis Hopper,Eric Braeden,Richard Lineback,Steve Boyum,,,
2087,9267,And Now for Something Completely Different,False,0,en,16.0,1971-09-28,And Now for Something Completely Different is ...,0.0,88.0,Comedy,Graham Chapman,John Cleese,Terry Gilliam,Eric Idle,Terry Jones,Ian MacNaughton,Graham Chapman,John Cleese,


## Modificações no DF

In [3]:
from typing import Tuple, Dict 
from datetime import datetime
from sklearn import preprocessing, compose

def adjust_df_treino(df: pd.DataFrame, num_bins: int = 30) -> Tuple[pd.DataFrame, Dict[str, object]]:
    """Discretise the numeric columns of the training frame and return the fitted parameters.

    The columns ``orcamento``, ``popularidade``, ``receita`` and ``duracao`` are
    each replaced by equal-width interval categories computed with ``pd.cut``.
    The bin edges are returned so the same cut points can be reused on other data.

    Several steps sketched for this function are currently disabled and are
    therefore NOT performed here: one-hot encoding of ``idioma_original``,
    extraction of the release year, dropping ``data_de_estreia`` /
    ``historia_original`` and balancing the ``genero`` classes. As a
    consequence ``parameters['idioma_encoder']`` is always ``None`` and callers
    must be prepared for that.

    Args:
        df: Training data. It is not modified; a copy is transformed.
        num_bins: Number of equal-width bins per column (default 30, the value
            previously hard-coded).

    Returns:
        A tuple ``(df_copy, parameters)`` where ``parameters`` has the keys
        ``'bins'`` (column name -> array of ``num_bins + 1`` bin edges) and
        ``'idioma_encoder'`` (``None`` while the encoding step is disabled).

    Raises:
        KeyError: If one of the columns to discretise is missing from ``df``.
    """
    df_copy = df.copy()

    # Bin edges learned on the training data, keyed by column name.
    bins = {}

    # Language one-hot encoding is disabled for now, so no encoder is fitted.
    idioma_encoder = None

    # Discretisation.
    # NOTE: discretising popularity may not be the best approach.
    for column in ("orcamento", "popularidade", "receita", "duracao"):
        df_copy[column], bins[column] = pd.cut(df_copy[column], num_bins, retbins=True)

    parameters = {
        'bins': bins,
        'idioma_encoder': idioma_encoder
    }

    return df_copy, parameters

In [4]:
def adjust_df(df: pd.DataFrame, parameters: Dict[str, object]) -> pd.DataFrame:
    """Apply previously fitted transformations to a data frame.

    Steps, in order:
      1. One-hot encode ``idioma_original`` with ``parameters['idioma_encoder']``
         (skipped when the encoder is missing or ``None``).
      2. Discretise ``orcamento``, ``popularidade``, ``receita`` and ``duracao``
         using the bin edges stored in ``parameters['bins']``.
      3. Create the numeric column ``ano`` from the leading four digits of
         ``data_de_estreia``.
      4. Drop ``data_de_estreia`` and ``historia_original``.

    Args:
        df: Data to transform. It is not modified; a copy is transformed.
        parameters: Mapping with the key ``'bins'`` (column name -> bin edges)
            and, optionally, ``'idioma_encoder'`` (an object exposing
            ``transform`` and ``get_feature_names``, or ``None``).

    Returns:
        The transformed copy of ``df``, keeping the index of ``df``.

    Raises:
        KeyError: If ``'bins'`` or one of the required columns is missing.
    """
    df_copy = df.copy()

    # The encoder may legitimately be None when the encoding step was not fitted.
    idioma_encoder = parameters.get('idioma_encoder')
    if idioma_encoder is not None:
        # NOTE(review): get_feature_names was removed in scikit-learn 1.2 in
        # favour of get_feature_names_out; revisit when upgrading.
        oh_idiomas = pd.DataFrame(
            idioma_encoder.transform(df_copy[["idioma_original"]].fillna('unknown')),
            columns=idioma_encoder.get_feature_names(["idioma"]),
            # Reuse the frame's index so concat aligns row by row instead of
            # matching on a fresh 0..n-1 index.
            index=df_copy.index,
        )
        df_copy = pd.concat([df_copy.drop(columns="idioma_original"), oh_idiomas], axis=1)

    # Discretisation with the bin edges learned on the training data.
    # NOTE: discretising popularity may not be the best approach.
    bins = parameters['bins']
    for column in ("orcamento", "popularidade", "receita", "duracao"):
        df_copy[column] = pd.cut(df_copy[column], bins[column])

    # Year extraction. Item assignment is required: attribute assignment
    # (df_copy.ano = ...) does not create a column. The year is read from the
    # frame being transformed, not from a notebook global.
    release_year = df_copy["data_de_estreia"].astype(str).str.extract(r'^(\d{4})-.+$', expand=False)
    df_copy["ano"] = pd.to_numeric(release_year)

    # Drop the original story and the release date (the year should work better).
    df_copy = df_copy.drop(columns=["data_de_estreia", "historia_original"])

    return df_copy

In [5]:
# Transform the training data and keep the fitted parameters (bin edges, encoder) for later reuse.
df_movies_adj, adj_parameters = adjust_df_treino(df_movies)

# Preview two transformed rows; the fixed random_state keeps the preview reproducible.
df_movies_adj.sample(2, random_state=2)

Unnamed: 0,id,titulo,adulto,orcamento,idioma_original,popularidade,data_de_estreia,resumo,receita,duracao,genero,ator_1,ator_2,ator_3,ator_4,ator_5,dirigido_por,escrito_por_1,escrito_por_2,historia_original
1291,71910,Sitting Target,False,"(-270000.0, 9000000.0]",en,"(-0.224, 7.467]",1972-06-19,"Imprisoned Harry Lomart is a vicious, brute of...","(-1118888.979, 37296299.3]","(70.5, 94.0]",Action,Oliver Reed,Jill St. John,Ian McShane,Edward Woodward,Frank Finlay,Douglas Hickox,,,
595,163590,Metal Brothers,False,"(-270000.0, 9000000.0]",sv,"(-0.224, 7.467]",2012-12-24,Bröderna Hårdrock is a film about two brothers...,"(-1118888.979, 37296299.3]","(94.0, 117.5]",Comedy,Lotta Tejle,Johan Östling,Björn Starrin,Kjell Bergqvist,Mia Skäringer,Ulf Malmros,Ulf Malmros,,


In [6]:
from base_am.resultado import Fold
from base_am.avaliacao import Experimento
from competicao_am.metodo_competicao import MetodoCompeticao
from competicao_am.avaliacao_competicao import OtimizacaoObjetivoSVMCompeticao
from sklearn.svm import LinearSVC
import pandas as pd

# Split the adjusted frame into folds, using "genero" as the class column.
# NOTE(review): the exact meaning of val_k / num_folds_validacao is defined in
# base_am.resultado.Fold — confirm there before changing these values.
arr_folds = Fold.gerar_k_folds(
    df_movies_adj,
    val_k=5,
    col_classe="genero",
    num_repeticoes=1,
    num_folds_validacao=4,
    num_repeticoes_validacao=1,
)

# Linear SVM with a fixed seed, wrapped by the competition method.
scikit_method = LinearSVC(random_state=2)
ml_method = MetodoCompeticao(scikit_method)

# Objective class handed to the experiment for hyper-parameter optimisation.
ClasseObjetivo = OtimizacaoObjetivoSVMCompeticao
experimento = Experimento(
    arr_folds,
    ml_method=ml_method,
    ClasseObjetivoOtimizacao=ClasseObjetivo,
    num_trials=2,
)

# Reading macro_f1_avg is what produces the long log below this cell.
print(f"MACRO F1: {experimento.macro_f1_avg}")

0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 15:34:56,785] Finished trial#0 with value: 0.6412301619682137 with parameters: {'min_samples_split': 2.919154032918018}. Best is trial#0 with value: 0.6412301619682137.
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 15:36:19,294] Finished trial#1 with value: 0.6418117185504726 with parameters: {'min_samples_split': 5.042271454095107}. Best is trial#1 with value: 0.6418117185504726.
0/2400
1000/2400
2000/2400
0/600
0.6595961228342233
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 15:38:09,076] Finished trial#0 with value: 0.5883676781098452 with parameters: {'min_samples_split': 0.0008006237214142065}. Best is trial#0 with value: 0.5883676781098452.
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 15:39:35,021]

In [8]:
from base_am.resultado import Fold
from base_am.avaliacao import Experimento
from competicao_am.metodo_competicao import MetodoCompeticao
from competicao_am.avaliacao_competicao import OtimizacaoObjetivoSVMCompeticao
from sklearn.svm import LinearSVC
import pandas as pd

# Split the adjusted frame into folds, using "genero" as the class column.
# NOTE(review): the exact meaning of val_k / num_folds_validacao is defined in
# base_am.resultado.Fold — confirm there before changing these values.
arr_folds = Fold.gerar_k_folds(
    df_movies_adj,
    val_k=5,
    col_classe="genero",
    num_repeticoes=1,
    num_folds_validacao=4,
    num_repeticoes_validacao=1,
)

# Linear SVM with a fixed seed, wrapped by the competition method.
scikit_method = LinearSVC(random_state=2)
ml_method = MetodoCompeticao(scikit_method)

# Objective class handed to the experiment for hyper-parameter optimisation.
ClasseObjetivo = OtimizacaoObjetivoSVMCompeticao
experimento = Experimento(
    arr_folds,
    ml_method=ml_method,
    ClasseObjetivoOtimizacao=ClasseObjetivo,
    num_trials=2,
)

# Reading macro_f1_avg is what produces the long log below this cell.
print(f"MACRO F1: {experimento.macro_f1_avg}")

0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 16:01:01,707] Finished trial#0 with value: 0.6418117185504726 with parameters: {'min_samples_split': 5.605211980728757}. Best is trial#0 with value: 0.6418117185504726.
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 16:02:26,878] Finished trial#1 with value: 0.6425031398822847 with parameters: {'min_samples_split': 6.777831030035783}. Best is trial#1 with value: 0.6425031398822847.
0/2400
1000/2400
2000/2400
0/600
0.6581501668062439
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 16:04:17,450] Finished trial#0 with value: 0.568304601964621 with parameters: {'min_samples_split': 2.1939692471147}. Best is trial#0 with value: 0.568304601964621.
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 16:05:41,567] Finishe

In [9]:
from base_am.resultado import Fold
from base_am.avaliacao import Experimento
from competicao_am.metodo_competicao import MetodoCompeticao
from competicao_am.avaliacao_competicao import OtimizacaoObjetivoSVMCompeticao
from sklearn.svm import LinearSVC
import pandas as pd

# Split the adjusted frame into folds, using "genero" as the class column.
# NOTE(review): the exact meaning of val_k / num_folds_validacao is defined in
# base_am.resultado.Fold — confirm there before changing these values.
arr_folds = Fold.gerar_k_folds(
    df_movies_adj,
    val_k=5,
    col_classe="genero",
    num_repeticoes=1,
    num_folds_validacao=4,
    num_repeticoes_validacao=1,
)

# Linear SVM with a fixed seed, wrapped by the competition method.
scikit_method = LinearSVC(random_state=2)
ml_method = MetodoCompeticao(scikit_method)

# Objective class handed to the experiment for hyper-parameter optimisation.
ClasseObjetivo = OtimizacaoObjetivoSVMCompeticao
experimento = Experimento(
    arr_folds,
    ml_method=ml_method,
    ClasseObjetivoOtimizacao=ClasseObjetivo,
    num_trials=2,
)

# Reading macro_f1_avg is what produces the long log below this cell.
print(f"MACRO F1: {experimento.macro_f1_avg}")

0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 16:27:12,051] Finished trial#0 with value: 0.6445736810072953 with parameters: {'min_samples_split': 0.6884278368313507}. Best is trial#0 with value: 0.6445736810072953.
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 16:28:38,206] Finished trial#1 with value: 0.6412301619682137 with parameters: {'min_samples_split': 2.947753375035365}. Best is trial#0 with value: 0.6445736810072953.
0/2400
1000/2400
2000/2400
0/600
0.6614420062695925
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 16:30:28,563] Finished trial#0 with value: 0.564875261429319 with parameters: {'min_samples_split': 6.7052267110535135}. Best is trial#0 with value: 0.564875261429319.
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-19 16:31:52,503] Fin