## Ideias

- Discretizar orcamento, popularidade, receita, duracao
- Criar coluna de ano (e mês, talvez), baseando-se na data de estreia
- Normalizar os valores

In [1]:
import pandas as pd

In [11]:
# Load the movie sample and preview two rows (fixed seed keeps the preview stable).
df_movies = pd.read_csv(filepath_or_buffer='datasets/movies_amostra.csv')
df_movies.sample(n=2, random_state=1)

Unnamed: 0,id,titulo,adulto,orcamento,idioma_original,popularidade,data_de_estreia,resumo,receita,duracao,genero,ator_1,ator_2,ator_3,ator_4,ator_5,dirigido_por,escrito_por_1,escrito_por_2,historia_original
1957,40688,Meet the Deedles,False,24000000,en,13.0,1998-03-27,Two surfers end up as Yellowstone park rangers...,4562146.0,93.0,Comedy,Paul Walker,John Ashton,Dennis Hopper,Eric Braeden,Richard Lineback,Steve Boyum,,,
2087,9267,And Now for Something Completely Different,False,0,en,16.0,1971-09-28,And Now for Something Completely Different is ...,0.0,88.0,Comedy,Graham Chapman,John Cleese,Terry Gilliam,Eric Idle,Terry Jones,Ian MacNaughton,Graham Chapman,John Cleese,


## Mapeador de strings para inteiros

In [3]:
from typing import Tuple, Dict 

def int_mapper(df: pd.DataFrame, col: str) -> Tuple[Dict[str, int], pd.Series]:
    """Encode the distinct values of a column as consecutive integers.

    Codes are assigned in order of first appearance in ``df[col]``,
    starting at 0.

    Args:
        df: Frame holding the column to encode. It is not modified.
        col: Name of the column whose values are mapped.

    Returns:
        A tuple ``(mapper, new_series)`` where ``mapper`` maps each distinct
        value of ``df[col]`` to its integer code and ``new_series`` is
        ``df[col]`` with every value replaced by that code (same index).
    """
    # Build the mapping from the column that was actually requested rather
    # than from a hard-coded global frame/column.
    mapper = {value: code for code, value in enumerate(df[col].unique())}

    new_series = df[col].map(mapper)

    return mapper, new_series


## Modificações no DF

In [4]:
from datetime import datetime

# Work on a copy so the raw frame loaded earlier stays untouched and this
# cell can be re-run safely.
df_movies_normalizado = df_movies.copy()

# Genre mapping: replace each genre label with its integer code.
# `mapa_genero` keeps the label -> code dictionary for later decoding.
mapa_genero, genero_mapeado = int_mapper(df_movies, 'genero')
df_movies_normalizado['genero'] = genero_mapeado

# Duration discretization: split the observed range into 20 equal-width bins.
df_movies_normalizado['duracao'] = pd.cut(df_movies_normalizado['duracao'], 20)

# TODO: extract the release year from `data_de_estreia` (not implemented yet).

# Preview with a fixed seed so the displayed rows are reproducible.
df_movies_normalizado.sample(2, random_state=1)

Unnamed: 0,id,titulo,adulto,orcamento,idioma_original,popularidade,data_de_estreia,resumo,receita,duracao,genero,ator_1,ator_2,ator_3,ator_4,ator_5,dirigido_por,escrito_por_1,escrito_por_2,historia_original
1957,40688,Meet the Deedles,False,24000000,en,13.0,1998-03-27,Two surfers end up as Yellowstone park rangers...,4562146.0,93.0,1,Paul Walker,John Ashton,Dennis Hopper,Eric Braeden,Richard Lineback,Steve Boyum,,,
2087,9267,And Now for Something Completely Different,False,0,en,16.0,1971-09-28,And Now for Something Completely Different is ...,0.0,88.0,1,Graham Chapman,John Cleese,Terry Gilliam,Eric Idle,Terry Jones,Ian MacNaughton,Graham Chapman,John Cleese,
1394,61391,Distant Drums,False,0,en,12.0,1951-12-25,"After destroying a Seminole fort, American sol...",0.0,101.0,0,Gary Cooper,Mari Aldon,Richard Webb,Ray Teal,Arthur Hunnicutt,Raoul Walsh,Martin Rackin,Niven Busch,
1520,75298,The Fountain,False,0,ru,0.0,1989-09-08,A story of a Moscow's apartment building that ...,0.0,100.0,1,Viktor Mikhaylov,Shanna Kerimtayeva,Asankul Kuttubayev,Sergei Dreiden,Nina Usatova,Yuri Mamin,,,
1098,48375,Brother's Justice,False,0,en,0.0,2010-10-22,"Motivated by Box Office statistics, Dax Shepar...",0.0,80.0,1,Tom Arnold,Jon Favreau,James Feldman,David Koechner,David Palmer,Dax Shepard,,,


In [9]:
from base_am.resultado import Fold
from base_am.avaliacao import Experimento
from competicao_am.metodo_competicao import MetodoCompeticao
from competicao_am.avaliacao_competicao import OtimizacaoObjetivoSVMCompeticao
from sklearn.svm import LinearSVC
import pandas as pd

# Build the folds from the transformed frame, with "genero" as the class column.
arr_folds = Fold.gerar_k_folds(
    df_movies_normalizado,
    val_k=5,
    col_classe="genero",
    num_repeticoes=1,
    num_folds_validacao=4,
    num_repeticoes_validacao=1,
)

# Linear SVM with a fixed seed, wrapped in the competition method class.
scikit_method = LinearSVC(random_state=2)
ml_method = MetodoCompeticao(scikit_method)

# Objective class handed to the experiment for hyper-parameter optimization.
ClasseObjetivo = OtimizacaoObjetivoSVMCompeticao
experimento = Experimento(
    arr_folds,
    ml_method=ml_method,
    ClasseObjetivoOtimizacao=ClasseObjetivo,
    num_trials=2,
)

# NOTE(review): reading `macro_f1_avg` presumably triggers the full run — confirm.
print(f"MACRO F1: {experimento.macro_f1_avg}")

0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-17 17:31:20,934] Finished trial#0 with value: 0.7601207076676431 with parameters: {'min_samples_split': 2.9343616008230637}. Best is trial#0 with value: 0.7601207076676431.
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-17 17:31:51,018] Finished trial#1 with value: 0.7607679489978387 with parameters: {'min_samples_split': 4.796536502777316}. Best is trial#1 with value: 0.7607679489978387.
0/2400
1000/2400
2000/2400
0/600
0.7659168723997266
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-17 17:32:31,020] Finished trial#0 with value: 0.7665173671388459 with parameters: {'min_samples_split': 1.431165748120622}. Best is trial#0 with value: 0.7665173671388459.
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
0/1800
1000/1800
0/600
[I 2020-08-17 17:33:02,001] Fi