# Análise Exploratória dos Dados

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sn
import matplotlib.pyplot as plt
import matplotlib
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
import tempfile
import mlflow
import lightgbm as lgb
from imblearn.over_sampling import SMOTE, SMOTENC

# Select the file-only Agg backend (figures are saved, never shown in a window)
# and apply the ggplot style to every plot created afterwards.
matplotlib.use('Agg')
matplotlib.style.use('ggplot')
# Widen the pandas display limits for notebook inspection.
# The fully qualified option names are used on purpose: the short forms
# ('max_rows', 'max_columns') are ambiguous in recent pandas releases, where
# they also match the 'styler.render.*' options and raise OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

## Funções de pré-processamento

In [26]:
def process_bd_dem_time(_df, args):
    """Clean the demographic/team table and encode its categorical columns.

    Steps, applied to a copy of ``_df``:
      * ``DTA_NASC`` values outside the open interval
        (``args['ano_antigo']``, ``args['ano_atual']``) are blanked, then the
        column is replaced by ``IDADE = args['ano_atual'] - DTA_NASC``.
      * ``SEXO`` values other than 'M'/'F' are blanked and the column is
        one-hot encoded.
      * ``ESTADO`` equal to 'Nao informado' is blanked and the column is
        one-hot encoded.
      * ``TIME`` is replaced by its integer category code.

    Returns:
        (processed DataFrame, Index of the ``TIME`` categories; position i is
        the label encoded as code i).
    """
    frame = _df.copy()

    def _append_one_hot(data, source):
        # One indicator column per observed value (blank values get none);
        # the source column is removed afterwards.
        indicators = pd.get_dummies(data[source]).astype('category')
        return pd.concat([data, indicators], axis=1).drop(columns=[source])

    # Birth year -> age
    birth = 'DTA_NASC'
    plausible_year = (frame[birth] > args['ano_antigo']) & (frame[birth] < args['ano_atual'])
    frame.loc[~plausible_year, birth] = None  # out-of-range years become missing
    frame['IDADE'] = args['ano_atual'] - frame[birth]
    frame = frame.drop(columns=[birth])

    # Sex
    known_sex = frame['SEXO'].isin(['M', 'F'])
    frame.loc[~known_sex, 'SEXO'] = None
    frame = _append_one_hot(frame, 'SEXO')

    # State
    frame.loc[frame['ESTADO'] == 'Nao informado', 'ESTADO'] = None
    frame = _append_one_hot(frame, 'ESTADO')

    # Target team -> integer code
    team = frame['TIME'].astype('category')
    y_categories = team.cat.categories
    frame['TIME'] = team.cat.codes

    return frame, y_categories

def transform_idade(df, args):
    """Replace the numeric ``IDADE`` column with one-hot age-range columns.

    Args:
        df: DataFrame containing an ``IDADE`` column; it is not modified.
        args: dict whose ``idade_bins`` entry is forwarded to ``pd.cut``.
            It may be an int (number of equal-width bins, computed from the
            min/max of *this* frame) or a sequence of explicit bin edges.
            With an int, two frames with different age ranges produce
            different bins and column names; pass explicit edges when several
            frames (e.g. train and test) must share the same columns.

    Returns:
        A copy of ``df`` without ``IDADE`` and with one categorical indicator
        column per bin, named ``IDADE_<low>_<high>`` (edges truncated to int).
    """
    _df = df.copy()

    column = 'IDADE'
    df_cut, edges = pd.cut(_df[column], bins=args['idade_bins'], retbins=True)
    edges = np.asarray(edges).astype('int').astype('str')
    # Pair consecutive edges; this works for both an int bin count and an
    # explicit list of edges (the original indexed with range(idade_bins)).
    bin_names = [f'{low}_{high}' for low, high in zip(edges[:-1], edges[1:])]
    dummies = pd.get_dummies(df_cut).astype('category')
    dummies.columns = [f'{column}_{name}' for name in bin_names]
    _df = pd.concat([_df, dummies], axis=1).drop(columns=[column])
    return _df

def process_servicos(_df, args):
    """Return a new DataFrame with every column of ``_df`` cast to category.

    ``args`` is accepted for signature parity with the other ``process_*``
    functions and is not read.
    """
    return _df.astype('category')

def clean_url(_df, args, y_categories):
    """Split the raw ``url`` column into region / news / championship / team parts.

    Args:
        _df: consumption DataFrame with a string ``url`` column (not modified).
        args: dict with sub-dicts ``region``, ``noticia``, ``campeonato`` and
            ``times``; each ``url_<part>_bool`` flag enables the matching
            output column.
        y_categories: Index of target team names; ``url_times`` values that do
            not match one of them (after normalisation) are blanked.

    Returns:
        A copy of ``_df`` without ``url`` and with the enabled ``url_region``,
        ``url_noticia``, ``url_campeonato`` and ``url_times`` columns.
    """
    print('Limpando URL')
    df_consumo = _df.copy()

    # Drop the site name and keep only the path after the first ".com/".
    # Multi-character split patterns are interpreted as regular expressions, so
    # the dot must be escaped (an unescaped "." also matched e.g. "-com/"
    # inside a slug), and n=1 keeps the remainder of the path in one piece.
    df_consumo['url'] = df_consumo['url'].str.split(r"\.com/", n=1).str.get(1)

    # Split the path around "futebol/"
    split_futebol = df_consumo['url'].str.split("futebol/")

    # url_region: first path segment before "futebol/", kept only if it has at
    # most two characters
    column = 'url_region'
    if args['region'][f'{column}_bool']:
        df_consumo[column] = split_futebol.str.get(0).str[:-1]
        df_consumo[column] = df_consumo[column].str.split("/").str.get(0)
        df_consumo[column] = df_consumo[column].str.lower()
        df_consumo.loc[df_consumo[column].str.len() > 2, column] = None
        df_consumo.loc[df_consumo[column] == '', column] = None

    # Split what follows "futebol/" around "noticia/"
    split_noticia = split_futebol.str.get(1).str.split("noticia/")

    # url_noticia: everything after "noticia/"
    column = 'url_noticia'
    if args['noticia'][f'{column}_bool']:
        df_consumo['url_noticia'] = split_noticia.str.get(1)

    # url_campeonato: first segment between "futebol/" and "times/"
    column = 'url_campeonato'
    if args['campeonato'][f'{column}_bool']:
        df_consumo[column] = split_noticia.str.get(0).str.split("times/").str.get(0).str[:-1]
        df_consumo[column] = df_consumo[column].str.split("/").str.get(0)
        df_consumo.loc[df_consumo[column] == '', column] = None
        df_consumo[column] = df_consumo[column].str.replace('-', '')

    # url_times: first segment after "times/"
    column = 'url_times'
    if args['times'][f'{column}_bool']:
        df_consumo[column] = split_noticia.str.get(0).str.split("times/").str.get(1).str[:-1]
        df_consumo[column] = df_consumo[column].str.split("/").str.get(0)
        df_consumo.loc[df_consumo[column] == '', column] = None
        # Normalise and keep only teams that exist in the target variable
        df_consumo[column] = df_consumo[column].str.lower().str.replace('-', '').str.replace(' ', '').str.replace('siga', '')
        nome_times = y_categories.str.replace('-', '').str.replace(' ', '').str.lower().unique()
        df_consumo.loc[~df_consumo[column].isin(nome_times), column] = None

    # Remove the original url column
    df_consumo = df_consumo.drop(columns=['url'])
    return df_consumo
    
def handle_dummies_column(df_consumo, args, column, drop=True):
    """One-hot encode ``column`` and optionally weight the indicators by time.

    Args:
        df_consumo: DataFrame holding ``column`` (and ``tempo`` when the time
            variant is requested).
        args: dict with the flags ``<column>_count`` (append the float32
            indicator columns ``<column>_<value>``) and ``<column>_time``
            (append ``<column>_<value>_time`` = indicator * ``tempo``).
        column: name of the column to encode.
        drop: when True, remove ``column`` from the result.

    Returns:
        The DataFrame extended with the requested columns.
    """
    print('Categorizando coluna de ', column)
    indicators = (
        pd.get_dummies(df_consumo[column])
        .astype('float32')
        .add_prefix(f'{column}_')
    )

    pieces = [df_consumo]
    if args[f'{column}_count']:
        pieces.append(indicators)
    if args[f'{column}_time']:
        # Row-wise product of every indicator with the time spent on the page.
        weighted = indicators.mul(df_consumo['tempo'], axis=0).add_suffix('_time')
        pieces.append(weighted)

    result = pd.concat(pieces, axis=1) if len(pieces) > 1 else df_consumo
    if drop:
        result = result.drop(columns=[column])
    return result
    
def process_consumo(_df, args, y_categories):
    """Turn the raw consumption log into one feature row per user (``KEY``).

    Rows with missing values are dropped, the URL is split by ``clean_url``,
    each enabled URL part is one-hot encoded by ``handle_dummies_column``,
    news slugs can add to the team counters, and everything is summed per
    ``KEY``.

    Args:
        _df: raw consumption DataFrame (not modified).
        args: the ``consumo`` configuration (sub-dicts ``region``,
            ``campeonato``, ``times``, ``noticia``).
        y_categories: Index of target team names, forwarded to ``clean_url``.

    Returns:
        DataFrame with one row per ``KEY`` and the summed feature columns.
    """
    df_consumo = _df.copy()
    df_consumo = df_consumo.dropna()
    print('Tamnho do dataset de Consumo', df_consumo.shape)

    # Split the url column into its region / news / championship / team parts
    df_consumo = clean_url(df_consumo, args, y_categories)

    # Region
    column = 'url_region'
    if args['region'][f'{column}_bool']:
        df_consumo = handle_dummies_column(df_consumo, args=args['region'], column=column)

    # Championship
    column = 'url_campeonato'
    if args['campeonato'][f'{column}_bool']:
        df_consumo = handle_dummies_column(df_consumo, args=args['campeonato'], column=column)

    # Teams: keep the raw url_times column when the news step below needs it
    column = 'url_times'
    if args['times'][f'{column}_bool']:
        keep_raw_team = bool(args['noticia']['url_noticia_bool'])
        df_consumo = handle_dummies_column(
            df_consumo, args=args['times'], column=column, drop=not keep_raw_team)

    # News: when the URL carries no team but the news slug mentions one,
    # count the access for that team.
    if args['noticia']['url_noticia_bool']:
        print('Lendo coluna de  url_times')
        prefix = 'url_times_'
        if 'url_times' in df_consumo.columns:
            team_missing = df_consumo['url_times'].isna()
            # Only the count indicators are incremented; the time-weighted
            # '<team>_time' columns are skipped (their last '_' token is
            # 'time', which is not a team name).
            count_columns = [
                name for name in df_consumo.columns
                if name.startswith(prefix) and not name.endswith('_time')
            ]
            # A dedicated loop variable is used so that `column` is not
            # clobbered: the original dropped the last team indicator instead
            # of 'url_noticia' after this loop.
            for team_column in count_columns:
                nome_time = team_column[len(prefix):]
                mentions = df_consumo['url_noticia'].str.contains(nome_time, regex=False, na=False)
                mask = mentions & team_missing
                print('Nome Antes: ', nome_time, df_consumo.loc[mask, team_column].sum())
                df_consumo.loc[mask, team_column] += 1
                print('Nome Depois: ', nome_time, df_consumo.loc[mask, team_column].sum())

        # Remove the raw text columns so that only numeric features are summed;
        # 'url_times' is absent when team processing is disabled.
        df_consumo = df_consumo.drop(columns=['url_noticia', 'url_times'], errors='ignore')

    # Aggregate consumption per user
    df_consumo = df_consumo.groupby('KEY').sum().reset_index()

    return df_consumo

def concat_data(df_bem_time, df_servicos, df_consumo):
    """Left-join the three tables on ``KEY`` and fill missing values with 0.

    ``df_bem_time`` defines the rows of the result; ``df_servicos`` and
    ``df_consumo`` contribute columns for the keys they share with it.
    """
    merged = df_bem_time.set_index('KEY')
    for extra in (df_servicos, df_consumo):
        merged = merged.join(extra.set_index('KEY'))
    return merged.fillna(0)

def split_X_y(df):
    """Separate the ``TIME`` target column from the feature columns.

    Returns:
        (features DataFrame without ``TIME``, ``TIME`` Series).
    """
    target = df['TIME']
    features = df.drop(columns=['TIME'])
    return features, target

## Funções de Treinamento

In [3]:
def get_model(model_args):
    """Build the unfitted estimator described by ``model_args``.

    Args:
        model_args: dict with ``model_name``; for ``'SVM'`` it must also
            contain ``C``.

    Returns:
        A ``LinearSVC`` configured with ``C`` when ``model_name`` is ``'SVM'``.

    Raises:
        ValueError: for any other ``model_name`` (previously this surfaced as
            an UnboundLocalError on the return statement).
    """
    model_name = model_args['model_name']
    if model_name == 'SVM':
        return LinearSVC(C=model_args['C'])
    raise ValueError(f"Unsupported model_name for get_model: {model_name!r}")

def create_heatmap(y_test, y_pred, y_categories, name_file='example.png', normalize=False):
    """Render the confusion matrix as a heatmap image and return its path.

    Args:
        y_test: true class codes.
        y_pred: predicted class codes.
        y_categories: class names; position i labels class code i.
        name_file: file name of the image inside a fresh temporary directory.
        normalize: when True, each column of the matrix is standardised
            (mean subtracted, divided by its standard deviation), rounded to
            two decimals and annotated on the plot.

    Returns:
        Path of the saved image. The temporary directory is not removed here;
        the caller owns it.
    """
    fig = plt.figure(figsize=(20, 16))

    # Pin the label set to every known class code so the matrix always has one
    # row/column per entry of y_categories, even when a class is absent from
    # both y_test and y_pred (otherwise the DataFrame construction below fails
    # on a shape mismatch).
    labels = np.arange(len(y_categories))
    cnf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
    df_cm = pd.DataFrame(cnf_matrix, index=y_categories, columns=y_categories)

    annot = False
    title = 'Matriz de Confusão'
    if normalize:
        df_cm = (df_cm - df_cm.mean()) / df_cm.std()  # column-wise standardisation
        df_cm = df_cm.round(2)
        annot = True
        title = title + ' Normalizada'

    plt.title(title)
    sn.heatmap(df_cm, annot=annot, cmap="YlGnBu", linewidths=.5)

    # Save the figure created above (the heatmap is drawn on its current axes)
    dirpath = tempfile.mkdtemp()
    save_path = os.path.join(dirpath, name_file)
    fig.savefig(save_path)
    plt.close(fig)
    return save_path

def get_metrics(y_test, y_pred):
    """Return accuracy (``'acc'``) and weighted F1 (``'f1'``) as a dict."""
    accuracy = accuracy_score(y_test, y_pred)
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    return {'acc': accuracy, 'f1': weighted_f1}

def get_general_metrics(json_metrics, test=False):
    """Average per-fold metrics into a single summary dict.

    Args:
        json_metrics: mapping ``fold -> {metric_name: value}``.
        test: selects the key prefix of the summary: ``'test_'`` when True,
            ``'train_'`` otherwise (the default, matching the previous
            behaviour, which ignored this flag).

    Returns:
        dict ``{<prefix><metric_name>: mean over folds}``.
    """
    df_metrics = pd.DataFrame.from_dict(json_metrics).mean(axis=1)
    prefix = 'test_' if test else 'train_'
    df_metrics.index = prefix + df_metrics.index
    return df_metrics.to_dict()

def recursive_log_params(dict_params):
    """Log every leaf of a (possibly nested) dict as an MLflow parameter.

    Nested dicts are traversed depth-first; each leaf is logged under its own
    key only (parent keys are not part of the logged name).
    """
    for key in dict_params:
        value = dict_params[key]
        if not isinstance(value, dict):
            mlflow.log_param(key, value)
            continue
        recursive_log_params(value)

def model_fit(X_train, y_train, model_args):
    """Train the model selected by ``model_args['model_name']``.

    Args:
        X_train: training features.
        y_train: training labels.
        model_args: dict with ``model_name``. For ``'SVM'`` it must contain
            ``C``; for ``'LGBM'`` every other entry is forwarded to
            ``lgb.train`` as the parameter dict.

    Returns:
        The fitted ``LinearSVC`` or the trained LightGBM booster.

    Raises:
        ValueError: if ``model_name`` is neither ``'SVM'`` nor ``'LGBM'``
            (previously this surfaced as an UnboundLocalError).
    """
    model_name = model_args['model_name']
    if model_name == 'SVM':
        model = LinearSVC(C=model_args['C'])
        model.fit(X_train, y_train)
        return model
    if model_name == 'LGBM':
        # 'model_name' is a pipeline switch, not a LightGBM parameter.
        params = {key: value for key, value in model_args.items() if key != 'model_name'}
        d_train = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
        return lgb.train(params, d_train)
    raise ValueError(f"Unsupported model_name for model_fit: {model_name!r}")

def model_predict(model, X_test, model_args):
    """Predict class labels for ``X_test``.

    For ``model_name == 'LGBM'`` each row of ``model.predict`` output is
    reduced to the index of its largest value and a list is returned; for any
    other model the output of ``model.predict`` is returned unchanged.
    """
    raw_prediction = model.predict(X_test)
    if model_args['model_name'] != 'LGBM':
        return raw_prediction
    return [np.argmax(row) for row in raw_prediction]

## Hiperparametros

In [4]:
# Experiment configuration. "data" drives preprocessing and validation;
# "model" is handed to model_fit / model_predict (for LGBM every entry except
# "model_name" is forwarded to lgb.train as its parameter dict).
hiperparams = {
    "data": {
        "test_split":0.2,  # test_size used for the train/hold-out split
        "n_folds": 5, # number of cross-validation folds
        "smote": True,  # oversample the smaller classes of each training split
        "pca": 1,  # PCA is applied only when this value is < 1
        "bd_dem_time":{
            # Birth years are kept only when strictly between ano_antigo and
            # ano_atual; age is computed as ano_atual - birth year.
            "ano_atual": 2021,
            "ano_antigo": 1920,
            "idade_bins": 20  # `bins` argument given to pd.cut for the age column
        },
        "servicos": {
            # no options: process_servicos does not read its args
        },
        "consumo": {
            "nrows": 1500000000,  # number of rows to read from the consumption table; use a very large value to read everything
            # Each group below has three flags: *_bool enables the URL part,
            # *_count adds the indicator columns, *_time adds the indicators
            # multiplied by the time spent.
            # Region is not meant to be used with its *_time flag set to True
            "region":{
                "url_region_bool": True,
                "url_region_count": True,
                "url_region_time": False,
            },
            "campeonato":{
                "url_campeonato_bool": False,
                "url_campeonato_count": True,
                "url_campeonato_time": False,
            },
            "times":{
                "url_times_bool": True,
                "url_times_count": True,
                "url_times_time": False,
            },
            # News is not meant to be used with its *_time flag set to True
            "noticia":{
                "url_noticia_bool": False,
                "url_noticia_count": True,
                "url_noticia_time": False,
            }
        }
    },
    # Alternative model configuration (linear SVM), kept for reference:
    #"model": {
    #    "model_name": "SVM",
    #    "C": 0.1
    #}
    "model": {
        "model_name": "LGBM",
        "boosting_type": "gbdt", # gradient boosting decision tree
        "objective": "multiclass", # multi-class target
        "metric": "multi_logloss", # metric for multi-class
        "num_class": 29, # number of distinct target classes (hard-coded; presumably must match the number of teams in TIME - verify against the data)
        #"force_row_wise": True
        #"feature_fraction": 0.8,
        "min_data_in_leaf": 100,
        "feature_fraction": 0.8,
        "max_depth": 9,
        "num_leaves": 60,
        "verbose": -1,
    }
}

# Inicio do Pipeline

In [6]:
# Directory with the raw tables; list the CSV files found in it
# (os.listdir does not guarantee any particular order).
data_path = 'dataset'
files = [x for x in os.listdir(data_path) if x.endswith('.csv')]
files

['BD_SERVICOS.csv', 'BD_CONSUMO.csv', 'BD_DEM_TIME.csv']

In [7]:
# Load the demographic/team table (BD_DEM_TIME)
df_bem_time = pd.read_csv(os.path.join(data_path, 'BD_DEM_TIME.csv'))

In [8]:
# Split the users into a training portion and a hold-out test portion
# (fixed random_state so the split is reproducible; the split is not stratified).
df_bem_time_train, df_bem_time_test = train_test_split(df_bem_time, test_size=hiperparams['data']['test_split'], random_state=42)

In [9]:
# Load the services table
df_servicos = pd.read_csv(os.path.join(data_path, 'BD_SERVICOS.csv'))
# Load the consumption table (at most `nrows` rows)
df_consumo = pd.read_csv(os.path.join(data_path, 'BD_CONSUMO.csv'), nrows=hiperparams['data']['consumo']['nrows'])

In [10]:
# Clean/encode the training demographics; y_categories maps each integer
# target code back to its team name.
df_bem_time_train_processed, y_categories = process_bd_dem_time(df_bem_time_train, 
                                                                hiperparams["data"]["bd_dem_time"])

In [11]:
# Services: every column cast to category.
# Consumption: URL-derived features summed per user (KEY); the team names in
# y_categories are used to validate the teams found in the URLs.
df_servicos_processed = process_servicos(df_servicos, hiperparams["data"]["servicos"])
df_consumo_processed = process_consumo(df_consumo, hiperparams["data"]["consumo"], y_categories)

Tamnho do dataset de Consumo (14174706, 3)
Limpando URL
Categorizando coluna de  url_region
Categorizando coluna de  url_times


In [12]:
# Join the three processed tables on KEY (rows follow the training
# demographics) and separate the TIME target from the features.
df = concat_data(df_bem_time_train_processed, df_servicos_processed, df_consumo_processed)
X, y = split_X_y(df)

In [25]:
# K-fold cross-validation on the training portion, tracked as one MLflow run:
# parameters are logged once, metrics and confusion matrices once per fold,
# and the fold-averaged metrics at the end.
kf = KFold(n_splits=hiperparams['data']['n_folds'], shuffle=True, random_state=12345)

with mlflow.start_run():

    recursive_log_params(dict_params=hiperparams)

    metrics = {}
    for fold_number, (train_index, test_index) in enumerate(kf.split(X)):
        step = fold_number+1

        print("TRAIN FOLD: ", step)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # NOTE(review): fit_transform returns a plain array, so the steps
        # below that read X_train.dtypes / the IDADE column would fail when
        # PCA is enabled (pca < 1). Not exercised with pca == 1.
        if hiperparams['data']['pca'] < 1:
            pca = PCA(n_components=hiperparams['data']['pca'])
            X_train = pca.fit_transform(X_train)
            X_test = pca.transform(X_test)

        if hiperparams['data']['smote']:
            # Raise every class below the median class size up to the median.
            sample = y_train.value_counts()
            # int(): with an even number of classes the median can end in .5,
            # which would give the sampler non-integer target counts.
            min_desired_data = int(sample.median())
            sample[sample<min_desired_data] = min_desired_data

            oversample = SMOTENC(
                X_train.dtypes=='category', 
                sampling_strategy=sample.to_dict(), 
                random_state=12345, 
                k_neighbors=5, 
                n_jobs=-1)
            X_train, y_train = oversample.fit_resample(X_train, y_train)

        # NOTE(review): with an integer idade_bins the age bins are derived
        # from each frame's own min/max, so train and test only get the same
        # columns when their age ranges coincide.
        X_train = transform_idade(X_train, hiperparams["data"]["bd_dem_time"])
        X_test = transform_idade(X_test, hiperparams["data"]["bd_dem_time"])

        # Standardise the non-categorical columns, fitting on the training fold only
        continuous_features = X_train.select_dtypes(exclude=['category']).columns
        sc=StandardScaler()
        X_train[continuous_features]=sc.fit_transform(X_train[continuous_features])
        X_test[continuous_features]=sc.transform(X_test[continuous_features])

        model = model_fit(X_train, y_train, model_args=hiperparams['model'])

        # Predict on the validation fold
        y_pred = model_predict(model, X_test, model_args=hiperparams['model'])

        fold_metric = get_metrics(y_test, y_pred)
        mlflow.log_metrics(fold_metric, step=step)
        metrics[step] = fold_metric

        # Create and log the confusion-matrix heatmap
        save_path = create_heatmap(y_test, y_pred, y_categories, name_file=f'confusion_matrix_fold_{step}.png')
        mlflow.log_artifact(save_path)

        # Create and log the normalised heatmap
        save_path = create_heatmap(y_test, y_pred, y_categories, 
                                   name_file=f'confusion_matrix_fold_{step}_normalized.png', normalize=True)
        mlflow.log_artifact(save_path)

    summary_metrics = get_general_metrics(metrics) 
    mlflow.log_metrics(summary_metrics)

TRAIN FOLD:  1
TRAIN FOLD:  2
TRAIN FOLD:  3
TRAIN FOLD:  4
TRAIN FOLD:  5


# Holdout para o teste

In [57]:
# Clean/encode the hold-out demographics. process_bd_dem_time derives the
# target codes from the categories present in the frame it receives, so the
# hold-out codes are re-expressed with the *training* categories
# (y_categories). Otherwise a team missing from the hold-out set would shift
# the codes and the labels would no longer match the trained model. Teams
# unseen in training (and missing targets) get code -1.
df_bem_time_test_processed, y_categories_test = process_bd_dem_time(df_bem_time_test, 
                                                                hiperparams["data"]["bd_dem_time"])
holdout_target = pd.Categorical.from_codes(
    df_bem_time_test_processed['TIME'].to_numpy(), categories=y_categories_test)
df_bem_time_test_processed['TIME'] = holdout_target.set_categories(y_categories).codes

In [58]:
# Join the hold-out demographics with the (already processed) services and
# consumption tables, then separate target and features.
df_test = concat_data(df_bem_time_test_processed, df_servicos_processed, df_consumo_processed)
X_final, y_final = split_X_y(df_test)

In [59]:
# The final model is trained on the whole training portion (all folds)
X_train, y_train = X, y

In [63]:
# Sanity check: both matrices should have the same number of columns
X_train.shape, X_final.shape

((832062, 114), (190293, 114))

In [61]:
# Align the hold-out features with the training features before scaling: the
# one-hot columns are derived independently for each split, so nothing else
# guarantees the same columns in the same order. Missing indicators are
# filled with 0 and columns unknown to training are dropped.
X_final = X_final.reindex(columns=X_train.columns, fill_value=0)

# Standardise every column, fitting the scaler on the training data only.
# NOTE(review): unlike the cross-validation loop, this scales the one-hot
# columns too and skips transform_idade - confirm this is intended.
sc=StandardScaler()
X_train=pd.DataFrame(sc.fit_transform(X_train))
X_final=pd.DataFrame(sc.transform(X_final))

In [62]:
if hiperparams['data']['smote']:
    # Raise every class below the median class size up to the median.
    sample = y_train.value_counts()
    # int(): with an even number of classes the median can end in .5, which
    # would give the sampler non-integer target counts.
    min_desired_data = int(sample.median())
    sample[sample<min_desired_data] = min_desired_data

    oversample = SMOTE(sampling_strategy=sample.to_dict(), random_state=12345, k_neighbors=5, n_jobs=-1)
    X_train, y_train = oversample.fit_resample(X_train, y_train)

In [36]:
# Train on the full training portion and evaluate once on the hold-out set
model = model_fit(X_train, y_train, model_args=hiperparams['model'])

# Predict on the hold-out set
y_pred = model_predict(model, X_final, model_args=hiperparams['model'])

fold_metric = get_metrics(y_final, y_pred)

# Create and save the heatmap. The file gets its own hold-out name: the
# previous name reused `step`, a leftover of the cross-validation loop, so the
# image was labelled as the last fold.
save_path = create_heatmap(y_final, y_pred, y_categories, name_file='confusion_matrix_holdout.png')

In [44]:
# Normalised hold-out heatmap. Named explicitly instead of reusing `step`,
# which is a leftover of the cross-validation loop.
save_path = create_heatmap(y_final, y_pred, y_categories, 
                                   name_file='confusion_matrix_holdout_normalized.png', normalize=True)

In [47]:
# Hold-out metrics (accuracy and weighted F1)
fold_metric

{'acc': 0.6594409673503492, 'f1': 0.658173888096062}