# **Bibliotecas e instalações**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import pickle
# Module-level seed constant. None of the functions visible in this file read it
# (their random_state parameters default to a literal 10).
# NOTE(review): presumably intended for callers to pass as random_state - confirm.
seed = 10

In [None]:
# Pré-Processamento
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler, PowerTransformer, QuantileTransformer
from sklearn.model_selection import train_test_split

In [None]:
# Classificação
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import plot_confusion_matrix


The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).


The sklearn.neighbors.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.



In [None]:
# Regressão
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# **Funções**

## **Gerais**

In [None]:
def read_csv(path):
  """Load a csv file into a DataFrame and print its shape.

  :param path str: location of the csv file (anything accepted by pd.read_csv).

  :return: the loaded table.
  :rtype: pd.DataFrame
  """

  loaded = pd.read_csv(path)

  # Quick visual check of (rows, columns) right after loading.
  dimensions = loaded.shape
  print(dimensions)

  return loaded

In [None]:
def save_csv(df, path):
  """Write a DataFrame to a csv file and print a confirmation message.

  The file is written as UTF-8 and the DataFrame index is not included.

  :param df pd.DataFrame: table to be written.
  :param path str: destination of the csv file.

  :return: no value
  :rtype: none
  """

  df.to_csv(path_or_buf=path, index=False, encoding='utf-8')

  confirmation = 'Arquivo csv salvo com sucesso!'
  print(confirmation)

In [None]:
def get_dates_diff(df):
  """Compute the differences, in days, between the date columns.

  Rows without DTTRAT or DTULTINFO are dropped. The five date columns are
  parsed to datetimes (DTTRAT as day/month/year, the others as
  year-month-day) and nine columns, delta_t1 to delta_t9, are appended.
  The input DataFrame is not modified.

  :param df pd.DataFrame: DataFrame holding the columns DTCONSULT, DTDIAG,
                          DTTRAT, DTULTINFO and DTRECIDIVA.

  :return: copy of the DataFrame with parsed dates and the nine delta columns
  :rtype: pd.DataFrame
  """

  iso_format = '%Y-%m-%d'

  # Parsing format of every date column; only DTTRAT is day/month/year.
  date_formats = {
    'DTCONSULT': iso_format,
    'DTDIAG': iso_format,
    'DTTRAT': '%d/%m/%Y',
    'DTULTINFO': iso_format,
    'DTRECIDIVA': iso_format,
  }

  # (new column, later date, earlier date): new column = later - earlier.
  intervals = [
    ('delta_t1', 'DTDIAG', 'DTCONSULT'),
    ('delta_t2', 'DTTRAT', 'DTDIAG'),
    ('delta_t3', 'DTTRAT', 'DTCONSULT'),
    ('delta_t4', 'DTRECIDIVA', 'DTCONSULT'),
    ('delta_t5', 'DTRECIDIVA', 'DTDIAG'),
    ('delta_t6', 'DTRECIDIVA', 'DTTRAT'),
    ('delta_t7', 'DTULTINFO', 'DTCONSULT'),
    ('delta_t8', 'DTULTINFO', 'DTDIAG'),
    ('delta_t9', 'DTULTINFO', 'DTTRAT'),
  ]

  dated = df.copy()
  dated = dated.dropna(subset=['DTTRAT', 'DTULTINFO'])

  for column, fmt in date_formats.items():
    dated[column] = pd.to_datetime(dated[column], format=fmt)

  for new_column, later, earlier in intervals:
    elapsed = dated[later] - dated[earlier]
    dated[new_column] = elapsed.dt.days

  return dated

In [None]:
def get_labels(df):
  """Create death labels according to the last-information status and time.

  Two columns are added to a copy of the input:

  * ``ob``: 1 where ULTINFO > 2, otherwise 0.
  * ``ano_ob``: 0 where ``ob`` is 0; otherwise the 365-day bucket of
    delta_t8 (1 for < 365 days, 2 for < 730, ... 5 for < 1825) and 6 for
    anything else, including a missing delta_t8.

  The labelling is vectorized and positional, so rows are labelled
  independently even when the DataFrame index contains repeated values.

  :param df pd.DataFrame: dataframe with the columns ULTINFO and delta_t8.

  :return: DataFrame with the new labels
  :rtype: pd.DataFrame
  """

  df_aux = df.copy()

  # Plain arrays make every operation positional (index labels are ignored).
  died = (df_aux['ULTINFO'] > 2).to_numpy()
  days = df_aux['delta_t8'].to_numpy()

  # Death label
  df_aux['ob'] = died.astype('int64')

  # Year-of-death label: the first threshold that is not reached wins. NaN
  # compares False against every threshold and therefore ends up in the
  # default bucket (6).
  year_limits = [days < year * 365 for year in range(1, 6)]
  year_bucket = np.select(year_limits, [1, 2, 3, 4, 5], default=6)

  df_aux['ano_ob'] = np.where(died, year_bucket, 0).astype('int64')

  return df_aux

In [None]:
def get_label_rec(df):
  """Create four one-hot labels combining death status and recurrence.

  A row counts as "death" when ``ob == 1`` and as "no recurrence" when
  ``RECNENHUM == 1``. Exactly one of the four new columns is 1 for every row:

  * ``ob_com_rec``: death, recurrence
  * ``ob_sem_rec``: death, no recurrence
  * ``vivo_com_rec``: no death, recurrence
  * ``vivo_sem_rec``: no death, no recurrence

  The labelling is vectorized and positional, so rows are labelled
  independently even when the DataFrame index contains repeated values.

  :param df pd.DataFrame: dataframe with the columns ob and RECNENHUM.

  :return: DataFrame with the new labels
  :rtype: pd.DataFrame
  """

  df_aux = df.copy()

  # Plain boolean arrays keep the assignment positional.
  died = (df_aux['ob'] == 1).to_numpy()
  no_recurrence = (df_aux['RECNENHUM'] == 1).to_numpy()

  df_aux['ob_com_rec'] = (died & ~no_recurrence).astype('int64')
  df_aux['ob_sem_rec'] = (died & no_recurrence).astype('int64')
  df_aux['vivo_com_rec'] = (~died & ~no_recurrence).astype('int64')
  df_aux['vivo_sem_rec'] = (~died & no_recurrence).astype('int64')

  return df_aux

## **Modelos**

In [None]:
def variables_preprocessing(df):
  """Do some preprocessing on the DataFrame: string splits, filling of NaN
     values, value replacements and removal of some columns.

  Steps, all applied to a copy of the input:

  * DRS: keep the second space-separated token as an int64 (0 when missing).
  * META01-04, REC01-04, CICI: missing values become '**Sem informação**'.
  * PT / PN: normalise the numeric codes to strings, upper-case, fix known
    PN typos, then fill missing values.
  * PM: normalise 0.0 to '0', then fill missing values.
  * CICIGRUP: keep the text before the first double space, then fill
    missing values.
  * Drop the columns listed in ``drop_cols``.

  Missing values are filled by assigning the filled column back to the
  DataFrame. ``df_aux.COL.fillna(..., inplace=True)`` operates on an
  intermediate object and is not guaranteed to update the DataFrame (it does
  nothing under pandas Copy-on-Write).

  :param df pd.DataFrame: DataFrame to be preprocessed.

  :return: DataFrame after being preprocessed and having some columns removed
  :rtype: pd.DataFrame
  :raises KeyError: if one of the columns to be dropped is not present.
  """

  df_aux = df.copy()
  no_info = '**Sem informação**'

  # DRS: missing entries are filled with the string '0' (not the int 0) so
  # the fill also works on string-typed columns; the cast yields 0 either way.
  drs_tokens = df_aux['DRS'].str.split(' ', expand=True)
  df_aux['DRS'] = drs_tokens[1].fillna('0').astype('int64')

  # META, REC and CICI: only the missing values are replaced
  fill_only_cols = ['META01', 'META02', 'META03', 'META04',
                    'REC01', 'REC02', 'REC03', 'REC04',
                    'CICI']
  for col in fill_only_cols:
    df_aux[col] = df_aux[col].fillna(no_info)

  # PT
  pt = df_aux['PT'].replace([1.0], '1').str.upper()
  df_aux['PT'] = pt.fillna(no_info)

  # PN
  pn = df_aux['PN'].replace([0.0], '0').str.upper()
  pn = pn.replace(['1BI', 'IBII', 'O'], ['1B1', '1B2', '0'])
  df_aux['PN'] = pn.fillna(no_info)

  # PM
  df_aux['PM'] = df_aux['PM'].replace([0.0], '0').fillna(no_info)

  # CICIGRUP: the group name is separated from the rest by two spaces
  cicigrup_tokens = df_aux['CICIGRUP'].str.split('  ', expand=True)
  df_aux['CICIGRUP'] = cicigrup_tokens[0].fillna(no_info)

  # Columns removed from the result (the original notebook labels them as
  # columns with unique values)
  drop_cols = ['S','QUIMIOANT','HORMOANT','TMOANT','IMUNOANT','OUTROANT','ERRO',
               'CIDO', 'UFNASC','CIDADE','DESCTOPO','DESCMORFO','DSCCIDO','CICISUBGRU',
               'INSTORIG', 'OUTRACLA']

  return df_aux.drop(columns=drop_cols)

In [None]:
def get_train_test(df, drop_cols, label, test_size=0.25, random_state=10):
  """Separate features from the label and split both into train and test sets.

  The features are every column of the DataFrame except ``drop_cols`` and
  ``label``. The shapes of the four resulting pieces are printed.

  :param df pd.DataFrame: dataframe that will be split.
  :param drop_cols list: columns to be removed from the features.
  :param label str: name of the label column.
  :param test_size float: size of test (default=0.25).
  :param random_state int: value for train_test_split random_state (default=10).

  :return: X_train, X_test, y_train, y_test, as returned by train_test_split
  :rtype: tuple
  """

  data = df.copy()

  # Remove the unwanted columns first, then set the label aside.
  kept_columns = data.columns.drop(drop_cols)
  target = data[label].copy()
  feature_columns = kept_columns.drop(label)
  features = data[feature_columns]

  X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    random_state=random_state,
  )

  print(f'X_train = {X_train.shape}, X_test = {X_test.shape}')
  print(f'y_train = {y_train.shape}, y_test = {y_test.shape}')

  return X_train, X_test, y_train, y_test

In [None]:
def train_preprocessing(df, normalizer='StandardScaler', pca=False, pca_components=None, random_state=10):
  """Preprocess the train dataset.

  Every object-dtype column is label-encoded with its own LabelEncoder, the
  whole table is then normalized and, optionally, projected with PCA. The
  input DataFrame is not modified.

  :param df pd.DataFrame: DataFrame to be preprocessed.
  :param normalizer str: which normalizer to fit to the data, one of
                         'StandardScaler', 'MinMaxScaler', 'MaxAbsScaler',
                         'PowerTransformer' or 'QuantileTransformer'
                         (default='StandardScaler').
  :param pca bool: set True to apply PCA after the normalization (default=False).
  :param pca_components int: number of PCA components (default=None).
  :param random_state int: value for pca random_state (default=10).

  :return df: preprocessed train data, i.e. the output of the last
              ``fit_transform`` applied (normalizer or PCA)
  :rtype: array-like
  :return enc: fitted LabelEncoder of every categorical column, keyed by column name
  :rtype: dict
  :return norm: fitted normalizer
  :rtype: object
  :return pca if param pca=True: fitted PCA
  :rtype: object
  :raises ValueError: if ``normalizer`` is not one of the supported names.
  """

  # The constructors are wrapped in lambdas so that nothing is instantiated
  # before the requested name has been validated.
  normalizer_factories = {
    'StandardScaler': lambda: StandardScaler(),
    'MinMaxScaler': lambda: MinMaxScaler(),
    'MaxAbsScaler': lambda: MaxAbsScaler(),
    'PowerTransformer': lambda: PowerTransformer(),
    'QuantileTransformer': lambda: QuantileTransformer(output_distribution='normal'),
  }

  # Fail early with a clear message instead of reaching the fit step with no
  # normalizer defined.
  if normalizer not in normalizer_factories:
    raise ValueError(
      f'Unknown normalizer {normalizer!r}; expected one of {sorted(normalizer_factories)}'
    )

  df_aux = df.copy()

  enc = {}
  for col in df_aux.select_dtypes(include='object').columns:
    enc[col] = LabelEncoder()
    df_aux[col] = enc[col].fit_transform(df_aux[col])

  norm = normalizer_factories[normalizer]()
  df_aux = norm.fit_transform(df_aux)

  if pca:
    # A separate name keeps the boolean flag and the fitted model apart.
    pca_model = PCA(pca_components, random_state=random_state)
    df_aux = pca_model.fit_transform(df_aux)

    return df_aux, enc, norm, pca_model

  return df_aux, enc, norm

In [None]:
def test_preprocessing(df, enc, norm, pca=None):
  """Preprocess the test dataset with the objects fitted on the train dataset.

  Missing values are filled with 0. In every object-dtype column, values
  known to the corresponding encoder are replaced by their encoded value and
  all other values are replaced by -1. The table is then transformed by the
  normalizer and, when given, by the PCA. The input DataFrame is not modified.

  :param df pd.DataFrame: DataFrame to be preprocessed.
  :param enc: dict mapping each categorical column name to its fitted encoder.
  :param norm: trained normalizer.
  :param pca: trained PCA (default=None).

  :return: preprocessed test data, i.e. the output of the last ``transform`` applied
  :rtype: array-like
  :raises KeyError: if an object-dtype column has no encoder in ``enc``.
  """

  df_aux = df.copy()

  df_aux.fillna(0, inplace=True)

  list_categorical = df_aux.select_dtypes(include='object').columns

  for col in list_categorical:
    # Computed once, before the column is modified: True where the value was
    # seen when the encoder was fitted.
    known = df_aux[col].isin(enc[col].classes_)

    # Values the encoder cannot transform get the sentinel -1.
    df_aux.loc[~known, col] = -1
    df_aux.loc[known, col] = enc[col].transform(df_aux.loc[known, col])

  df_aux = norm.transform(df_aux)

  # Identity check: the test must not depend on how the object defines equality.
  if pca is not None:
    df_aux = pca.transform(df_aux)

  return df_aux

In [None]:
def plot_feat_importances(model, X_test, n=25):
  """Plot the largest feature importances of a model as horizontal bars.

  :param model: fitted model exposing ``feature_importances_``.
  :param X_test pd.DataFrame: X_test for the model, before preprocessing; its
                              column names label the importances.
  :param n int: number of features to be shown (default=25).

  :return: no value
  :rtype: none
  """

  # Pair each importance with the name of its column.
  importances = pd.Series(data=model.feature_importances_, index=X_test.columns)

  top_features = importances.nlargest(n)
  top_features.plot(kind='barh', figsize=(10,10))

  plt.show()

In [None]:
def validate_regression(X_test, model, y_test):
  """Evaluate a fitted regression model on the test data.

  Prints the mean absolute error, the mean squared error, its square root
  and the value of ``model.score``, all with three decimals.

  :param X_test pd.DataFrame: values to be validated
  :param model: trained machine learning model
  :param y_test array-like: true labels for the regression

  :return: DataFrame comparing the real ('Atual') and predicted ('Predito') values
  :rtype: pd.DataFrame
  """

  predicted = model.predict(X_test)
  comparison = pd.DataFrame({'Atual': y_test, 'Predito': predicted})

  abs_error = mean_absolute_error(y_test, predicted)
  sq_error = mean_squared_error(y_test, predicted)
  root_sq_error = np.sqrt(sq_error)
  model_score = model.score(X_test, y_test)

  print(f'Mean Absolute Error: {abs_error:.3f}')
  print(f'Mean Squared Error: {sq_error:.3f}')
  print(f'Root Mean Squared Error: {root_sq_error:.3f}')
  print(f'Score: {model_score:.3f}')

  return comparison

# **Referências**

https://machinelearningmastery.com/one-vs-rest-and-one-vs-one-for-multi-class-classification/

https://machinelearningmastery.com/robust-regression-for-machine-learning-in-python/

https://stackabuse.com/linear-regression-in-python-with-scikit-learn/

https://machinelearningmastery.com/xgboost-for-regression/