# Carregamento e Pré-processamento

## One-hot encoding

### Dataset da IBM

In [1]:
import pandas as pd
# Load the IBM Telco churn data. The file carries an .xls extension but is
# parsed here as delimited text with read_csv.
df_IBM = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.xls')

# Attributes kept for modelling, using the raw IBM column names.
selected_columns = ['tenure', 'MonthlyCharges', 'Contract', 'InternetService', 'TechSupport', 'Churn']

# Keep only those attributes and rename them so both datasets share the same
# column names.
df_IBM = df_IBM[selected_columns].rename(
    columns={'tenure': 'Tenure', 'Contract': 'ContractType'}
)

# Harmonise category labels with the spelling used by the Kaggle dataset.
# 'No internet service' is folded into 'No' so TechSupport stays binary.
label_fixes = {
    'TechSupport': {'No internet service': 'No'},
    'InternetService': {'Fiber optic': 'Fiber Optic'},
    'ContractType': {
        'Month-to-month': 'Month-to-Month',
        'Two year': 'Two-Year',
        'One year': 'One-Year',
    },
}
for column, mapping in label_fixes.items():
    df_IBM[column] = df_IBM[column].replace(mapping)

# One-hot encode the categorical attributes.
# Multi-level attributes keep every level (drop_first=False) ...
df_IBM = pd.get_dummies(
    df_IBM,
    columns=['ContractType', 'InternetService'],
    drop_first=False,
    dtype=int,
)
# ... while the remaining attributes drop their first level, leaving a single
# indicator column each (the later cells rely on 'Churn_Yes').
df_IBM = pd.get_dummies(
    df_IBM,
    columns=['TechSupport', 'Churn'],
    drop_first=True,
    dtype=int,
)



### Dataset Kaggle

In [2]:
import pandas as pd

# Load the Kaggle customer-churn dataset.
df = pd.read_csv('data/customer_churn_data.csv')

# Attributes kept for modelling (same names as the renamed IBM columns).
selected_columns = [
    'Tenure',
    'MonthlyCharges',
    'ContractType',
    'InternetService',
    'TechSupport',
    'Churn'
]
# Keep only the selected attributes. The explicit .copy() makes this an
# independent frame, so the column assignment below does not write into a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
df = df[selected_columns].copy()

# Replace missing InternetService values with the 'No' category.
# NOTE(review): presumably NaN means "no internet service" in this dataset -
# confirm against the raw file.
df['InternetService'] = df['InternetService'].fillna('No')

# One-hot encode the categorical attributes: multi-level attributes keep every
# level, the remaining ones drop their first level (one indicator column each).
df = pd.get_dummies(df, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
df = pd.get_dummies(df, columns=["TechSupport", "Churn"], drop_first=True, dtype=int)

## Normalização

### Dataset da IBM

In [3]:
from sklearn.preprocessing import StandardScaler

# Numeric attributes to standardise.
columns_to_normalize = ['Tenure', 'MonthlyCharges']

# Fit the scaler on the IBM data. The fitted `scaler` object is reused by the
# Kaggle normalisation cell, which only calls transform() on it.
scaler = StandardScaler().fit(df_IBM[columns_to_normalize])

# Overwrite the numeric columns with their standardised values.
# NOTE: this mutates df_IBM, so re-running this cell would refit the scaler on
# data that has already been scaled.
df_IBM[columns_to_normalize] = scaler.transform(df_IBM[columns_to_normalize])

# Split into target vector and feature matrix.
target_column = 'Churn_Yes'
y_treino = df_IBM[target_column]
X_treino = df_IBM.drop(columns=[target_column])

### Dataset Kaggle

In [4]:
from sklearn.preprocessing import StandardScaler

# Numeric attributes to standardise.
columns_to_normalize = ['Tenure', 'MonthlyCharges']

# Apply the scaler that was fitted on the IBM data (transform only, no refit),
# so both datasets share the same scaling parameters.
# NOTE: this mutates df; re-running this cell would scale the columns twice.
df[columns_to_normalize] = scaler.transform(df[columns_to_normalize])

# Split into feature matrix and target vector.
X_test = df.drop(columns=['Churn_Yes'])
y_test = df['Churn_Yes']

# X_treino was already extracted from df_IBM in the IBM normalisation cell, so
# reordering df_IBM here cannot align the matrices used for training and
# testing. Check that both feature sets match, then order the Kaggle features
# exactly like the IBM features.
mismatched_columns = set(X_treino.columns) ^ set(X_test.columns)
if mismatched_columns:
    raise ValueError(
        f"Feature columns differ between datasets: {sorted(mismatched_columns)}"
    )
X_test = X_test[X_treino.columns]

# Kept from the original cell: give df_IBM the same column order as df.
df_IBM = df_IBM[df.columns]

## Balanceamento

In [5]:
from imblearn.under_sampling import RandomUnderSampler
import numpy as np

# Random undersampling with a fixed seed, applied independently to each dataset.
# NOTE(review): the evaluation split (X_test / y_test) is rebalanced as well,
# so reported metrics refer to a balanced test set - confirm this is intended.
undersampler = RandomUnderSampler(random_state=42, sampling_strategy='auto')
(X_res_treino, y_res_treino), (X_res_test, y_res_test) = (
    undersampler.fit_resample(features, labels)
    for features, labels in ((X_treino, y_treino), (X_test, y_test))
)

# Treino e Avaliação — Treino: IBM / Teste: Kaggle

In [6]:
from scipy.integrate import trapezoid
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, recall_score, roc_curve, precision_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# treino IBM -> test Kaggle


# Classifiers compared in this experiment (seeded where the estimator takes one).
classifiers = {
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=3, max_depth=2, learning_rate=1, objective='binary:logistic'),
    "DT": DecisionTreeClassifier(random_state=42),
    "LR": LogisticRegression(random_state=42, max_iter=1000),
    "NB": GaussianNB()
}

# Metrics per classifier, keyed by the short model name.
results = {}

for name, model in classifiers.items():
    # Fit on the balanced training split.
    model.fit(X_res_treino, y_res_treino)

    # Score the balanced test split: hard labels for the threshold metrics,
    # positive-class probabilities (column 1) for the ROC AUC.
    predictions = model.predict(X_res_test)
    predictions_prob = model.predict_proba(X_res_test)[:, 1]

    # The original cell also integrated roc_curve() with trapezoid(); that value
    # was never stored and duplicates roc_auc_score, so it is no longer computed.
    results[name] = {
        "Precision": precision_score(y_res_test, predictions),
        "Accuracy": accuracy_score(y_res_test, predictions),
        "Recall": recall_score(y_res_test, predictions),
        # NOTE: F1 is support-weighted over both classes, whereas precision and
        # recall above use the default binary averaging (positive class only).
        "F1 Score": f1_score(y_res_test, predictions, average='weighted'),
        "AUC": roc_auc_score(y_res_test, predictions_prob)
    }

# One row per classifier, metrics rounded to three decimals.
results_df = pd.DataFrame(results).T.round(3)
print(results_df)

         Precision  Accuracy  Recall  F1 Score    AUC
RF           0.833     0.620   0.299     0.576  0.705
SVM          1.000     0.748   0.496     0.731  0.853
XGBoost      1.000     0.679   0.359     0.643  0.789
DT           0.459     0.470   0.333     0.460  0.470
LR           0.890     0.744   0.556     0.734  0.834
NB           1.000     0.748   0.496     0.731  0.830


# Treino e Avaliação — Treino: Kaggle / Teste: IBM

In [7]:
from scipy.integrate import trapezoid
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, recall_score, roc_curve, precision_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


# Train on Kaggle -> test on IBM.
# The roles are assigned to dedicated names instead of swapping the balanced
# splits in place: an in-place swap flips back every time the cell is re-run,
# which would silently evaluate the opposite direction under this heading.
X_fit_kaggle, y_fit_kaggle = X_res_test, y_res_test        # balanced Kaggle split
X_eval_ibm, y_eval_ibm = X_res_treino, y_res_treino        # balanced IBM split


# Classifiers compared in this experiment (seeded where the estimator takes one).
classifiers = {
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=3, max_depth=2, learning_rate=1, objective='binary:logistic'),
    "DT": DecisionTreeClassifier(random_state=42),
    "LR": LogisticRegression(random_state=42, max_iter=1000),
    "NB": GaussianNB()
}

# Metrics per classifier, keyed by the short model name.
results = {}

for name, model in classifiers.items():
    # Fit on the balanced Kaggle split.
    model.fit(X_fit_kaggle, y_fit_kaggle)

    # Score the balanced IBM split: hard labels for the threshold metrics,
    # positive-class probabilities (column 1) for the ROC AUC.
    predictions = model.predict(X_eval_ibm)
    predictions_prob = model.predict_proba(X_eval_ibm)[:, 1]

    # The original cell also integrated roc_curve() with trapezoid(); that value
    # was never stored and duplicates roc_auc_score, so it is no longer computed.
    results[name] = {
        "Precision": precision_score(y_eval_ibm, predictions),
        "Accuracy": accuracy_score(y_eval_ibm, predictions),
        "Recall": recall_score(y_eval_ibm, predictions),
        # NOTE: F1 is support-weighted over both classes, whereas precision and
        # recall above use the default binary averaging (positive class only).
        "F1 Score": f1_score(y_eval_ibm, predictions, average='weighted'),
        "AUC": roc_auc_score(y_eval_ibm, predictions_prob)
    }

# One row per classifier, metrics rounded to three decimals.
results_df = pd.DataFrame(results).T.round(3)
print(results_df)

         Precision  Accuracy  Recall  F1 Score    AUC
RF           0.544     0.579   0.973     0.501  0.726
SVM          0.560     0.601   0.944     0.547  0.770
XGBoost      0.558     0.598   0.945     0.543  0.732
DT           0.544     0.578   0.973     0.500  0.578
LR           0.616     0.673   0.922     0.652  0.796
NB           0.560     0.601   0.944     0.547  0.604
