# Carregamento e Pré-processamento

## One-hot encoding

### Dataset da IBM

In [42]:
import pandas as pd
# Lendo o arquivo XLS
df_IBM = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.xls')

selected_columns = [
    'tenure', 
    'MonthlyCharges', 
    'Contract', 
    'InternetService', 
    'TechSupport', 
    'Churn'
]
# Filtrar atributos do dataset
df_IBM = df_IBM[selected_columns]

# Nomear os atributos de maneira a serem compativeis entre datasets
df_IBM = df_IBM.rename(columns={'tenure': 'Tenure'})
df_IBM = df_IBM.rename(columns={'Contract': 'ContractType'})

df_IBM['TechSupport'] = df_IBM['TechSupport'].replace('No internet service', 'No')
df_IBM['InternetService'] = df_IBM['InternetService'].replace('Fiber optic', 'Fiber Optic')
df_IBM['ContractType'] = df_IBM['ContractType'].replace('Month-to-month', 'Month-to-Month')
df_IBM['ContractType'] = df_IBM['ContractType'].replace('Two year', 'Two-Year')
df_IBM['ContractType'] = df_IBM['ContractType'].replace('One year', 'One-Year')


# Tornar em binário os valores categoricos através do one-hot encoding
df_IBM = pd.get_dummies(df_IBM, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
df_IBM = pd.get_dummies(df_IBM, columns=['TechSupport', 'Churn'], drop_first=True, dtype=int)



### Dataset Kaggle

In [43]:
import pandas as pd

df = pd.read_csv('data/customer_churn_data.csv')

selected_columns = [
    'Tenure', 
    'MonthlyCharges', 
    'ContractType', 
    'InternetService', 
    'TechSupport', 
    'Churn'
]
# Filtrar atributos do dataset
df = df[selected_columns]

# Substituir os valores NaN por "Nenhum"
df['InternetService'] = df['InternetService'].fillna('No')
# Tornar em binário os valores categoricos através do one-hot encoding
df = pd.get_dummies(df, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
df = pd.get_dummies(df, columns=["TechSupport", "Churn"], drop_first=True, dtype=int)

## Normalização

### Dataset da IBM

In [44]:
from sklearn.preprocessing import StandardScaler

columns_to_normalize = ['Tenure', 'MonthlyCharges']
scaler = StandardScaler()

# Normalizar colunas numéricas
df_IBM[columns_to_normalize] = scaler.fit_transform(df_IBM[columns_to_normalize])

# Preparação dos dados no formato de treino
X_IBM = df_IBM.drop(columns=['Churn_Yes'])
y_IBM = df_IBM['Churn_Yes']

### Dataset Kaggle

In [45]:
from sklearn.preprocessing import StandardScaler
'''
columns_to_normalize = [ 'Tenure', 'MonthlyCharges']

# Normalizar colunas numéricas
df[columns_to_normalize] = scaler.transform(df[columns_to_normalize])
'''
# Preparação dos dados no formato de treino
X_kaggle = df.drop(columns=['Churn_Yes'])
y_kaggle = df['Churn_Yes']

# Reordenar o segundo dataset para ter a mesma ordem do primeiro
df_IBM = df_IBM[df.columns]

## Balanceamento

In [46]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler

'''
##Smote + Tomek
smote = SMOTE(sampling_strategy='auto', random_state=42)
tomek = TomekLinks(sampling_strategy='majority')

X_res_treino, y_res_treino = smote.fit_resample(X_treino, y_treino)
X_res_treino, y_res_treino = tomek.fit_resample(X_res_treino, y_res_treino)

X_res_test, y_res_test = smote.fit_resample(X_test, y_test)
X_res_test, y_res_test = tomek.fit_resample(X_res_test, y_res_test)'''

"\n##Smote + Tomek\nsmote = SMOTE(sampling_strategy='auto', random_state=42)\ntomek = TomekLinks(sampling_strategy='majority')\n\nX_res_treino, y_res_treino = smote.fit_resample(X_treino, y_treino)\nX_res_treino, y_res_treino = tomek.fit_resample(X_res_treino, y_res_treino)\n\nX_res_test, y_res_test = smote.fit_resample(X_test, y_test)\nX_res_test, y_res_test = tomek.fit_resample(X_res_test, y_res_test)"

# Treino e Avaliação Treino: IBM / Teste: Kaggle (OVERSAMPLING)

In [47]:
from scipy.integrate import trapezoid
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, recall_score, roc_curve, precision_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# treino IBM -> test Kaggle

X_treino = X_IBM.copy()
y_treino = y_IBM.copy()

X_test = X_kaggle.copy()
y_test = y_kaggle.copy()

#normalização
columns_to_normalize = ['Tenure', 'MonthlyCharges']
scaler = StandardScaler()
X_treino[columns_to_normalize] = scaler.fit_transform(X_treino[columns_to_normalize])
X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

#Balanceamento 
smote = SMOTE(sampling_strategy='auto', random_state=42)
#tomek = TomekLinks(sampling_strategy='majority')

X_res_treino, y_res_treino = smote.fit_resample(X_treino, y_treino)
#X_res_treino, y_res_treino = tomek.fit_resample(X_res_treino, y_res_treino)

y_res_test = y_test
X_res_test = X_test

#X_res_test, y_res_test = smote.fit_resample(X_test, y_test)
#X_res_test, y_res_test = tomek.fit_resample(X_res_test, y_res_test)

print(y_res_treino.value_counts())
print(y_res_test.value_counts())


# Modelos de Classificação
classifiers = {
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=3, max_depth=2, learning_rate=1, objective='binary:logistic'),
    "DT": DecisionTreeClassifier(random_state=42),
    "LR": LogisticRegression(random_state=42, max_iter=1000),
    "NB": GaussianNB()
}

# Resultados
results = {}

# Avaliar Classificação
for name, model in classifiers.items():
    # Treinar com o primeiro dataset
    model.fit(X_res_treino, y_res_treino)
    
    # Avaliar com o segundo dataset
    predictions = model.predict(X_res_test)
    print(set(predictions))
    predictions_prob = model.predict_proba(X_res_test)[:, 1]  # Probabilidades para calcular AUC

    # Calcular as métricas
    f1 = f1_score(y_res_test, predictions, average='weighted')
    auc = roc_auc_score(y_res_test, predictions_prob)
    accuracy = accuracy_score(y_res_test, predictions)
    recall = recall_score(y_res_test, predictions)
    precision = precision_score(y_res_test, predictions)  #?, zero_division=1

     # Calcular a curva ROC
    fpr, tpr, thresholds = roc_curve(y_res_test, predictions_prob)  # FPR e TPR para calcular a curva ROC
    roc = trapezoid(tpr, fpr)  # Calculando a área sob a curva ROC (AUC)
    
    # Armazenar os resultados
    results[name] = {
        "Precision": precision,
        "Accuracy": accuracy,
        "Recall": recall,
        "F1 Score": f1,
        "AUC": auc
    }
    
results_df = pd.DataFrame(results).T
results_df = results_df.round(3)
print(results_df)

Churn_Yes
0    5174
1    5174
Name: count, dtype: int64
Churn_Yes
1    883
0    117
Name: count, dtype: int64
{0, 1}
{0, 1}
{0, 1}
{0, 1}
{0, 1}
{0}
         Precision  Accuracy  Recall  F1 Score    AUC
RF           0.638     0.149   0.084     0.148  0.377
SVM          1.000     0.470   0.400     0.540  0.525
XGBoost      1.000     0.316   0.225     0.355  0.761
DT           0.664     0.158   0.094     0.163  0.368
LR           0.927     0.760   0.790     0.793  0.766
NB           0.000     0.117   0.000     0.025  0.513


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Treino e Avaliação Treino: IBM / Teste: Kaggle (UNDERSAMPLING)

In [48]:
from scipy.integrate import trapezoid
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, recall_score, roc_curve, precision_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# treino IBM -> test Kaggle

X_treino = X_IBM.copy()
y_treino = y_IBM.copy()

X_test = X_kaggle.copy()
y_test = y_kaggle.copy()

#normalização
columns_to_normalize = ['Tenure', 'MonthlyCharges']
scaler = StandardScaler()
X_treino[columns_to_normalize] = scaler.fit_transform(X_treino[columns_to_normalize])
X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

#Balanceamento undersampling
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

X_res_treino, y_res_treino = undersampler.fit_resample(X_treino, y_treino)

y_res_test = y_test
X_res_test = X_test
#X_res_test, y_res_test = smote.fit_resample(X_test, y_test)
#X_res_test, y_res_test = tomek.fit_resample(X_res_test, y_res_test)

print(y_res_treino.value_counts())
print(y_res_test.value_counts())


# Modelos de Classificação
classifiers = {
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=3, max_depth=2, learning_rate=1, objective='binary:logistic'),
    "DT": DecisionTreeClassifier(random_state=42),
    "LR": LogisticRegression(random_state=42, max_iter=1000),
    "NB": GaussianNB()
}

# Resultados
results = {}

# Avaliar Classificação
for name, model in classifiers.items():
    # Treinar com o primeiro dataset
    model.fit(X_res_treino, y_res_treino)
    
    # Avaliar com o segundo dataset
    predictions = model.predict(X_res_test)
    #print(set(predictions))
    #print("Previsões: ", predictions)
    print("Contagem das previsões da classe 1: ", sum(predictions == 1))
    predictions_prob = model.predict_proba(X_res_test)[:, 1]  # Probabilidades para calcular AUC

    # Calcular as métricas
    f1 = f1_score(y_res_test, predictions, average='weighted')
    auc = roc_auc_score(y_res_test, predictions_prob)
    accuracy = accuracy_score(y_res_test, predictions)
    recall = recall_score(y_res_test, predictions)
    precision = precision_score(y_res_test, predictions, zero_division=0)  #?

     # Calcular a curva ROC
    fpr, tpr, thresholds = roc_curve(y_res_test, predictions_prob)  # FPR e TPR para calcular a curva ROC
    roc = trapezoid(tpr, fpr)  # Calculando a área sob a curva ROC (AUC)
    
    # Armazenar os resultados
    results[name] = {
        "Precision": precision,
        "Accuracy": accuracy,
        "Recall": recall,
        "F1 Score": f1,
        "AUC": auc
    }
    
results_df = pd.DataFrame(results).T
results_df = results_df.round(3)
print(results_df)

Churn_Yes
0    1869
1    1869
Name: count, dtype: int64
Churn_Yes
1    883
0    117
Name: count, dtype: int64
Contagem das previsões da classe 1:  101
Contagem das previsões da classe 1:  353
Contagem das previsões da classe 1:  199
Contagem das previsões da classe 1:  272
Contagem das previsões da classe 1:  719
Contagem das previsões da classe 1:  0
         Precision  Accuracy  Recall  F1 Score    AUC
RF           0.584     0.134   0.067     0.123  0.386
SVM          1.000     0.470   0.400     0.540  0.732
XGBoost      1.000     0.316   0.225     0.355  0.655
DT           0.846     0.305   0.260     0.372  0.451
LR           0.929     0.734   0.757     0.775  0.769
NB           0.000     0.117   0.000     0.025  0.516


# Treino e Avaliação Treino: Kaggle / Teste: IBM (OVERSAMPLING)

In [55]:
from scipy.integrate import trapezoid
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, recall_score, roc_curve, precision_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


# treino Kaggle -> test IBM

X_treino = X_kaggle.copy()
y_treino = y_kaggle.copy()

X_test = X_IBM.copy()
y_test = y_IBM.copy()

#redução
X_treino = X_treino.sample(n=700, random_state=42)
y_treino = y_treino[X_treino.index]

X_test = X_test.sample(n=700, random_state=42)
y_test = y_test[X_test.index]

#normalização
columns_to_normalize = ['Tenure', 'MonthlyCharges']
scaler = StandardScaler()
X_treino[columns_to_normalize] = scaler.fit_transform(X_treino[columns_to_normalize])
X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

#Balanceamento 
smote = SMOTE(sampling_strategy='auto', random_state=42)
tomek = TomekLinks(sampling_strategy='majority')

X_res_treino, y_res_treino = smote.fit_resample(X_treino, y_treino)
#X_res_treino, y_res_treino = tomek.fit_resample(X_res_treino, y_res_treino)

X_res_test = X_test
y_res_test = y_test

#X_res_test, y_res_test = smote.fit_resample(X_test, y_test)
#X_res_test, y_res_test = tomek.fit_resample(X_res_test, y_res_test)

print(y_res_treino.value_counts())
print(y_res_test.value_counts())

# Modelos de Classificação
classifiers = {
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=3, max_depth=2, learning_rate=1, objective='binary:logistic'),
    "DT": DecisionTreeClassifier(random_state=42),
    "LR": LogisticRegression(random_state=42, max_iter=1000),
    "NB": GaussianNB()
}

# Resultados
results = {}

# Avaliar Classificação
for name, model in classifiers.items():
    # Treinar com o primeiro dataset
    model.fit(X_res_treino, y_res_treino)
    
    # Avaliar com o segundo dataset
    predictions = model.predict(X_res_test)
    predictions_prob = model.predict_proba(X_res_test)[:, 1]  # Probabilidades para calcular AUC

    # Calcular as métricas
    f1 = f1_score(y_res_test, predictions, average='weighted')
    auc = roc_auc_score(y_res_test, predictions_prob)
    accuracy = accuracy_score(y_res_test, predictions)
    recall = recall_score(y_res_test, predictions)
    precision = precision_score(y_res_test, predictions)

     # Calcular a curva ROC
    fpr, tpr, thresholds = roc_curve(y_res_test, predictions_prob)
    roc = trapezoid(tpr, fpr)
    
    # Armazenar os resultados
    results[name] = {
        "Precision": precision,
        "Accuracy": accuracy,
        "Recall": recall,
        "F1 Score": f1,
        "AUC": auc
    }
results_df = pd.DataFrame(results).T
results_df = results_df.round(3)
print(results_df)

Churn_Yes
1    610
0    610
Name: count, dtype: int64
Churn_Yes
0    508
1    192
Name: count, dtype: int64
         Precision  Accuracy  Recall  F1 Score    AUC
RF           0.274     0.274   1.000     0.118  0.560
SVM          0.285     0.326   0.969     0.231  0.677
XGBoost      0.274     0.274   1.000     0.118  0.463
DT           0.274     0.274   1.000     0.118  0.500
LR           0.323     0.439   0.953     0.413  0.752
NB           0.323     0.439   0.953     0.413  0.598


# Treino e Avaliação Treino: Kaggle / Teste: IBM (UNDERSAMPLING)

In [54]:
from scipy.integrate import trapezoid
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, recall_score, roc_curve, precision_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


# treino Kaggle -> test IBM

X_treino = X_kaggle.copy()
y_treino = y_kaggle.copy()

X_test = X_IBM.copy()
y_test = y_IBM.copy()

#Redução
X_treino = X_treino.sample(n=700, random_state=42)
y_treino = y_treino[X_treino.index]

X_test = X_test.sample(n=700, random_state=42)
y_test = y_test[X_test.index]

#normalização
columns_to_normalize = ['Tenure', 'MonthlyCharges']
scaler = StandardScaler()
X_treino[columns_to_normalize] = scaler.fit_transform(X_treino[columns_to_normalize])
X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

#Balanceamento 
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

X_res_treino, y_res_treino = undersampler.fit_resample(X_treino, y_treino)

X_res_test = X_test
y_res_test = y_test

#X_res_test = X_test.sample(n=100, random_state=42)
#y_res_test = y_test[X_res_test.index]


#X_res_test, y_res_test = smote.fit_resample(X_test, y_test)
#X_res_test, y_res_test = tomek.fit_resample(X_res_test, y_res_test)

print(y_res_treino.value_counts())
print(y_res_test.value_counts())

# Modelos de Classificação
classifiers = {
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=3, max_depth=2, learning_rate=1, objective='binary:logistic'),
    "DT": DecisionTreeClassifier(random_state=42),
    "LR": LogisticRegression(random_state=42, max_iter=1000),
    "NB": GaussianNB()
}

# Resultados
results = {}

# Avaliar Classificação
for name, model in classifiers.items():
    # Treinar com o primeiro dataset
    model.fit(X_res_treino, y_res_treino)
    
    # Avaliar com o segundo dataset
    predictions = model.predict(X_res_test)
    predictions_prob = model.predict_proba(X_res_test)[:, 1]  # Probabilidades para calcular AUC

    # Calcular as métricas
    f1 = f1_score(y_res_test, predictions, average='weighted')
    auc = roc_auc_score(y_res_test, predictions_prob)
    accuracy = accuracy_score(y_res_test, predictions)
    recall = recall_score(y_res_test, predictions)
    precision = precision_score(y_res_test, predictions)

     # Calcular a curva ROC
    fpr, tpr, thresholds = roc_curve(y_res_test, predictions_prob)
    roc = trapezoid(tpr, fpr)
    
    # Armazenar os resultados
    results[name] = {
        "Precision": precision,
        "Accuracy": accuracy,
        "Recall": recall,
        "F1 Score": f1,
        "AUC": auc
    }
results_df = pd.DataFrame(results).T
results_df = results_df.round(3)
print(results_df)

Churn_Yes
0    90
1    90
Name: count, dtype: int64
Churn_Yes
0    508
1    192
Name: count, dtype: int64
         Precision  Accuracy  Recall  F1 Score    AUC
RF           0.274     0.274   1.000     0.118  0.664
SVM          0.323     0.439   0.953     0.413  0.711
XGBoost      0.274     0.274   1.000     0.118  0.580
DT           0.274     0.274   1.000     0.118  0.500
LR           0.331     0.466   0.927     0.454  0.715
NB           0.323     0.439   0.953     0.413  0.596
