# IBM

In [5]:
import pandas as pd
# Lendo o arquivo XLS
df_IBM = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.xls')

selected_columns = [
    'tenure', 
    'MonthlyCharges', 
    'Contract', 
    'InternetService', 
    'TechSupport', 
    'Churn'
]
# Filtrar atributos do dataset
df_IBM = df_IBM[selected_columns]

# Nomear os atributos de maneira a serem compativeis entre datasets
df_IBM = df_IBM.rename(columns={'tenure': 'Tenure'})
df_IBM = df_IBM.rename(columns={'Contract': 'ContractType'})

df_IBM['TechSupport'] = df_IBM['TechSupport'].replace('No internet service', 'No')
df_IBM['InternetService'] = df_IBM['InternetService'].replace('Fiber optic', 'Fiber Optic')
df_IBM['ContractType'] = df_IBM['ContractType'].replace('Month-to-month', 'Month-to-Month')
df_IBM['ContractType'] = df_IBM['ContractType'].replace('Two year', 'Two-Year')
df_IBM['ContractType'] = df_IBM['ContractType'].replace('One year', 'One-Year')


# Tornar em binário os valores categoricos através do one-hot encoding
df_IBM = pd.get_dummies(df_IBM, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
df_IBM = pd.get_dummies(df_IBM, columns=['TechSupport', 'Churn'], drop_first=True, dtype=int)

from sklearn.preprocessing import StandardScaler

columns_to_normalize = ['Tenure', 'MonthlyCharges']
scaler = StandardScaler()

# Normalizar colunas numéricas
df_IBM[columns_to_normalize] = scaler.fit_transform(df_IBM[columns_to_normalize])

# Preparação dos dados no formato de treino
X_ibm = df_IBM.drop(columns=['Churn_Yes'])
y_ibm = df_IBM['Churn_Yes']





# Kaggle

In [6]:
import pandas as pd

df = pd.read_csv('data/customer_churn_data.csv')

selected_columns = [
    'Tenure', 
    'MonthlyCharges', 
    'ContractType', 
    'InternetService', 
    'TechSupport', 
    'Churn'
]
# Filtrar atributos do dataset
df = df[selected_columns]

# Substituir os valores NaN por "Nenhum"
df['InternetService'] = df['InternetService'].fillna('No')
# Tornar em binário os valores categoricos através do one-hot encoding
df = pd.get_dummies(df, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
df = pd.get_dummies(df, columns=["TechSupport", "Churn"], drop_first=True, dtype=int)

from sklearn.preprocessing import StandardScaler

columns_to_normalize = [ 'Tenure', 'MonthlyCharges']

# Normalizar colunas numéricas
df[columns_to_normalize] = scaler.transform(df[columns_to_normalize])

# Preparação dos dados no formato de treino
X_test = df.drop(columns=['Churn_Yes'])
y_test = df['Churn_Yes']

# Reordenar o segundo dataset para ter a mesma ordem do primeiro
df_IBM = df_IBM[df.columns]
from sklearn.preprocessing import StandardScaler

columns_to_normalize = [ 'Tenure', 'MonthlyCharges']

# Normalizar colunas numéricas
df[columns_to_normalize] = scaler.transform(df[columns_to_normalize])

# Preparação dos dados no formato de treino
X_kaggle = df.drop(columns=['Churn_Yes'])
y_kaggle = df['Churn_Yes']

# Reordenar o segundo dataset para ter a mesma ordem do primeiro
df_IBM = df_IBM[df.columns]

# Treino e Avaliação

In [7]:
from sklearn.model_selection import KFold, cross_validate
import numpy as np

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

#X=X_ibm
#y=y_ibm

#X=X_kaggle
#y=y_kaggle

X = pd.concat([X_ibm, X_kaggle], axis=0, ignore_index=True)
y = pd.concat([y_ibm, y_kaggle], axis=0, ignore_index=True)

# Configurar a validação cruzada
cv = KFold(n_splits=10, shuffle=True, random_state=42)
# Modelos de Classificação
classifiers = {
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(kernel='linear', random_state=42, probability=True),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=200, max_depth=10, learning_rate=1, objective='binary:logistic'),
    "DT": DecisionTreeClassifier(random_state=42),
    "LR": LogisticRegression(random_state=42, max_iter=1000),
    "NB": GaussianNB()
}

# Resultados
results = {}

# Avaliar Classificação com validação cruzada
for name, model in classifiers.items():
    cv_results = cross_validate(model, X, y, cv=cv,
                                scoring=['accuracy', 'precision', 'f1_weighted', 'roc_auc', 'recall'],
                                return_train_score=False)
    
    results[name] = {
        "Precision Mean": np.mean(cv_results['test_precision']),
        "Precision Std": np.std(cv_results['test_precision']),
        "Accuracy Mean": np.mean(cv_results['test_accuracy']),
        "Accuracy Std": np.std(cv_results['test_accuracy']),
        "Recall Mean": np.mean(cv_results['test_recall']),
        "Recall Std": np.std(cv_results['test_recall']),
        "F1 Score Mean": np.mean(cv_results['test_f1_weighted']),
        "F1 Score Std": np.std(cv_results['test_f1_weighted']),
        "AUC Mean": np.mean(cv_results['test_roc_auc']),
        "AUC Std": np.std(cv_results['test_roc_auc']),
        
    }

# Converter os resultados num DataFrame
results_df = pd.DataFrame(results).T  # Transpor para ter os modelos como linhas
results_df = results_df.round(4)

In [8]:
# Exibir tabela de resultados
print("\nResultados:")
print(results_df.round(4))


Resultados:
         Precision Mean  Precision Std  Accuracy Mean  Accuracy Std  \
RF               0.7212         0.0297         0.7941        0.0109   
SVM              0.6921         0.0211         0.7664        0.0189   
XGBoost          0.6897         0.0280         0.7828        0.0139   
DT               0.6584         0.0286         0.7658        0.0147   
LR               0.7068         0.0238         0.7815        0.0182   
NB               0.5305         0.0270         0.6885        0.0171   

         Recall Mean  Recall Std  F1 Score Mean  F1 Score Std  AUC Mean  \
RF            0.6503      0.0227         0.7914        0.0114    0.8591   
SVM           0.5721      0.0395         0.7602        0.0208    0.8414   
XGBoost       0.6665      0.0231         0.7819        0.0141    0.8508   
DT            0.6572      0.0349         0.7657        0.0154    0.7479   
LR            0.6190      0.0341         0.7777        0.0191    0.8399   
NB            0.7715      0.0311       