# Setup

In [1]:
!pip install lime



In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Data preparation

In [3]:
# Column groups used by the notebook. The "commonN" tags are carried over from the
# original annotations (presumably numbering the attributes shared by both datasets).
# Candidates currently switched off: Age (common7), TotalCharges (common4),
# Tenure (common5) and Gender (common6).
numeric_features = ["MonthlyCharges"]  # common1

categorical_features = [
    "ContractType",     # common2
    "InternetService",  # common3
]

binary_features = [
    "TechSupport",  # common8
    "Churn",        # GOAL
]

# Every selected column, in numeric -> categorical -> binary order.
features = [*numeric_features, *categorical_features, *binary_features]

In [4]:
# ---------------------------------------------------------------------------
# Dataset 1: IBM Telco customer churn
# ---------------------------------------------------------------------------
df_IBM = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.xls')
selected_columns = [
    'MonthlyCharges',
    'Contract',
    'InternetService',
    'TechSupport',
    'Churn'
]
# Keep only the selected attributes and rename 'Contract' so the column name
# matches the second dataset.
df_IBM = df_IBM[selected_columns].rename(columns={'Contract': 'ContractType'})
# Harmonise the category labels (presumably to the spelling used in the second
# dataset, since the one-hot column names must line up for the reorder below).
df_IBM['TechSupport'] = df_IBM['TechSupport'].replace('No internet service', 'No')
df_IBM['InternetService'] = df_IBM['InternetService'].replace('Fiber optic', 'Fiber Optic')
df_IBM['ContractType'] = df_IBM['ContractType'].replace({
    'Month-to-month': 'Month-to-Month',
    'Two year': 'Two-Year',
    'One year': 'One-Year',
})
# One-hot encode the multi-class columns (all levels kept) ...
df_IBM = pd.get_dummies(df_IBM, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
# ... and collapse the two-level columns to a single 0/1 indicator each.
df_IBM = pd.get_dummies(df_IBM, columns=['TechSupport', 'Churn'], drop_first=True, dtype=int)


# ---------------------------------------------------------------------------
# Dataset 2: customer_churn_data.csv
# ---------------------------------------------------------------------------
df = pd.read_csv('data/customer_churn_data.csv')
selected_columns = [
    'MonthlyCharges',
    'ContractType',
    'InternetService',
    'TechSupport',
    'Churn'
]
# Keep only the selected attributes. The explicit .copy() makes `df` an
# independent frame, so the column assignment below does not raise
# SettingWithCopyWarning on a column-subset selection.
df = df[selected_columns].copy()
# Missing InternetService values are treated as the 'No' category.
df['InternetService'] = df['InternetService'].fillna('No')
# Same one-hot encoding scheme as for the IBM dataset.
df = pd.get_dummies(df, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
df = pd.get_dummies(df, columns=["TechSupport", "Churn"], drop_first=True, dtype=int)


# Reorder the IBM columns so they follow the column order of the second dataset
# (raises KeyError if the IBM frame lacks any of those columns).
df_IBM = df_IBM[df.columns]
# Features / target for the IBM dataset alone.
X_IBM = df_IBM.drop(columns=['Churn_Yes'])
y_IBM = df_IBM['Churn_Yes']
# Features / target for the second dataset alone.
X_kaggle = df.drop(columns=['Churn_Yes'])
y_kaggle = df['Churn_Yes']
# Combined dataset: IBM rows first, then the second dataset, with a fresh 0..n-1 index.
df = pd.concat([df_IBM, df], axis=0, ignore_index=True)
# Class balance of the combined target, in percent. Printed explicitly because a
# bare expression in the middle of a cell is not displayed by the notebook.
print(df['Churn_Yes'].value_counts(normalize=True) * 100)
X = df.drop(columns=['Churn_Yes'])
y = df['Churn_Yes']

## Cross-validation

In [None]:
from sklearn.model_selection import LeaveOneOut
from tqdm import tqdm

from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.experimental import enable_halving_search_cv  # se necessário
from sklearn.model_selection import TunedThresholdClassifierCV

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# `make_scorer` is needed for the threshold tuning further down but is not imported
# anywhere else in the notebook; without it the cell fails with NameError only after
# the whole leave-one-out loop has finished.
from sklearn.metrics import make_scorer

# Leave-one-out: one fold per sample, each fold tests on exactly one row.
cv = LeaveOneOut()

metrics = ['accuracy', 'f1_weighted', 'precision', 'recall', 'roc_auc']
scores = {metric: [] for metric in metrics}


# Soft-voting ensemble: averages the predicted class probabilities of its members.
ensemble_model = VotingClassifier(
    estimators=[
        ('NB', GaussianNB()),
        ('DT', DecisionTreeClassifier(random_state=42)),
        ('XGB', XGBClassifier(random_state=42))
    ],
    voting='soft'  # use the predicted probabilities
)


# Models to evaluate; un-comment entries to include further classifiers.
classifiers = {
    #"NB": GaussianNB(),
    #"LR": LogisticRegression(random_state=42),
    #"SVM": SVC(random_state=42, probability=True),
    #"DT": DecisionTreeClassifier(random_state=42),
    #"RF": RandomForestClassifier(random_state=42),
    #"XGBoost": XGBClassifier(random_state=42)
    "Ensemble_NB_DT_XGB": ensemble_model
}


results = {}
for name, model in classifiers.items():
    scores = {metric: [] for metric in metrics}
    # Out-of-fold outputs, one entry per left-out sample.
    y_pred = []
    y_prob = []
    y_true = []
    for train_index, test_index in tqdm(cv.split(X), total=len(X), desc=f"LOO - {name}"):
        # cv.split yields positional indices, so index with .iloc; the copies keep the
        # scaling assignments below from touching X itself.
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training fold only, then apply it to the held-out row.
        scaler = StandardScaler()
        X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
        X_test[numeric_features] = scaler.transform(X_test[numeric_features])

        # Class balancing is applied to the training fold only.
        #resampler = RandomUnderSampler(random_state=42)
        #resampler = SMOTEENN(random_state=42)
        resampler = SMOTETomek(random_state=42)

        X_train_balanced, y_train_balanced = resampler.fit_resample(X_train, y_train)

        model.fit(X_train_balanced, y_train_balanced)

        # The test fold holds a single row, hence the [0] indexing.
        y_true.append(y_test.values[0])
        y_pred.append(model.predict(X_test)[0])
        y_prob.append(model.predict_proba(X_test)[0, 1])

    # Metrics are computed once over all pooled out-of-fold predictions, so each list
    # holds a single value and the "Mean" entries below equal that value.
    scores['accuracy'].append(accuracy_score(y_true, y_pred))
    # NOTE(review): the key says "f1_weighted" but f1_score is called with its default
    # averaging (binary, positive class) - confirm which one is intended.
    scores['f1_weighted'].append(f1_score(y_true, y_pred))
    scores['precision'].append(precision_score(y_true, y_pred))
    scores['recall'].append(recall_score(y_true, y_pred))
    scores['roc_auc'].append(roc_auc_score(y_true, y_prob))

    results[name] = {
        "Accuracy Mean": np.mean(scores['accuracy']),
        #"Accuracy Std": np.std(scores['accuracy']),
        "F1 Score Mean": np.mean(scores['f1_weighted']),
        #"F1 Score Std": np.std(scores['f1_weighted']),
        "Precision Mean": np.mean(scores['precision']),
        #"Precision Std": np.std(scores['precision']),
        "Recall Mean": np.mean(scores['recall']),
        #"Recall Std": np.std(scores['recall']),
        "AUC Mean": np.mean(scores['roc_auc']),
        #"AUC Std": np.std(scores['roc_auc']),
    }

    # --- Decision-threshold tuning on the out-of-fold probabilities ---------------
    # The out-of-fold probability is used as the single input feature (n_samples x 1).
    oof_prob = np.array(y_prob).reshape(-1, 1)

    # NOTE(review): this standalone fit looks redundant - TunedThresholdClassifierCV
    # appears to fit its own copy of the estimator unless cv="prefit" (confirm) - but
    # it is kept so `dummy_model` is still a fitted object after the cell runs.
    dummy_model = LogisticRegression()
    dummy_model.fit(oof_prob, y_true)

    final_model = TunedThresholdClassifierCV(
        estimator=dummy_model,
        scoring=make_scorer(f1_score, pos_label=1),
        cv=StratifiedKFold(n_splits=5)
    )
    final_model.fit(oof_prob, y_true)
    final_preds = final_model.predict(oof_prob)

    # Final evaluation.
    # NOTE: final_preds are produced for the same samples the threshold was tuned on,
    # so the adjusted F1 / precision / recall below are not held-out estimates.
    print(f"\n==== Resultados ajustados para: {name} ====")
    print("F1 ajustado:", f1_score(y_true, final_preds))
    print("Precision:", precision_score(y_true, final_preds))
    print("Recall:", recall_score(y_true, final_preds))
    print("ROC AUC:", roc_auc_score(y_true, y_prob))





# One row per classifier, metrics rounded to four decimals; displayed as the cell output.
results_df = pd.DataFrame(results).T
results_df = results_df.round(4)
results_df




LOO - Ensemble_NB_DT_XGB:   1%|▌                                                     | 90/8043 [00:22<33:45,  3.93it/s]