# Setup

In [111]:
!pip install lime




[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [112]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score
from sklearn.model_selection import KFold, cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Dataset selection

In [113]:
# Which customer-churn dataset the notebook loads and models.
# Supported values: "kaggle", "ibm".
# TODO: Implement mix strategy ("mix").
dataset = "ibm"

# Fail fast on an unsupported choice: the loading cells only handle the values
# listed here, so anything else (including the not-yet-implemented "mix")
# would leave `df` undefined and fail later with a confusing NameError.
if dataset not in ("kaggle", "ibm"):
    raise ValueError(
        f'Unsupported dataset {dataset!r}; expected "kaggle" or "ibm".'
    )

### Data Loading

In [114]:
# Kaggle variant: declare the modelling columns by type, then read only those.
if dataset == "kaggle":
    # Numeric inputs ('TotalCharges' and "Tenure" are currently disabled).
    numeric_features = ["MonthlyCharges", 'Age']
    # Multi-level categorical inputs.
    categorical_features = ['ContractType', 'InternetService']
    # Columns treated as binary, including the "Churn" target
    # ("Gender" is currently disabled).
    binary_features = ["TechSupport", "Churn"]
    features = [*numeric_features, *categorical_features, *binary_features]

    df = pd.read_csv('data/customer_churn_data.csv', usecols=features)

    # Record missing InternetService entries as the explicit category 'No'.
    if 'InternetService' in df.columns:
        df['InternetService'] = df['InternetService'].fillna('No')

In [115]:
# IBM Telco variant: declare the modelling columns by type, then read only those.
if dataset == "ibm":
    # Numeric inputs ('TotalCharges' is currently disabled).
    numeric_features = ['tenure', 'MonthlyCharges']
    # Multi-level categorical inputs.
    categorical_features = ['InternetService', 'Contract', 'PaymentMethod']
    # Columns treated as binary, including the 'Churn' target.
    # Currently disabled: 'gender', 'SeniorCitizen', 'Partner', 'PhoneService',
    # 'OnlineBackup', 'DeviceProtection', 'StreamingTV', 'StreamingMovies',
    # 'MultipleLines'.
    binary_features = [
        'Dependents',
        'OnlineSecurity',
        'TechSupport',
        'PaperlessBilling',
        'Churn',
    ]
    features = [*numeric_features, *categorical_features, *binary_features]

    df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.xls', usecols=features)

    # Only takes effect when 'MultipleLines' is re-enabled above: fold the
    # 'No phone service' level into 'No'.
    if 'MultipleLines' in df.columns:
        df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')

  


    

In [116]:
# Level frequencies of OnlineSecurity as loaded from the file.
df.loc[:, 'OnlineSecurity'].value_counts()

OnlineSecurity
No                     3498
Yes                    2019
No internet service    1526
Name: count, dtype: int64

In [117]:
# Fold the 'No internet service' level into 'No' so these columns become
# two-level (Yes/No). Each column is guarded because the loaded column set
# depends on `dataset` (the kaggle selection has no 'OnlineSecurity').
for column in ('TechSupport', 'OnlineSecurity'):
    if column in df.columns:
        df[column] = df[column].replace('No internet service', 'No')

In [118]:
# Re-check the OnlineSecurity level frequencies after the recoding step.
df.loc[:, 'OnlineSecurity'].value_counts()

OnlineSecurity
No     5024
Yes    2019
Name: count, dtype: int64

## Cross-validation

In [119]:
# One-hot encode in two passes:
#   inner call - categorical columns keep every level as its own 0/1 column;
#   outer call - binary columns drop their first level, leaving a single
#                0/1 indicator per column (e.g. 'Churn_Yes').
df = pd.get_dummies(
    pd.get_dummies(df, columns=categorical_features, drop_first=False, dtype=int),
    columns=binary_features,
    drop_first=True,
    dtype=int,
)

# Target is the churn indicator; every remaining column is a model input.
y = df['Churn_Yes']
X = df.drop(columns=['Churn_Yes'])

In [120]:
# Stratified, shuffled 2-fold split of (X, y) with a fixed seed.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Keys of the per-fold score lists collected for every classifier.
# NOTE: the 'f1_weighted' list is filled by f1_score with its default
# averaging (binary, positive class), not a weighted average.
metrics = ['accuracy', 'f1_weighted', 'precision', 'recall', 'roc_auc']

# Candidate models, all seeded for reproducibility where a seed is accepted.
# SVC needs probability=True so that predict_proba is available for ROC AUC.
classifiers = {
    "NB": GaussianNB(),
    "LR": LogisticRegression(random_state=42),
    "SVM": SVC(random_state=42, probability=True),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

results = {}
for name, model in classifiers.items():
    scores = {metric: [] for metric in metrics}
    for train_index, test_index in cv.split(X, y):
        # cv.split yields positional indices, so rows are selected with .iloc
        # (label-based .loc only agrees while the index is the default 0..n-1).
        # .copy() keeps the in-place scaling below from touching X.
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training fold only and reuse it on the test
        # fold, so no test-fold statistics leak into training.
        scaler = StandardScaler()
        X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
        X_test[numeric_features] = scaler.transform(X_test[numeric_features])

        # Undersample the training fold only; the test fold keeps its
        # original class distribution. (To evaluate without balancing, fit
        # on X_train / y_train directly.)
        resampler = RandomUnderSampler(random_state=42)
        X_train_balanced, y_train_balanced = resampler.fit_resample(X_train, y_train)

        model.fit(X_train_balanced, y_train_balanced)

        y_pred = model.predict(X_test)
        # Probability of the positive class (column 1), used for ROC AUC.
        y_prob = model.predict_proba(X_test)[:, 1]

        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['f1_weighted'].append(f1_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred))
        scores['recall'].append(recall_score(y_test, y_pred))
        scores['roc_auc'].append(roc_auc_score(y_test, y_prob))

    # Average each metric over the folds (np.std over the same lists would
    # give the fold-to-fold spread if it is needed).
    results[name] = {
        "Accuracy Mean": np.mean(scores['accuracy']),
        "F1 Score Mean": np.mean(scores['f1_weighted']),
        "Precision Mean": np.mean(scores['precision']),
        "Recall Mean": np.mean(scores['recall']),
        "AUC Mean": np.mean(scores['roc_auc']),
    }

# One row per classifier, one column per averaged metric.
results_df = pd.DataFrame(results).T
results_df = results_df.round(4)
results_df

Unnamed: 0,Accuracy Mean,F1 Score Mean,Precision Mean,Recall Mean,AUC Mean
NB,0.726,0.6108,0.4901,0.8106,0.8316
LR,0.7483,0.6254,0.5167,0.7924,0.8397
SVM,0.7417,0.6134,0.5087,0.7726,0.8206
DT,0.676,0.5217,0.429,0.6656,0.6735
RF,0.7201,0.5747,0.4815,0.7127,0.8015
XGBoost,0.723,0.5853,0.4855,0.7368,0.8005


In [121]:
# Folder that receives the pickled scaler and models.
# NOTE: machine-specific absolute path - adjust it when running elsewhere.
WEBSITE_DIR = Path('C:/Users/franc/Desktop/git_proj_curso/ProjetoLEI/Website')

print(X.head())

# Standardise the numeric features using the whole dataset.
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[numeric_features] = scaler.fit_transform(X_scaled[numeric_features])

# Balance the classes by random undersampling.
resampler = RandomUnderSampler(random_state=42)
X_balanced, y_balanced = resampler.fit_resample(X_scaled, y)

# Save the fitted scaler too, so new inputs can be transformed the same way.
with open(WEBSITE_DIR / 'scalers' / 'scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Refit every classifier on the full balanced data and save it.
for name, model in classifiers.items():
    model.fit(X_balanced, y_balanced)
    with open(WEBSITE_DIR / 'models' / f'{name}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

print("Modelos e scaler salvos com sucesso.")

   tenure  MonthlyCharges  InternetService_DSL  InternetService_Fiber optic  \
0       1           29.85                    1                            0   
1      34           56.95                    1                            0   
2       2           53.85                    1                            0   
3      45           42.30                    1                            0   
4       2           70.70                    0                            1   

   InternetService_No  Contract_Month-to-month  Contract_One year  \
0                   0                        1                  0   
1                   0                        0                  1   
2                   0                        1                  0   
3                   0                        0                  1   
4                   0                        1                  0   

   Contract_Two year  PaymentMethod_Bank transfer (automatic)  \
0                  0                         