# Setup

In [18]:
!pip install lime




[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Feature selection and data preparation

In [20]:
# Column groups used by the notebook. The "commonN" tags are the author's
# labels for each column.

# Continuous columns; these are the ones passed through StandardScaler later.
# Currently disabled candidates: 'Age' (common7), 'TotalCharges' (common4),
# "Tenure" (common5).
numeric_features = [
    "MonthlyCharges",  # common1
]

# Multi-valued categorical columns.
categorical_features = [
    "ContractType",     # common2
    "InternetService",  # common3
]

# Two-valued columns. "Churn" is the prediction target (GOAL).
# Currently disabled candidate: "Gender" (common6).
binary_features = [
    "TechSupport",  # common8
    "Churn",        # GOAL
]

# Every column name above, in numeric -> categorical -> binary order.
features = [*numeric_features, *categorical_features, *binary_features]

In [21]:
# ---------------------------------------------------------------------------
# First dataset (IBM Telco file)
# NOTE: the file has an .xls extension but is parsed as delimited text here.
# ---------------------------------------------------------------------------
df_IBM = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.xls')
selected_columns = [
    'MonthlyCharges',
    'Contract',
    'InternetService',
    'TechSupport',
    'Churn'
]
# Keep only the selected attributes. `.copy()` makes the subset an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning.
df_IBM = df_IBM[selected_columns].copy()
df_IBM = df_IBM.rename(columns={'Contract': 'ContractType'})
# Rewrite category labels so the one-hot column names produced below match the
# ones of the second dataset (required by the column reindexing further down).
df_IBM['TechSupport'] = df_IBM['TechSupport'].replace('No internet service', 'No')
df_IBM['InternetService'] = df_IBM['InternetService'].replace('Fiber optic', 'Fiber Optic')
df_IBM['ContractType'] = df_IBM['ContractType'].replace({
    'Month-to-month': 'Month-to-Month',
    'Two year': 'Two-Year',
    'One year': 'One-Year',
})
# One-hot encode the categorical columns (all levels kept) ...
df_IBM = pd.get_dummies(df_IBM, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
# ... and reduce the two-valued columns to a single 0/1 indicator each.
df_IBM = pd.get_dummies(df_IBM, columns=['TechSupport', 'Churn'], drop_first=True, dtype=int)


# ---------------------------------------------------------------------------
# Second dataset (referred to as "kaggle" in the variable names below)
# ---------------------------------------------------------------------------
df_kaggle = pd.read_csv('data/customer_churn_data.csv')
selected_columns = [
    'MonthlyCharges',
    'ContractType',
    'InternetService',
    'TechSupport',
    'Churn'
]
# Keep only the selected attributes (independent copy, see above).
df_kaggle = df_kaggle[selected_columns].copy()
# Missing InternetService values are mapped to the 'No' category.
df_kaggle['InternetService'] = df_kaggle['InternetService'].fillna('No')
# Same encoding as for the first dataset.
df_kaggle = pd.get_dummies(df_kaggle, columns=['ContractType', 'InternetService'], drop_first=False, dtype=int)
df_kaggle = pd.get_dummies(df_kaggle, columns=["TechSupport", "Churn"], drop_first=True, dtype=int)


# Both frames must expose exactly the same encoded columns; otherwise the
# reindexing below would silently drop a column that exists only in df_IBM.
mismatched_columns = set(df_IBM.columns) ^ set(df_kaggle.columns)
assert not mismatched_columns, f"Encoded columns differ between datasets: {sorted(mismatched_columns)}"

# Reorder the FIRST dataset's columns to follow the order of the second one.
df_IBM = df_IBM[df_kaggle.columns]
# Features / target of the first dataset
X_IBM = df_IBM.drop(columns=['Churn_Yes'])
y_IBM = df_IBM['Churn_Yes']
# Features / target of the second dataset
X_kaggle = df_kaggle.drop(columns=['Churn_Yes'])
y_kaggle = df_kaggle['Churn_Yes']
# Combined dataset: rows of both sources stacked, with a fresh 0..n-1 index.
df = pd.concat([df_IBM, df_kaggle], axis=0, ignore_index=True)
# Class balance of the target in percent. Stored in a variable because a bare
# expression in the middle of a cell is evaluated but never displayed.
churn_class_pct = df['Churn_Yes'].value_counts(normalize=True) * 100
X = df.drop(columns=['Churn_Yes'])
y = df['Churn_Yes']

## Cross-validation

In [25]:
# Stratified 2-fold cross-validation; shuffling is seeded for reproducibility.
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Names of the per-fold metrics collected below. The F1 entry holds the binary
# (positive-class) F1 that f1_score computes by default, hence 'f1' and not
# 'f1_weighted'.
metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']

classifiers = {
    "NB": GaussianNB(),
    "LR": LogisticRegression(random_state=42),
    "SVM": SVC(random_state=42, probability=True),  # probability=True enables predict_proba
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

results = {}
for name, model in classifiers.items():
    # Fresh per-model accumulator: one list of fold scores per metric.
    scores = {metric: [] for metric in metrics}
    for train_index, test_index in cv.split(X, y):
        # cv.split yields positional indices, so select with .iloc. The copies
        # keep the scaling assignments below from writing into slices of X.
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # To train on one dataset and test on the other instead, replace the
        # two assignments above with:
        # X_train, X_test = X_IBM.copy(), X_kaggle.copy()
        # y_train, y_test = y_IBM, y_kaggle

        # Fit the scaler on the training fold only, then apply it to both folds.
        scaler = StandardScaler()
        X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
        X_test[numeric_features] = scaler.transform(X_test[numeric_features])

        # Balance the classes of the training fold only; the test fold keeps
        # its original class distribution.
        resampler = RandomUnderSampler(random_state=42)
        X_train_balanced, y_train_balanced = resampler.fit_resample(X_train, y_train)
        # To evaluate without balancing, use X_train / y_train directly instead.

        model.fit(X_train_balanced, y_train_balanced)

        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]  # probability of class 1

        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['f1'].append(f1_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred))
        scores['recall'].append(recall_score(y_test, y_pred))
        scores['roc_auc'].append(roc_auc_score(y_test, y_prob))

    # Average each metric over the folds. Standard deviations
    # (np.std(scores[...])) can be added here if fold variability is needed.
    results[name] = {
        "Accuracy Mean": np.mean(scores['accuracy']),
        "F1 Score Mean": np.mean(scores['f1']),
        "Precision Mean": np.mean(scores['precision']),
        "Recall Mean": np.mean(scores['recall']),
        "AUC Mean": np.mean(scores['roc_auc']),
    }

# One row per classifier, one column per averaged metric.
results_df = pd.DataFrame(results).T
results_df = results_df.round(4)
results_df

Unnamed: 0,Accuracy Mean,F1 Score Mean,Precision Mean,Recall Mean,AUC Mean
NB,0.6912,0.6217,0.5352,0.7416,0.7424
LR,0.6631,0.6002,0.5058,0.7398,0.7531
SVM,0.7237,0.6609,0.5695,0.7874,0.8024
DT,0.6796,0.5764,0.5263,0.637,0.6927
RF,0.682,0.6011,0.5265,0.7002,0.7671
XGBoost,0.7109,0.6412,0.5573,0.7547,0.7981


In [23]:
# Fit the final scaler and models on the whole dataset and pickle them.

# Base directory that receives the pickled artefacts. Defaults to the original
# hard-coded location; set the WEBSITE_DIR environment variable to write
# somewhere else without editing the notebook.
WEBSITE_DIR = Path(os.environ.get(
    'WEBSITE_DIR',
    'C:/Users/franc/Desktop/git_proj_curso/ProjetoLEI/Website',
))

# Standardise the numeric columns of the full dataset (on a copy, X is kept intact)
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[numeric_features] = scaler.fit_transform(X_scaled[numeric_features])

# Balance the classes by random undersampling
resampler = RandomUnderSampler(random_state=42)
X_balanced, y_balanced = resampler.fit_resample(X_scaled, y)

# Persist the fitted scaler as well, next to the models
with open(WEBSITE_DIR / 'scalers' / 'scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Train each classifier on the balanced data and persist it
for name, model in classifiers.items():
    model.fit(X_balanced, y_balanced)
    with open(WEBSITE_DIR / 'models' / f'{name}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

print("Modelos e scaler salvos com sucesso.")

Modelos e scaler salvos com sucesso.


In [24]:
# Preview the first five rows of the scaled, undersampled feature matrix.
X_balanced.head(n=5)

Unnamed: 0,MonthlyCharges,ContractType_Month-to-Month,ContractType_One-Year,ContractType_Two-Year,InternetService_DSL,InternetService_Fiber Optic,InternetService_No,TechSupport_Yes
563,1.12751,0,0,1,0,1,0,0
1496,-1.536544,1,0,0,0,0,1,0
1099,-0.344863,1,0,0,1,0,0,1
4277,-1.513013,1,0,0,0,0,1,0
7067,0.450487,0,1,0,1,0,0,1
