<a href="https://colab.research.google.com/github/Denilza/Cientista-de-Dados/blob/main/Projeto_Churn_Denilza_Lima.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Projeto: Predição de Cancelamento de Clientes

%pip install -q gdown

import gdown
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

# 1. Download the data
url = 'https://drive.google.com/uc?id=1UYT9120H6kklfXAFGsciR3J2teE27hwV'
output = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
# Some gdown versions report a failed download by returning None instead of
# raising; fail loudly here rather than with a confusing read_csv error below.
downloaded_path = gdown.download(url, output, quiet=False)
if downloaded_path is None:
    raise RuntimeError(f"Could not download the dataset from {url}")

# 2. Load the data
df = pd.read_csv(output)

# Show every column: the default wide-frame rendering truncates the header,
# which makes it easy to misread adjacent column names as a single one.
with pd.option_context("display.max_columns", None):
    display(df.head())


Downloading...
From: https://drive.google.com/uc?id=1UYT9120H6kklfXAFGsciR3J2teE27hwV
To: /content/WA_Fn-UseC_-Telco-Customer-Churn.csv
100%|██████████| 978k/978k [00:00<00:00, 100MB/s]

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape




In [None]:
# 3. Pre-processing
# Work on a copy so re-running this cell never depends on earlier in-place
# edits to `df`.
df_clean = df.copy()

# NOTE(review): in the public Telco churn CSV, TotalCharges is typically read
# as text because a few rows are blank -- confirm with df.info(). Coercing
# turns such blanks into NaN so the dropna() below removes those rows.
if "TotalCharges" in df_clean.columns:
    df_clean["TotalCharges"] = pd.to_numeric(df_clean["TotalCharges"], errors="coerce")
df_clean = df_clean.dropna()

# Encode the target as integer codes (categories are sorted, so for Yes/No
# labels this gives No -> 0, Yes -> 1 -- assumes Churn holds those two labels).
y = df_clean["Churn"].astype("category").cat.codes
assert y.nunique() == 2, "expected a binary Churn target"

# customerID is an identifier string (e.g. 7590-VHVEG), not a feature; one-hot
# encoding it would add a column per distinct value. All remaining text columns
# (Contract, PaperlessBilling, ...) are one-hot encoded; numeric columns pass
# through unchanged.
X = pd.get_dummies(
    df_clean.drop(columns=["Churn", "customerID"], errors="ignore"),
    drop_first=True,
    dtype=float,
)

# 4. Train/test split
# Split BEFORE resampling so the test set contains only real observations and
# no synthetic point is interpolated from a test row (data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Scaling (fit on the training fold only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Class balancing with SMOTE, applied to the training fold only.
# SMOTE interpolates between nearest neighbours, so it runs on scaled features.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# 7. Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)
y_pred_rf = rf.predict(X_test_scaled)
print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))

# 8. XGBoost
# `use_label_encoder` is deprecated in recent XGBoost releases and unnecessary
# now that the target is already integer-encoded; a seed is set for
# reproducibility.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_resampled, y_resampled)
y_pred_xgb = xgb.predict(X_test_scaled)
print("XGBoost Results:")
print(classification_report(y_test, y_pred_xgb))

# 9. ROC-AUC, computed from the predicted probability of the positive class
roc_rf = roc_auc_score(y_test, rf.predict_proba(X_test_scaled)[:, 1])
roc_xgb = roc_auc_score(y_test, xgb.predict_proba(X_test_scaled)[:, 1])

print(f"ROC-AUC Random Forest: {roc_rf:.2f}")
print(f"ROC-AUC XGBoost: {roc_xgb:.2f}")

# 10. Confusion matrix for the XGBoost predictions
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred_xgb), annot=True, fmt='d', ax=ax)
ax.set_title("Confusion Matrix - XGBoost")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


KeyError: 'Contract PaperlessBilling'