In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


In [None]:
# Load the four provider tables; each one is joined on the `customerID` column.
personal = pd.read_csv('final_provider/personal.csv')
contract = pd.read_csv('final_provider/contract.csv')
phone = pd.read_csv('final_provider/phone.csv')
internet = pd.read_csv('final_provider/internet.csv')

# personal + contract use the default inner join; phone and internet are
# left-joined so customers without a matching row in those tables are kept
# (their phone/internet columns are NaN and get filled during preprocessing).
df = (
    personal
    .merge(contract, on='customerID')
    .merge(phone, on='customerID', how='left')
    .merge(internet, on='customerID', how='left')
)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print(df.isnull().sum())
print(df.describe(include='all'))

In [None]:
# Binary churn target: 0 when EndDate is the literal string 'No', 1 otherwise.
# Vectorised comparison instead of a row-wise lambda; the explicit int64 dtype
# keeps the column inside the float64/int64 selection used for the heatmap.
df['EndDate_flag'] = df['EndDate'].ne('No').astype('int64')

sns.countplot(data=df, x='EndDate_flag')
plt.title("Distribuição da Rotatividade (Churn)")
plt.show()

# TotalCharges is only parsed to numeric in the later preprocessing step, so
# at this point it would be silently left out of the correlation matrix.
# Coerce it on a copy here (unparseable entries become NaN, which corr()
# skips pairwise) without touching `df` itself.
df_numeric = df.select_dtypes(include=['float64', 'int64']).copy()
df_numeric['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm')
plt.title("Mapa de Correlação")
plt.show()


In [None]:
# --- Cleaning ---------------------------------------------------------------
# Unparseable TotalCharges entries become NaN and are imputed with the median.
# The results are assigned back to the column: `df[col].fillna(..., inplace=True)`
# is chained assignment, which pandas 2.x warns about and which does not
# update `df` at all once Copy-on-Write is active.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(total_charges.median())
df['MultipleLines'] = df['MultipleLines'].fillna('No')

# Remaining missing values in string columns (introduced by the left joins)
# are filled with 'No'.
cat_cols = df.select_dtypes(include='object').columns.drop('customerID')
df[cat_cols] = df[cat_cols].fillna('No')

# --- Encoding ---------------------------------------------------------------
# Identifier and raw date columns are dropped before one-hot encoding.
df_encoded = pd.get_dummies(df.drop(['customerID', 'BeginDate', 'EndDate'], axis=1), drop_first=True)

# --- Train/test split -------------------------------------------------------
# Row positions are split first so the scaler can be fitted on training rows
# only; fitting it on the full frame would leak test-set statistics.
row_positions = np.arange(len(df_encoded))
train_pos, test_pos = train_test_split(
    row_positions,
    test_size=0.3,
    random_state=42,
    stratify=df_encoded['EndDate_flag'],
)

# --- Scaling ----------------------------------------------------------------
scaler = StandardScaler()
num_cols = ['MonthlyCharges', 'TotalCharges']
scaler.fit(df_encoded.iloc[train_pos][num_cols])
# The train-fitted transform is applied to every row so `df_encoded` stays
# standardised for the cells that consume it directly.
df_encoded[num_cols] = scaler.transform(df_encoded[num_cols])

X = df_encoded.drop('EndDate_flag', axis=1)
y = df_encoded['EndDate_flag']

X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]


In [None]:
# Candidate classifiers compared with 5-fold cross-validation on the training
# split. The tree ensembles are stochastic, so they are seeded; without a
# random_state the reported scores change on every run.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

for name, model in models.items():
    # cross_val_score clones the estimator per fold; with no `scoring`
    # argument a classifier is scored by accuracy.
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f'{name} - Accuracy média: {scores.mean():.4f}')


In [None]:
# Fit the chosen model on the full training split and evaluate it once on the
# held-out test split. Seeded so the reported metrics are reproducible.
best_model = GradientBoostingClassifier(random_state=42)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# confusion_matrix puts true labels on rows and predicted labels on columns;
# the axes are labelled so the figure can be read on its own.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.title("Matriz de Confusão")
plt.xlabel('Previsto')
plt.ylabel('Real')
plt.show()

In [None]:
# Unsupervised view of the customers: the churn label is removed so the
# clusters are formed from the features alone.
X_cluster = df_encoded.drop('EndDate_flag', axis=1)

# PCA is used only to get a 2-D projection for plotting; it is seeded because
# the SVD solver chosen automatically may be a randomised one.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_cluster)

# Clustering runs on the full feature space, not on the 2-D projection.
# n_init is set explicitly because its default differs between scikit-learn
# releases, which would otherwise change the result (and emit a warning).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_cluster)

plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis')
plt.title('Clusterização de Clientes (PCA)')
plt.xlabel('Componente 1')
plt.ylabel('Componente 2')
plt.show()