In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pipeline variant switch, read by the encoding and export cells of this notebook:
#   True  -> Contract is ordinal-encoded, the other categoricals are one-hot encoded
#            with drop_first=True, and the result is saved to cleaned_dataset_v2.csv
#   False -> every categorical (Contract included) is fully one-hot encoded and the
#            result is saved to cleaned_dataset.csv
is_v2 = True

#### **Traitement de la base de données**

In [None]:
# Load the Telco dataset (the path is relative to the notebook's working directory)
df = pd.read_csv('dataset/telco_dataset.csv')
# Preview the first rows
df.head()

In [None]:
def audit(df):
    """Build a one-row-per-column quality summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It may have no rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df``, with the columns ``dtype``,
        ``missing`` (number of NaN), ``missing_%`` (share of NaN in percent,
        rounded to 2 decimals), ``unique`` (number of distinct non-null
        values) and ``sample`` (value taken from the first row, or NaN when
        ``df`` has no rows).
    """
    # df.iloc[0] raises IndexError on a frame without rows; fall back to an
    # all-NaN sample so the audit still returns the other indicators.
    if len(df) > 0:
        sample = df.iloc[0]
    else:
        sample = pd.Series(index=df.columns, dtype="object")

    return pd.DataFrame({
        "dtype": df.dtypes,
        "missing": df.isna().sum(),
        "missing_%": (df.isna().mean() * 100).round(2),
        "unique": df.nunique(),
        "sample": sample
    })
# Audit the loaded data: dtype, missing values, cardinality and a first-row sample per column
audit(df)

In [None]:
# Encode the two-level columns as 0/1 (Yes / Female -> 1, No / Male -> 0)
binary_cols = ["gender", 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
mapping = {'Yes': 1, 'No': 0, 'Female': 1, 'Male': 0}
# Capitalising first normalises the casing before the lookup;
# any value absent from `mapping` ends up as NaN.
df[binary_cols] = df[binary_cols].apply(
    lambda column: column.str.capitalize().map(mapping)
)
df[binary_cols].head()

In [None]:
# Categorical encoding. Both variants one-hot encode the service columns and the
# payment method; they differ in how Contract is handled.
service_cols = [
    "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
]
if is_v2:
    # Ordinal encoding for the contract (there is a notion of order / duration)
    # NOTE(review): the original text column 'Contract' is kept next to
    # 'Contract_Ordinal' in this variant — confirm this is intended.
    contract_mapping = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
    df['Contract_Ordinal'] = df['Contract'].map(contract_mapping)
    dummy_cols = service_cols + ["PaymentMethod"]
    # drop_first=True removes the first level of each encoded column
    df = pd.get_dummies(df, columns=dummy_cols, dtype=int, drop_first=True)
else:
    # Contract is one-hot encoded like the other categoricals, all levels kept
    dummy_cols = service_cols + ["Contract", "PaymentMethod"]
    df = pd.get_dummies(df, columns=dummy_cols, dtype=int)

df.head()

In [None]:
# Summary statistics; by default describe() only covers the numeric columns of a mixed frame
df.describe()

In [None]:
# Re-run the audit on the encoded frame to check dtypes and missing values
audit(df)

On remarque que `TotalCharges` est de type `object` : certains champs sont vides.

In [None]:
# Convert to numeric; values that cannot be parsed (e.g. blank strings) become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
# Count the missing values per column
df.isna().sum()

In [None]:
# Handling of the 'TotalCharges' column:
# fill the NaN values with 0 (the notebook's assumption is that these rows are new customers who have not paid anything yet)
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [None]:
# Drop the customerID column. Reassigning instead of using inplace=True, together
# with errors='ignore', keeps the cell idempotent: running it a second time no
# longer raises KeyError once the column is gone.
df = df.drop(columns=['customerID'], errors='ignore')

In [None]:
# Export the cleaned frame; the target file depends on the pipeline variant.
# NOTE(review): to_csv is called without index=False, so the row index is written
# as an extra unnamed first column — confirm that downstream readers expect it.
output_path = "dataset/cleaned_dataset_v2.csv" if is_v2 else "dataset/cleaned_dataset.csv"
df.to_csv(output_path)

#### **Analyse descriptive et visualisation de la base de données**

In [None]:
# Reload the cleaned data from disk for the descriptive analysis.
# NOTE(review): this always reads cleaned_dataset.csv, whereas the export cell writes
# cleaned_dataset_v2.csv when is_v2 is True — confirm the non-v2 file exists and is up to date.
base=pd.read_csv("dataset/cleaned_dataset.csv")
# Preview the first rows
base.head()

##### Analyse sur les variables quantitatives 

In [None]:
# Select the numeric variables
num_vars = ["tenure", "MonthlyCharges", "TotalCharges"]
# Print their describe() table as LaTeX source
print(base[num_vars].describe().to_latex())

In [None]:
# Descriptive statistics of the numeric variables.
# Computed on `base` (the frame reloaded from the cleaned CSV) rather than on the
# in-memory preprocessing frame `df`, so this section only depends on the reloaded
# data, like the describe() table and the plots of the same section.
summary_num = base[num_vars].agg(["count", "mean", "median", "std", "min", "max"]).T
# Add Q1 and Q3
summary_num["q1"] = base[num_vars].quantile(0.25)
summary_num["q3"] = base[num_vars].quantile(0.75)
# Reorder the columns in the style of R's summary output
summary_num = summary_num[["count", "mean", "median", "std", "min", "q1", "q3", "max"]].round(2)
# column_format: one left-aligned index column followed by eight right-aligned statistics
print(summary_num.to_latex(
        caption="Statistiques descriptives des variables numériques",
        label="tab:stats_num",
        column_format="lrrrrrrrr"))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Global look of the figures
sns.set_theme(style="whitegrid", context="paper")
plt.rcParams["font.family"] = "sans-serif"

# One colour per numeric variable, in the same order as num_vars
palette = ["#3498db", "#2ecc71", "#e74c3c"]
num_vars = ["tenure", "MonthlyCharges", "TotalCharges"]

# One histogram per variable, side by side
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), constrained_layout=True)
for i, (ax, col, colour) in enumerate(zip(axes, num_vars, palette)):
    sns.histplot(
        data=base,
        x=col,
        bins=30,
        ax=ax,
        color=colour,
        edgecolor="white",
        linewidth=0.6,
    )
    ax.set_title(col, fontsize=11, pad=8)
    ax.set_xlabel("")
    # Only the left-most panel carries the shared y-axis label
    ax.set_ylabel("Fréquence" if i == 0 else "")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.grid(axis="y", alpha=0.3)
fig.suptitle("Distribution des variables", fontsize=13)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Correlation matrix of the numeric variables
corr_matrix = base[num_vars].corr()

# Heatmap appearance: annotated cells, diverging colours centred on 0,
# colour scale pinned to the full [-1, 1] correlation range
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=1,
    linecolor='white',
    cbar_kws={'shrink': 0.8},
    vmin=-1,
    vmax=1,
)

fig, ax = plt.subplots(figsize=(8, 6), facecolor='white')
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title("Matrice de corrélation", fontsize=14, pad=15)
# Tilt the x labels, keep the y labels horizontal
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

##### Analyse sur les variables qualitatives

In [None]:
# List the columns available in the reloaded dataset
base.columns