In [2]:
import pandas as pd                    # Per caricare e manipolare dati tabulari
import numpy as np                     # Per calcoli numerici e NaN handling
import matplotlib.pyplot as plt        # Per grafici base
import seaborn as sns                  # Per grafici avanzati e EDA
from sklearn.model_selection import train_test_split  # Per dividere dati (lo usiamo dopo)

In [16]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [17]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [18]:
print(df.describe())

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


In [19]:
print(df['Churn'].value_counts())

Churn
No     5174
Yes    1869
Name: count, dtype: int64


In [22]:
print("Quanti clienti hanno fatto Churn?")
print(df['Churn'].value_counts())
print("\nProporzione (%):")
print(df['Churn'].value_counts(normalize=True) * 100)

Quanti clienti hanno fatto Churn?
Churn
No     5174
Yes    1869
Name: count, dtype: int64

Proporzione (%):
Churn
No     73.463013
Yes    26.536987
Name: proportion, dtype: float64


In [24]:
print("=== VERIFICA DATI PULITI ===")
print("Shape:", df.shape)
print("Target pronto:", df['Churn'].dtype, df['Churn'].unique())
print("TotalCharges OK:", df['TotalCharges'].dtype, "NaN:", df['TotalCharges'].isna().sum())
print("Prime righe:")
print(df[['customerID', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']].head())

=== VERIFICA DATI PULITI ===
Shape: (7043, 21)
Target pronto: object ['No' 'Yes']
TotalCharges OK: float64 NaN: 11
Prime righe:
   customerID  tenure  MonthlyCharges  TotalCharges Churn
0  7590-VHVEG       1           29.85         29.85    No
1  5575-GNVDE      34           56.95       1889.50    No
2  3668-QPYBK       2           53.85        108.15   Yes
3  7795-CFOCW      45           42.30       1840.75    No
4  9237-HQITU       2           70.70        151.65   Yes


In [25]:
print("Prima:", df['TotalCharges'].isna().sum(), "NaN")
mediana = df['TotalCharges'].median()
df['TotalCharges'].fillna(mediana, inplace=True)
print("Dopo:", df['TotalCharges'].isna().sum(), "NaN")
print("Mediana usata:", mediana)


Prima: 11 NaN
Dopo: 0 NaN
Mediana usata: 1397.475


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(mediana, inplace=True)


In [26]:
print("Churn prima:", df['Churn'].dtype, df['Churn'].unique())
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})
print("Churn dopo:", df['Churn'].dtype, df['Churn'].unique())
print("Distribuzione finale:")
print(df['Churn'].value_counts())


Churn prima: object ['No' 'Yes']
Churn dopo: int64 [0 1]
Distribuzione finale:
Churn
0    5174
1    1869
Name: count, dtype: int64


In [27]:

numeriche = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categoriche = df.select_dtypes(include=['object']).columns.tolist()


categoriche.remove('customerID')  
numeriche = [col for col in numeriche if col != 'Churn']

print("Numeriche (", len(numeriche), "):", numeriche)
print("Categoriche (", len(categoriche), "):", categoriche[:5], "...")


Numeriche ( 4 ): ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categoriche ( 15 ): ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines'] ...


In [28]:

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. FEATURES e TARGET
X = df[numeriche + categoriche]
y = df['Churn']
print(f"X: {X.shape}, y: {y.shape}")

# 2. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

# 3. PIPELINE MAGIC
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeriche),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categoriche)
])

pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', LogisticRegression(random_state=42, max_iter=2000))
])

# 4. Addestra
pipeline.fit(X_train, y_train)
print("‚úÖ Modello addestrato!")

# 5. TESTA
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nüèÜ ACCURACY: {accuracy:.1%}")
print("\nüìä REPORT COMPLETO:")
print(classification_report(y_test, y_pred, target_names=['Resta (0)', 'Churn (1)']))


X: (7043, 19), y: (7043,)
Train: (5634, 19), Test: (1409, 19)
‚úÖ Modello addestrato!

üèÜ ACCURACY: 80.6%

üìä REPORT COMPLETO:
              precision    recall  f1-score   support

   Resta (0)       0.85      0.89      0.87      1035
   Churn (1)       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

