In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data
df = pd.read_csv('Telco-Customer-Churn.csv')

# Apply the cleaning we did in EDA
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Dataset shape: (7043, 21)
Columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)


In [2]:
# Remove customerID (not useful for prediction)
df_clean = df.drop('customerID', axis=1)

print(f"Shape after dropping customerID: {df_clean.shape}")
print(f"\nRemaining columns: {df_clean.columns.tolist()}")

Shape after dropping customerID: (7043, 20)

Remaining columns: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [3]:
# Convert Churn from 'Yes'/'No' to 1/0
df_clean['Churn'] = df_clean['Churn'].map({'Yes': 1, 'No': 0})

print("Churn distribution:")
print(df_clean['Churn'].value_counts())

Churn distribution:
Churn
0    5174
1    1869
Name: count, dtype: int64


In [4]:
# Separate numeric and categorical columns
numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()

print("Numeric columns:")
print(numeric_cols)
print(f"\nCategorical columns ({len(categorical_cols)}):")
print(categorical_cols)

Numeric columns:
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']

Categorical columns (15):
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [5]:
# List of binary Yes/No columns
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']

# Convert Yes/No to 1/0
for col in binary_cols:
    df_clean[col] = df_clean[col].map({'Yes': 1, 'No': 0})

# Gender: Male/Female to 1/0
df_clean['gender'] = df_clean['gender'].map({'Male': 1, 'Female': 0})

print("Binary columns encoded!")
print(df_clean[binary_cols + ['gender']].head())

Binary columns encoded!
   Partner  Dependents  PhoneService  PaperlessBilling  gender
0        1           0             0                 1       0
1        0           0             1                 0       1
2        0           0             1                 1       1
3        0           0             0                 0       1
4        0           0             1                 1       0


In [6]:
print(df_clean['MultipleLines'].unique())
print(df_clean['OnlineSecurity'].unique())

['No phone service' 'No' 'Yes']
['No' 'Yes' 'No internet service']


In [7]:
# Columns with "No internet service" or "No phone service"
service_cols = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# Replace "No internet service" and "No phone service" with "No"
for col in service_cols:
    df_clean[col] = df_clean[col].replace({'No internet service': 'No', 'No phone service': 'No'})

# Now convert to 1/0
for col in service_cols:
    df_clean[col] = df_clean[col].map({'Yes': 1, 'No': 0})

print("Service columns encoded!")
print(df_clean[service_cols].head())

Service columns encoded!
   MultipleLines  OnlineSecurity  OnlineBackup  DeviceProtection  TechSupport  \
0              0               0             1                 0            0   
1              0               1             0                 1            0   
2              0               1             1                 0            0   
3              0               1             0                 1            1   
4              0               0             0                 0            0   

   StreamingTV  StreamingMovies  
0            0                0  
1            0                0  
2            0                0  
3            0                0  
4            0                0  


In [8]:
# One-hot encode the remaining categorical columns
df_encoded = pd.get_dummies(df_clean, columns=['InternetService', 'Contract', 'PaymentMethod'], 
                            drop_first=True)

print(f"Shape before encoding: {df_clean.shape}")
print(f"Shape after encoding: {df_encoded.shape}")
print(f"\nNew columns created:")
print(df_encoded.columns.tolist())

Shape before encoding: (7043, 20)
Shape after encoding: (7043, 24)

New columns created:
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn', 'InternetService_Fiber optic', 'InternetService_No', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [None]:
# Create new features
df_encoded['avg_monthly_cost'] = df_encoded['TotalCharges'] / (df_encoded['tenure'] + 1)  # +1 to avoid divide by zero
df_encoded['is_new_customer'] = (df_encoded['tenure'] <= 12).astype(int)  # First year customers
df_encoded['high_value_customer'] = (df_encoded['MonthlyCharges'] > 70).astype(int)  # Premium customers to target

print("New features created!")
print(f"New shape: {df_encoded.shape}")
print("\nSample of new features:")
print(df_encoded[['tenure', 'MonthlyCharges', 'TotalCharges', 'avg_monthly_cost', 
                  'is_new_customer', 'high_value_customer']].head())

New features created!
New shape: (7043, 27)

Sample of new features:
   tenure  MonthlyCharges  TotalCharges  avg_monthly_cost  is_new_customer  \
0       1           29.85         29.85         14.925000                1   
1      34           56.95       1889.50         53.985714                0   
2       2           53.85        108.15         36.050000                1   
3      45           42.30       1840.75         40.016304                0   
4       2           70.70        151.65         50.550000                1   

   high_value_customer  
0                    0  
1                    0  
2                    0  
3                    0  
4                    1  


In [10]:
# Separate features and target
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

# Split into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nChurn distribution in training set:")
print(y_train.value_counts(normalize=True))

Training set: (5634, 26)
Test set: (1409, 26)

Churn distribution in training set:
Churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64


In [11]:
# Scale numeric features
scaler = StandardScaler()

# Only scale these columns 
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges', 'avg_monthly_cost']

# Fit on training data, transform both train and test
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

print("Features scaled!")
print("\nSample of scaled values:")
print(X_train[cols_to_scale].head())

Features scaled!

Sample of scaled values:
        tenure  MonthlyCharges  TotalCharges  avg_monthly_cost
3738  0.102371       -0.521976     -0.262257         -0.389837
3151 -0.711743        0.337478     -0.503635          0.416821
4860 -0.793155       -0.809013     -0.749883         -0.556374
3867 -0.263980        0.284384     -0.172722          0.371421
3810 -1.281624       -0.676279     -0.989374         -1.205939


In [13]:
# Save preprocessed data for modeling
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

print("Preprocessed data saved!")

Preprocessed data saved!


In [14]:
import pickle
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
print("Scaler saved!")

Scaler saved!
