Load Dataset and display first few records


In [17]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Data.csv')

# Display first 5 records
print("First 5 records:")
print(df.head())

First 5 records:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies    

Summarize dataset structure

In [18]:
print("\nDataset structure:")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print("\nData types:")
print(df.dtypes)
print("\nMissing values count:")
print(df.isnull().sum())


Dataset structure:
Number of rows: 7043
Number of columns: 21

Data types:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Missing values count:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV  

Identify missing values

In [19]:
print("\nMissing values distribution:")
print(df.isna().sum())


Missing values distribution:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


Impute missing values

In [21]:
# Convert 'TotalCharges' to numeric (handles empty strings as NaN)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Impute numerical missing values with median
total_charges_median = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(total_charges_median)

# For categorical columns (if any were missing):
# df['ColumnName'] = df['ColumnName'].fillna(df['ColumnName'].mode()[0])

Data Transformation
a. Categorical Encoding

In [22]:
# Binary encoding for Yes/No columns
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

# Convert 'SeniorCitizen' (0/1) to categorical for one-hot encoding
df['SeniorCitizen'] = df['SeniorCitizen'].map({0: 'No', 1: 'Yes'})

# One-hot encode multi-category variables
categorical_cols = ['gender', 'SeniorCitizen', 'MultipleLines', 'InternetService',
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                   'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

Feature Scaling

In [23]:
from sklearn.preprocessing import StandardScaler

# Scale numerical features
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Drop 'customerID' (irrelevant for modeling)
df = df.drop('customerID', axis=1)

 Data Splitting

In [25]:
from sklearn.model_selection import train_test_split

# Split into features (X) and target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# 80% train, 20% test (stratified to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nFinal shapes:")
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
print("\nClass distribution in y_train:")
print(y_train.value_counts(normalize=True))


Final shapes:
Training set: (5634, 30), Test set: (1409, 30)

Class distribution in y_train:
Churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64
