In [7]:
import pandas as pd
import numpy as np

# Read the raw churn export into the working frame used by the later cells
df = pd.read_csv('Telecom_Churn.csv')

# Schema overview: column names, non-null counts and dtypes
print("Dataset Info:", "-" * 50, sep="\n")
df.info()

# Preview of the leading rows; the bare last expression gets the rich table display
print("\nFirst few rows of the dataset:", "-" * 50, sep="\n")
df.head()

Dataset Info:
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 n

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
# Missing-value handling for TotalCharges: report per-column null counts,
# coerce the column to numeric, impute with the column mean, then report again.
print("Number of missing values before handling:")
print(df.isna().sum())

# Step 1: anything that cannot be parsed as a number becomes NaN (errors='coerce').
# Step 2: the NaNs are replaced by the mean of the coerced column.
# NOTE(review): mean imputation is what this cell does; whether it is the right
# choice for these rows (e.g. very new customers) is not visible here -- confirm.
df = (
    df
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .assign(TotalCharges=lambda frame: frame['TotalCharges'].fillna(frame['TotalCharges'].mean()))
)

print("\nNumber of missing values after handling:")
print(df.isna().sum())

Number of missing values before handling:
customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

Number of missing values after handling:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
To

In [9]:
# Encode the categorical features and export the model-ready table.
#   * two-level columns  -> integer codes in a new `<col>_encoded` column on `df`
#   * multi-level columns -> one-hot indicator columns via pd.get_dummies
#   * the result (numeric + encoded columns only) is written to 'telco_encoded.csv'

# Define binary and multi-class categorical columns
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
multi_class_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                    'Contract', 'PaymentMethod']

# Apply manual binary encoding to binary columns (0 and 1).
# Levels are sorted so the value -> code assignment is deterministic across runs.
for col in binary_cols:
    # Fail early with a clear message instead of an obscure sort error on mixed NaN/str values.
    if df[col].isna().any():
        raise ValueError(
            f"Column {col!r} contains missing values; handle them before binary encoding."
        )
    levels = sorted(df[col].unique())
    # More than two levels would silently produce codes 0..k-1, which is not a binary encoding.
    if len(levels) > 2:
        raise ValueError(
            f"Column {col!r} has {len(levels)} distinct values {levels}; "
            "expected at most 2 for binary encoding."
        )
    mapping = {val: idx for idx, val in enumerate(levels)}
    df[col + '_encoded'] = df[col].map(mapping)
    print(f"Binary encoding mapping for {col}:")
    print(mapping)

# Apply One-Hot Encoding using pandas get_dummies.
# Every level of each feature gets its own `<feature>_<level>` column (no level is dropped).
# NOTE(review): the dtype of the indicator columns depends on the installed pandas
# version (bool vs. integer) -- confirm it matches what downstream consumers expect.
df_encoded = pd.get_dummies(df, columns=multi_class_cols, prefix=multi_class_cols)

# Keep only encoded columns and numeric columns.
# Columns are matched exactly (`<col>_encoded`) or by the `<feature>_` prefix that
# get_dummies generates, rather than by substring, so an unrelated column whose name
# merely contains a feature name cannot slip into the export.
numeric_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
binary_encoded_cols = {col + '_encoded' for col in binary_cols}
dummy_prefixes = tuple(col + '_' for col in multi_class_cols)
encoded_cols = [col for col in df_encoded.columns
                if col in binary_encoded_cols or col.startswith(dummy_prefixes)]
final_cols = numeric_cols + encoded_cols

df_final = df_encoded[final_cols]

# Save the processed dataset (row index is not written)
df_final.to_csv('telco_encoded.csv', index=False)

print("\nShape of final encoded dataset:", df_final.shape)
print("\nColumns in the final encoded dataset:")
for col in df_final.columns:
    print(col)

Binary encoding mapping for gender:
{'Female': 0, 'Male': 1}
Binary encoding mapping for Partner:
{'No': 0, 'Yes': 1}
Binary encoding mapping for Dependents:
{'No': 0, 'Yes': 1}
Binary encoding mapping for PhoneService:
{'No': 0, 'Yes': 1}
Binary encoding mapping for PaperlessBilling:
{'No': 0, 'Yes': 1}
Binary encoding mapping for Churn:
{'No': 0, 'Yes': 1}
Binary encoding mapping for Churn:
{'No': 0, 'Yes': 1}

Shape of final encoded dataset: (7043, 41)

Columns in the final encoded dataset:
SeniorCitizen
tenure
MonthlyCharges
TotalCharges
gender_encoded
Partner_encoded
Dependents_encoded
PhoneService_encoded
PaperlessBilling_encoded
Churn_encoded
MultipleLines_No
MultipleLines_No phone service
MultipleLines_Yes
InternetService_DSL
InternetService_Fiber optic
InternetService_No
OnlineSecurity_No
OnlineSecurity_No internet service
OnlineSecurity_Yes
OnlineBackup_No
OnlineBackup_No internet service
OnlineBackup_Yes
DeviceProtection_No
DeviceProtection_No internet service
DeviceProtecti

# Data Preprocessing Summary

1. **Missing Values Handling**:
   - Found and handled 11 missing values in the 'TotalCharges' column
   - Filled missing values with the mean of 'TotalCharges'

2. **Binary Categorical Variables** (encoded as 0/1 in new `<column>_encoded` columns):
   - gender
   - Partner
   - Dependents
   - PhoneService
   - PaperlessBilling
   - Churn

3. **Multi-class Categorical Variables** (One-Hot Encoded):
   - MultipleLines
   - InternetService
   - OnlineSecurity
   - OnlineBackup
   - DeviceProtection
   - TechSupport
   - StreamingTV
   - StreamingMovies
   - Contract
   - PaymentMethod

4. **Numeric Variables** (not encoded; 'TotalCharges' was converted to numeric and imputed as described in step 1):
   - SeniorCitizen
   - tenure
   - MonthlyCharges
   - TotalCharges

The processed dataset has been saved as 'telco_encoded.csv'.