## Import Required Libraries

In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Load the Semi-Cleaned Dataset

In [2]:
# Read the semi-cleaned dataset from the data directory
df = pd.read_csv('../data/semi_cleaned.csv')

# Preview the leading rows as plain text
print("First 5 rows of the Dataset: ")
print(df.head(n=5))
print("-" * 80)

# Summarise the frame: df.info() writes its report straight to stdout
print("\nDataset Info: ")
df.info()
print("-" * 80)

First 5 rows of the Dataset: 
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV Stream

## Split the Dataset Into Features and Target Variables

In [3]:
# Remove the unique identifier column so it is not treated as a feature.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: the in-place drop raised KeyError on a second execution
# because 'customerID' had already been removed.
df = df.drop(columns='customerID', errors='ignore')

# Separate features (X) and target (y)
X = df.drop(columns='Churn')
y = df['Churn']

# Identify the categorical and numerical columns.
# SeniorCitizen is stored as a 0/1 integer, so dtype-based selection would place
# it with the numeric columns; it is moved to the categorical group by hand.
categorical_features = X.select_dtypes(include=object).columns.tolist() + ['SeniorCitizen']
numerical_features = X.select_dtypes(exclude=object).columns.drop('SeniorCitizen').tolist()

print("Categorical Features: ")
print(categorical_features)
print("-" * 50)
print("Numerical Features: ")
print(numerical_features)


Categorical Features: 
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
--------------------------------------------------
Numerical Features: 
['tenure', 'MonthlyCharges', 'TotalCharges']


## Splitting the Dataset into Training and Test Sets

In [4]:
# Hold out 20% of the rows for testing.
# stratify=y keeps the target class proportions the same in both splits, and the
# fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the size of each split
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (5625, 19)
y_train shape: (5625,)
X_test shape: (1407, 19)
y_test shape: (1407,)


## One-Hot Encoding Categorical Features

In [5]:
# Build a dense one-hot encoder. handle_unknown='ignore' encodes categories that
# were not seen during fitting as all-zero rows instead of raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training split only, then encode both splits
X_train_encoded = ohe.fit_transform(X_train[categorical_features])
X_test_encoded = ohe.transform(X_test[categorical_features])

# Wrap the encoded arrays in DataFrames, keeping each split's original row index
encoded_features = ohe.get_feature_names_out(categorical_features)
X_train_cat = pd.DataFrame(
    X_train_encoded,
    index=X_train.index,
    columns=encoded_features,
)
X_test_cat = pd.DataFrame(
    X_test_encoded,
    index=X_test.index,
    columns=encoded_features,
)

print("Categorical Features encoded successfully!")

Categorical Features encoded successfully!


## Scaling Numerical Features

In [7]:
# Fit the scaler on the training split only, so statistics from the test split
# never influence the transformation, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_features])
X_test_scaled = scaler.transform(X_test[numerical_features])

# Wrap the scaled arrays in DataFrames, keeping each split's original row index
X_train_num = pd.DataFrame(X_train_scaled, columns=numerical_features, index=X_train.index)
X_test_num = pd.DataFrame(X_test_scaled, columns=numerical_features, index=X_test.index)

# This step scales rather than encodes; the earlier message said "encoded",
# carried over from the one-hot encoding cell.
print("Numerical Features scaled successfully!")

Numerical Features encoded successfully!


## Combine Pre-Processed Data

In [8]:
# Combine Processed DataFrames
X_train_processed= pd.concat([X_train_cat, X_train_num], axis=1)
X_test_processed= pd.concat([X_test_cat, X_test_num], axis=1)

print(f"Final Processed training data shape: {X_train_processed.shape}")
print(f"Final Processed testing data shape: {X_test_processed.shape}")

# Display the first 5 rows of the final training data
print("\nFirst 5 rows of the Final Processed Training Data: ")
display(X_train_processed.head())

Final Processed training data shape: (5625, 46)
Final Processed testing data shape: (1407, 46)

First 5 rows of the Final Processed Training Data: 


Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,...,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen_0,SeniorCitizen_1,tenure,MonthlyCharges,TotalCharges
1408,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.321816,0.981556,1.6599
6992,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-0.26741,-0.971546,-0.562252
3349,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.444064,0.837066,1.756104
4486,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.204646,0.641092,-0.908326
3535,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.669826,-0.808787,-0.101561


## Saving Processed Data and Pre-Processors

In [9]:
# Save Processed DataFrames to CSV
X_train_processed.to_csv('../data/X_train_processed.csv', index=False)
X_test_processed.to_csv('../data/X_test_processed.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)

print("Processed data saved successfully!")

# Save the fitted Preprocessors
joblib.dump(scaler, '../models/scaler.joblib')
joblib.dump(ohe, '../models/encoder.joblib')

print("Fitted Preprocessors saved successfully!")


Processed data saved successfully!
Fitted Preprocessors saved successfully!
