# Preprocessing of the SyriaTel Customer Churn Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# location of the raw churn CSV, relative to this notebook
data_path = "../data/raw/telecom_churn_dataset.csv"

# read the raw file with pandas' default parsing options
df = pd.read_csv(filepath_or_buffer=data_path)

# preview the first five rows
df.head(5)

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


## Formatting Column Names and Dropping Unnecessary Columns

Below, I format the column names so that they all follow the same convention (lowercase, with underscores instead of spaces). I also drop `state` and `phone_number`, as they are not relevant for predicting customer churn.

In [2]:
# normalise every header: trim surrounding whitespace, lowercase,
# and swap spaces for underscores
df.columns = [
    header.strip().lower().replace(' ', '_')
    for header in df.columns
]

print(df.columns)

Index(['state', 'account_length', 'area_code', 'phone_number',
       'international_plan', 'voice_mail_plan', 'number_vmail_messages',
       'total_day_minutes', 'total_day_calls', 'total_day_charge',
       'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
       'total_night_minutes', 'total_night_calls', 'total_night_charge',
       'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
       'customer_service_calls', 'churn'],
      dtype='object')


In [3]:
# columns removed before modelling (see the explanation in the markdown above)
columns_to_drop = ['phone_number', 'state']

# Rebind `df` instead of mutating it in place, and ignore columns that are
# already gone: the original in-place drop raised a KeyError when this cell
# was executed a second time in the same kernel session.
df = df.drop(columns=columns_to_drop, errors='ignore')

# summary of the remaining columns, their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   account_length          3333 non-null   int64  
 1   area_code               3333 non-null   int64  
 2   international_plan      3333 non-null   object 
 3   voice_mail_plan         3333 non-null   object 
 4   number_vmail_messages   3333 non-null   int64  
 5   total_day_minutes       3333 non-null   float64
 6   total_day_calls         3333 non-null   int64  
 7   total_day_charge        3333 non-null   float64
 8   total_eve_minutes       3333 non-null   float64
 9   total_eve_calls         3333 non-null   int64  
 10  total_eve_charge        3333 non-null   float64
 11  total_night_minutes     3333 non-null   float64
 12  total_night_calls       3333 non-null   int64  
 13  total_night_charge      3333 non-null   float64
 14  total_intl_minutes      3333 non-null   

# Encoded Categorical Variables

Below, I applied binary (label) encoding to the two yes/no categorical features (`international_plan` and `voice_mail_plan`), mapping `yes` to 1 and `no` to 0 so that they can be used in the model.

In [4]:
# label encoding the binary categorical variables
yes_no_codes = {'yes': 1, 'no': 0}

for plan_column in ['international_plan', 'voice_mail_plan']:
    # Skip columns that already hold only 0/1: mapping them a second time
    # would silently replace every value with NaN, so re-running this cell
    # must leave an already-encoded column untouched.
    if df[plan_column].isin([0, 1]).all():
        continue

    encoded = df[plan_column].map(yes_no_codes)

    # .map() turns any value missing from the dictionary into NaN; fail loudly
    # here rather than handing NaNs to the later cells.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), plan_column].unique().tolist()
        raise ValueError(f"Unexpected values in '{plan_column}': {unexpected}")

    df[plan_column] = encoded

# verifying the encoding
print(df[['international_plan', 'voice_mail_plan']].head())

   international_plan  voice_mail_plan
0                   0                1
1                   0                1
2                   0                0
3                   1                0
4                   1                0


In [5]:
# list the dtype of every column to check the result of the encoding step
column_dtypes = df.dtypes
print(column_dtypes)

account_length              int64
area_code                   int64
international_plan          int64
voice_mail_plan             int64
number_vmail_messages       int64
total_day_minutes         float64
total_day_calls             int64
total_day_charge          float64
total_eve_minutes         float64
total_eve_calls             int64
total_eve_charge          float64
total_night_minutes       float64
total_night_calls           int64
total_night_charge        float64
total_intl_minutes        float64
total_intl_calls            int64
total_intl_charge         float64
customer_service_calls      int64
churn                        bool
dtype: object


# Class Imbalance Note

The target classes are imbalanced. This will be handled during the model training phase (through oversampling, undersampling, or class weights).

# Feature Scaling and Data Splitting

Features were scaled using `StandardScaler` to ensure all variables contribute equally during model training. Additionally, the dataset was split into training and testing sets, with 80% of the data used for training and 20% for testing.

In [6]:
# separating features (X) and target (y)
X = df.drop('churn', axis=1)
y = df['churn']

# Fit the scaler on the training rows only. Fitting it on the full dataset
# lets the test rows influence the mean and standard deviation used for
# scaling, which leaks test information into training (data leakage).
# test_size and random_state mirror the train/test split performed later in
# this notebook so that the rows used here are the rows that become X_train;
# keep the two calls in sync.
X_train_unscaled, _ = train_test_split(X, test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train_unscaled)

# apply the training-set statistics to every row
X_scaled = scaler.transform(X)

# scaled features plus the target, saved as a single DataFrame
df_preprocessed = pd.concat([pd.DataFrame(X_scaled, columns=X.columns), y.reset_index(drop=True)], axis=1)
joblib.dump(df_preprocessed, '../data/processed/preprocessed_data.pkl')

['../data/processed/preprocessed_data.pkl']

In [7]:
# 80/20 train/test partition of the scaled features and the churn target;
# random_state is fixed so the partition is reproducible
# NOTE(review): the split is not stratified on y; given the class imbalance
# mentioned earlier in this notebook, consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# persist each piece of the train/test split as its own pickle file
split_artifacts = {
    '../data/processed/X_train.pkl': X_train,
    '../data/processed/X_test.pkl': X_test,
    '../data/processed/y_train.pkl': y_train,
    '../data/processed/y_test.pkl': y_test,
}
for artifact_path, artifact in split_artifacts.items():
    joblib.dump(artifact, artifact_path)

# persist the full scaled feature array as well (last expression, so the
# written path is shown as the cell output)
joblib.dump(X_scaled, '../data/processed/X_scaled.pkl')

['../data/processed/X_scaled.pkl']

## Conclusion of Preprocessing

In this notebook, I have:
- Dropped unnecessary columns.
- Encoded categorical variables.
- Scaled the features.
- Split the dataset into training and testing sets.

I will now proceed with model building and evaluation, which will be performed in a separate notebook.