In [2]:
import numpy as np
import pandas as pd

from prefect import task,flow
from joblib import dump,load

## Load Processed Train Data

In [25]:
data = pd.read_csv("../data/train_processed_data.csv")
data.drop(columns=["Unnamed: 0"],inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5714 entries, 0 to 5713
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5714 non-null   object 
 1   gender            5714 non-null   object 
 2   SeniorCitizen     5714 non-null   object 
 3   Partner           5714 non-null   object 
 4   Dependents        5714 non-null   object 
 5   tenure            5714 non-null   int64  
 6   PhoneService      5714 non-null   object 
 7   MultipleLines     5714 non-null   object 
 8   InternetService   5714 non-null   object 
 9   OnlineSecurity    5714 non-null   object 
 10  OnlineBackup      5714 non-null   object 
 11  DeviceProtection  5714 non-null   object 
 12  TechSupport       5714 non-null   object 
 13  StreamingTV       5714 non-null   object 
 14  StreamingMovies   5714 non-null   object 
 15  Contract          5714 non-null   object 
 16  PaperlessBilling  5714 non-null   object 


In [27]:
X,y = data.drop("Churn",axis=1),data["Churn"]

In [26]:
# Build Load Task
@task
def load_data(data):
    data = pd.DataFrame(data,columns=['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'])
    return data

## Feature Selection

**CustomerID** is an identity feature with no benefit for the task.<br>
So it will get dropped.

In [5]:
data.drop(columns=["customerID"],inplace=True)

## Feature Cleaning

In [6]:
data.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        7
Churn               0
dtype: int64

We see that **Total Charges** has
7 missing values.

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
# Fill with Mode
mode_features = ["gender","SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract","PaperlessBilling","PaymentMethod"]
mode_imputer = SimpleImputer(strategy="most_frequent")
mode_imputer.fit(data[mode_features])

dump(mode_imputer,"../pipelines/mode_imputer.joblib")

['../pipelines/mode_imputer.joblib']

In [9]:
# Fill with Mean
mean_features = ["tenure","MonthlyCharges","TotalCharges"]
mean_imputer = SimpleImputer(strategy="mean")
mean_imputer.fit(data[mean_features])

dump(mean_imputer,"../pipelines/mean_imputer.joblib")

['../pipelines/mean_imputer.joblib']

In [10]:
# Build The Clean Task
@task
def clean_data(org_data):
    data = org_data.copy()
    mode_features = ["gender","SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract","PaperlessBilling","PaymentMethod"]
    mean_features = ["tenure","MonthlyCharges","TotalCharges"]

    mode_cleaned_data = pd.DataFrame(mode_imputer.transform(data[mode_features]),columns=mode_features)
    mean_cleaned_data = pd.DataFrame(mean_imputer.transform(data[mean_features]),columns=mean_features)

    return pd.concat([mode_cleaned_data,mean_cleaned_data],axis=1)

## Feature Engineering

Create **new features** from the **existing ones**.

In [11]:
# Build The Feature Engineering Task
@task
def feature_engineering(org_data):
    data = pd.DataFrame(org_data,columns=['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
   'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
   'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
   'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
   'MonthlyCharges', 'TotalCharges', 'Churn'])
        
    data['TenureCategory'] = pd.cut(data['tenure'], bins=[-1,3,12,np.inf],labels=["Recent", "Established", "Loyal"]) 
    data['Tenure_X_MonthlyCharges'] = data['tenure'] * data['MonthlyCharges']
    data['HighSpender'] = data['TotalCharges'].apply(lambda x: True if x > 5000 else False)
    data['MonthlyChargesRatio'] = np.where(data['TotalCharges']!=0,data['MonthlyCharges'] / data['TotalCharges'],0)
    data['SqrtTotalCharges'] = data['TotalCharges'].apply(np.sqrt)
    data['SqrtMonthlyCharges'] = data['MonthlyCharges'].apply(np.sqrt)
    data['AutomaticPayment'] = data['PaymentMethod'].apply(lambda x: True if "automatic" in x else False)
    data['AllServicesActivated'] =  (data['OnlineSecurity'] == "Yes") & (data['OnlineBackup'] == "Yes") & \
                                   (data['DeviceProtection'] == "Yes") & (data['TechSupport'] == "Yes")
    data['TeleServicesActivated'] = (data['PhoneService'] == "Yes") & (data['MultipleLines'] == "Yes") & \
                                    (data['InternetService'] == "Yes")
    data['Connected'] = (data['Partner'] == "Yes") & (data['Dependents'] == "Yes")
    
    return data

In [28]:
processed_data = feature_engineering(clean_data(load_data(X)))

## Numerical Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()
scaler.fit(processed_data[["tenure","MonthlyCharges","TotalCharges","MonthlyChargesRatio","Tenure_X_MonthlyCharges","SqrtTotalCharges","SqrtMonthlyCharges"]])

dump(scaler,"../pipelines/scaler.joblib")

['../pipelines/scaler.joblib']

In [17]:
# Build The Scaling Task
num_features = ["tenure","MonthlyCharges","TotalCharges","MonthlyChargesRatio","Tenure_X_MonthlyCharges","SqrtTotalCharges","SqrtMonthlyCharges"]

@task
def scaling_data(data):
    data[num_features] = scaler.transform(data[num_features])

    return data

## Categorical Feature Encoding

In [18]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder

In [19]:
ordinal_features = ["SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract","PaperlessBilling","TenureCategory","HighSpender","AutomaticPayment","AllServicesActivated","TeleServicesActivated","Connected"]

binary = [False,True]
binary_yes_no = ["No","Yes"]
multi = ["No internet service","No","Yes"]
custom_categories = [
    binary_yes_no,
    binary_yes_no,
    binary_yes_no,
    binary_yes_no,
    binary_yes_no,
    ["No","DSL","Fiber optic"],
    multi,
    multi,
    multi,
    multi,
    multi,
    multi,
    ["Month-to-month","One year","Two year"],
    binary_yes_no,
    ["Recent", "Established", "Loyal"],
    binary,
    binary,
    binary,
    binary,
    binary,
]
ord_enc = OrdinalEncoder(categories=custom_categories)
ord_enc.fit(processed_data[ordinal_features])

dump(ord_enc,"../pipelines/ord_enc.joblib")

['../pipelines/ord_enc.joblib']

In [20]:
nominal_features = ["gender","PaymentMethod"]
one_hot_enc = OneHotEncoder(sparse_output=False)
one_hot_enc.fit(processed_data[nominal_features])

dump(one_hot_enc,"../pipelines/one_hot_enc.joblib")

['../pipelines/one_hot_enc.joblib']

In [21]:
# Build The Encoding Task
@task
def encoding_data(org_data):
    data = org_data.copy()
    ordinal_features = ["SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract","PaperlessBilling","TenureCategory","HighSpender","AutomaticPayment","AllServicesActivated","TeleServicesActivated","Connected"]
    nominal_features = ["gender","PaymentMethod"]

    data[ordinal_features] = ord_enc.transform(data[ordinal_features])
    
    one_hot_encoded_data = one_hot_enc.transform(data[nominal_features])
    one_hot_encoded_data_columns = one_hot_enc.get_feature_names_out(input_features=data[nominal_features].columns)
    one_hot_data = pd.DataFrame(one_hot_encoded_data,columns=one_hot_encoded_data_columns)
    
    return pd.concat([data,one_hot_data],axis=1).drop(nominal_features,axis=1)

## Build the Data Pipeline

In [22]:
@flow(name="Pipeline")
def pipeline(data):
    processed_data = load_data(data)
    processed_data = clean_data(processed_data)
    processed_data = feature_engineering(processed_data)
    processed_data = scaling_data(processed_data)
    processed_data = encoding_data(processed_data)
    return processed_data

In [31]:
processed_data = pd.concat([pipeline(X),y],axis=1)
processed_data

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,AllServicesActivated,TeleServicesActivated,Connected,gender_Female,gender_Male,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,0.0,1.0,1.0,1.572690,1.0,1.0,2.0,2.0,2.0,2.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,No
1,0.0,0.0,0.0,0.717944,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,No
2,0.0,1.0,1.0,-0.340313,1.0,1.0,1.0,2.0,1.0,2.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,No
3,0.0,0.0,1.0,-0.136802,1.0,1.0,2.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,No
4,0.0,1.0,1.0,-0.381015,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5709,0.0,1.0,0.0,1.572690,1.0,1.0,2.0,1.0,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,No
5710,0.0,0.0,0.0,-0.869441,1.0,1.0,2.0,1.0,1.0,2.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,Yes
5711,0.0,1.0,0.0,-0.340313,1.0,0.0,2.0,1.0,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,Yes
5712,0.0,1.0,0.0,-0.340313,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,No


In [None]:
processed_data.to_csv("./data/train_processed_data.csv")