### Importing libraries and data

In [7]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [8]:
# state where the data comes from
print("Source:", "/")

# load the training CSV into a DataFrame
csv_path = '../../data/day_3/telco-customer-churn/train.csv'
data = pd.read_csv(csv_path)

# display the loaded frame
data

Source: /


Unnamed: 0,Age,Avg Monthly GB Download,Avg Monthly Long Distance Charges,Churn Category,Churn Reason,Churn Score,City,CLTV,Contract,Country,...,Tenure in Months,Total Charges,Total Extra Data Charges,Total Long Distance Charges,Total Refunds,Total Revenue,Under 30,Unlimited Data,Zip Code,Churn
0,72,4,19.44,,,51,San Mateo,4849,Two Year,United States,...,25,2191.15,0,486.00,0.00,2677.15,0,1,94403,0
1,27,59,45.62,,,27,Sutter Creek,3715,Month-to-Month,United States,...,35,3418.20,0,1596.70,0.00,5014.90,1,1,95685,0
2,59,0,16.07,,,59,Santa Cruz,5092,Month-to-Month,United States,...,46,851.20,0,739.22,0.00,1590.42,0,0,95064,0
3,25,27,0.00,,,49,Brea,2068,One Year,United States,...,27,1246.40,30,0.00,0.00,1276.40,1,0,92823,0
4,31,21,17.22,Dissatisfaction,Network reliability,88,San Jose,4026,One Year,United States,...,58,3563.80,0,998.76,0.00,4562.56,0,1,95117,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4220,36,0,7.76,,,52,Long Beach,4590,Month-to-Month,United States,...,1,19.90,0,7.76,0.00,27.66,0,0,90806,0
4221,77,22,23.43,,,34,San Francisco,5602,One Year,United States,...,22,1820.90,0,515.46,29.88,2306.48,0,1,94127,0
4222,56,0,28.06,,,63,Olivehurst,4458,One Year,United States,...,18,345.90,0,505.08,0.00,850.98,0,0,95961,0
4223,45,22,0.00,,,38,Westlake Village,4115,Month-to-Month,United States,...,57,3437.45,0,0.00,0.00,3437.45,0,1,91361,0


### Train-Test split

In [4]:
# Separate the features from the target and split both into train and test sets.

# Name of the target column. The same name must be used for dropping the column
# from the features and for selecting the target; the previous version dropped
# "churn" but selected "survived", which cannot both be right.
# NOTE(review): the loading cell reads a telco churn CSV whose displayed target
# column is "Churn", while every later cell works on Titanic columns - confirm
# which dataset this notebook is meant to run on.
TARGET_COLUMN = "survived"

# fail early with a readable message if the loaded data does not match
if TARGET_COLUMN not in data.columns:
    raise KeyError(
        f"Target column '{TARGET_COLUMN}' not found in the loaded data; "
        f"available columns: {list(data.columns)}"
    )

# features: everything except the target column
input_data = data.drop(columns=[TARGET_COLUMN])

# define the target variable separately
target_variable = data[TARGET_COLUMN]

# perform the train-test split (20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(input_data, target_variable, test_size=0.2, random_state=42)

# remember the initial train size; the row-dropping cell reports against it
init_length_X_train = len(X_train)
print("Train set size:", init_length_X_train)

Train set size: 712


### Data cleaning

In [5]:
# list every column together with the dtype pandas inferred when reading
# (printed header is German for "column, parsed data type")
print("Spalte, Eingelesener datentyp", "\n")

for col_name, col_dtype in X_train.dtypes.items():
    print(f"{col_name}, {col_dtype}")

# collect the object-typed columns; they are inspected in the next cell
object_cols = [col_name for col_name, col_dtype in X_train.dtypes.items() if col_dtype == "object"]

# (printed label is German for "possibly faulty columns")
print("\n", "Möglicherweise fehlerhafte Spalten:", object_cols)

Spalte, Eingelesener datentyp 

pclass, int64
sex, object
age, float64
sibsp, int64
parch, int64
fare, float64
embarked, object
class, object
who, object
adult_male, bool
deck, object
embark_town, object
alone, bool

 Möglicherweise fehlerhafte Spalten: ['sex', 'embarked', 'class', 'who', 'deck', 'embark_town']


In [6]:
# print the distinct values of every object-typed column
for col_name in object_cols:
    distinct_values = X_train[col_name].unique()
    print(col_name, distinct_values)

sex ['male' 'female']
embarked ['S' 'C' 'Q' nan]
class ['First' 'Second' 'Third']
who ['man' 'child' 'woman']
deck ['C' nan 'B' 'F' 'D' 'E' 'A' 'G']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' nan]


In [7]:
# count the missing entries per column; the threshold cell below reads this Series
missing_values = X_train.isna().sum()
print(missing_values, "\n")

pclass           0
sex              0
age            140
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           554
embark_town      2
alone            0
dtype: int64 



In [8]:
# Decide per column how its missing values are handled:
#   - few missing values (below the threshold): the affected ROWS are dropped later
#   - otherwise: the column is kept and its gaps are imputed later

# relative threshold (share of the train set) and the resulting absolute row count
drop_below_relative = 0.01
drop_below_absolute = drop_below_relative * len(X_train)

# only columns that actually contain missing values are of interest
has_missing = missing_values > 0
below_threshold = missing_values < drop_below_absolute

# columns whose number of missing values is below the threshold
drop_cols = missing_values[has_missing & below_threshold].index.tolist()
print("The rows with missing values of the following columns will be dropped:", "\n", drop_cols, "\n\n")

# all remaining columns with missing values. Using the complement of the mask
# (i.e. >= threshold) guarantees that a column whose count equals the threshold
# exactly is not silently left out of both lists.
not_drop_cols = missing_values[has_missing & ~below_threshold].index.tolist()
print("The rows with missing values of the following columns will be NOT dropped:", "\n", not_drop_cols)

The rows with missing values of the following columns will be dropped: 
 ['embarked', 'embark_town'] 


The rows with missing values of the following columns will be NOT dropped: 
 ['age', 'deck']


In [9]:
# flag every row that has a missing value in at least one of the columns in drop_cols
has_missing_in_drop_cols = X_train[drop_cols].isnull().any(axis=1)

# index labels of the flagged rows (each label is listed once)
rows_to_drop = X_train.index[has_missing_in_drop_cols].tolist()

# remove the rows from the features and from the target, then renumber both from 0
# so that X_train and y_train stay aligned
for frame in (X_train, y_train):
    frame.drop(rows_to_drop, inplace=True)
    frame.reset_index(drop=True, inplace=True)

print("Dropped", init_length_X_train-len(X_train), "rows with missing values")

Dropped 2 rows with missing values


In [10]:
# fill the remaining gaps column by column: every missing entry receives a value
# drawn at random (with replacement, fixed seed) from the observed entries of
# the same column
for col in not_drop_cols:
    missing_mask = X_train[col].isnull()
    num_imputed_values = missing_mask.sum()

    # pool of valid values to draw from
    observed_values = X_train[col].dropna()
    drawn = observed_values.sample(n=num_imputed_values, replace=True, random_state=42)

    # .values strips the index so the draws are assigned by position, not by label
    X_train.loc[missing_mask, col] = drawn.values

    print(f"Imputed {num_imputed_values} values in column '{col}'", "\n")

Imputed 140 values in column 'age' 

Imputed 554 values in column 'deck' 



In [11]:
# look for redundancy in rows and columns

# count the rows that are exact copies of an earlier row
num_duplicate_rows = X_train.duplicated(keep="first").sum()
print(f"Number of identical rows in the dataset: {num_duplicate_rows}", "\n")

# the duplicates are only reported here, not removed
print("Are those correct or incorrect duplicates?", "\n")

# drop the column that repeats the information of another column
X_train.drop("embark_town", axis=1, inplace=True)
print("Dropped 'embark_town' column as it is redundant to 'embarked' column")

Number of identical rows in the dataset: 16 

Are those correct or incorrect duplicates? 

Dropped 'embark_town' column as it is redundant to 'embarked' column


### Encodings

In [12]:
# ordinal encoding - for categories that have a natural order

# columns to encode and, per column, its categories listed from lowest to highest
ordinal_cols = ['class']
ordinal_map = [['Third', 'Second', 'First']]

# learn the mapping on the training data; the fitted encoder is reused for the test data
ordinal_encoder = OrdinalEncoder(categories=ordinal_map)
ordinal_encoder.fit(X_train[ordinal_cols])

# replace the category labels by their position in the category list
X_train[ordinal_cols] = ordinal_encoder.transform(X_train[ordinal_cols])

X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alone
0,1,male,45.5,0,0,28.5000,S,2.0,man,True,C,True
1,2,male,23.0,0,0,13.0000,S,1.0,man,True,D,True
2,3,male,32.0,0,0,7.9250,S,0.0,man,True,C,True
3,3,male,26.0,1,0,7.8542,S,0.0,man,True,F,False
4,3,female,6.0,4,2,31.2750,S,0.0,child,False,A,False
...,...,...,...,...,...,...,...,...,...,...,...,...
705,3,female,21.0,0,0,7.6500,S,0.0,woman,False,C,True
706,1,male,22.0,0,0,31.0000,S,2.0,man,True,B,True
707,3,male,41.0,2,0,14.1083,S,0.0,man,True,E,False
708,1,female,14.0,1,2,120.0000,S,2.0,child,False,B,False


In [13]:
# nominal encoding - without order (One-Hot Encoding)

# categorical columns whose values have no natural ranking
nominal_cols = ['sex', 'embarked', 'who', 'deck']

# expand every nominal column into indicator columns; drop_first omits the
# indicator of the first category of each column
X_train = pd.get_dummies(data=X_train, columns=nominal_cols, drop_first=True)

X_train

Unnamed: 0,pclass,age,sibsp,parch,fare,class,adult_male,alone,sex_male,embarked_Q,embarked_S,who_man,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G
0,1,45.5,0,0,28.5000,2.0,True,True,True,False,True,True,False,False,True,False,False,False,False
1,2,23.0,0,0,13.0000,1.0,True,True,True,False,True,True,False,False,False,True,False,False,False
2,3,32.0,0,0,7.9250,0.0,True,True,True,False,True,True,False,False,True,False,False,False,False
3,3,26.0,1,0,7.8542,0.0,True,False,True,False,True,True,False,False,False,False,False,True,False
4,3,6.0,4,2,31.2750,0.0,False,False,False,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,3,21.0,0,0,7.6500,0.0,False,True,False,False,True,False,True,False,True,False,False,False,False
706,1,22.0,0,0,31.0000,2.0,True,True,True,False,True,True,False,True,False,False,False,False,False
707,3,41.0,2,0,14.1083,0.0,True,False,True,False,True,True,False,False,False,False,True,False,False
708,1,14.0,1,2,120.0000,2.0,False,False,False,False,True,False,False,True,False,False,False,False,False


### Scaling

In [14]:
# cast every column to float64 so the frame has one uniform numeric dtype
X_train = X_train.astype('float64')

# only these columns are standardised
columns_to_scale = ['age', 'fare']

# learn the scaling parameters from the training data only;
# the fitted scaler is reused unchanged for the test data
scaler = StandardScaler()
scaler.fit(X_train[columns_to_scale])

# apply the learned scaling to the training columns
X_train[columns_to_scale] = scaler.transform(X_train[columns_to_scale])

X_train

Unnamed: 0,pclass,age,sibsp,parch,fare,class,adult_male,alone,sex_male,embarked_Q,embarked_S,who_man,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G
0,1.0,1.115354,0.0,0.0,-0.076094,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2.0,-0.447176,0.0,0.0,-0.374487,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3.0,0.177836,0.0,0.0,-0.472186,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3.0,-0.238839,1.0,0.0,-0.473549,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3.0,-1.627755,4.0,2.0,-0.022672,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,3.0,-0.586068,0.0,0.0,-0.477480,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
706,1.0,-0.516622,0.0,0.0,-0.027966,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
707,3.0,0.802848,2.0,0.0,-0.353151,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
708,1.0,-1.072188,1.0,2.0,1.685383,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Feature engineering

In [15]:
# add two engineered features in one step:
#   family_size           - siblings/spouses + parents/children + the passenger;
#                           the family size could be an indicator of survival
#   age_class_interaction - product of "age" and "pclass", which could be an
#                           indicator of status and therefore survival
# note: "age" has already been standardised by the scaling cell at this point
X_train = X_train.assign(
    family_size=X_train["sibsp"] + X_train["parch"] + 1,
    age_class_interaction=X_train["age"] * X_train["pclass"],
)

X_train

Unnamed: 0,pclass,age,sibsp,parch,fare,class,adult_male,alone,sex_male,embarked_Q,...,who_man,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,family_size,age_class_interaction
0,1.0,1.115354,0.0,0.0,-0.076094,2.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.115354
1,2.0,-0.447176,0.0,0.0,-0.374487,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.894352
2,3.0,0.177836,0.0,0.0,-0.472186,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.533508
3,3.0,-0.238839,1.0,0.0,-0.473549,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,-0.716516
4,3.0,-1.627755,4.0,2.0,-0.022672,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,-4.883264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,3.0,-0.586068,0.0,0.0,-0.477480,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.758203
706,1.0,-0.516622,0.0,0.0,-0.027966,2.0,1.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.516622
707,3.0,0.802848,2.0,0.0,-0.353151,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.408545
708,1.0,-1.072188,1.0,2.0,1.685383,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,-1.072188


### Processing the test data

We need to process the test data in the same way as we processed the training data.

Otherwise the model will miss some of the artificial columns and values we created.

We have to be very careful to not leak any information from the test set into our model, or from the training set into the test set.

In [16]:
# Apply the training preprocessing to the test data, reusing the encoder and
# scaler that were fitted on the training data (no re-fitting on test data).

# drop the redundant column
X_test.drop(columns=['embark_town'], inplace=True)

# impute missing values by random sampling from the observed test values of the
# same column (the former `if X_test[col].isnull:` guard tested a method object,
# which is always truthy, so it has been removed)
for col in X_test.columns:
    missing_mask_test = X_test[col].isnull()
    num_missing_test = missing_mask_test.sum()
    if num_missing_test > 0:
        sampled_values_test = X_test.loc[~missing_mask_test, col].sample(
            n=num_missing_test,
            replace=True,
            random_state=42
        ).values
        X_test.loc[missing_mask_test, col] = sampled_values_test

# ordinal encoding with the encoder fitted on the training data
X_test[ordinal_cols] = ordinal_encoder.transform(X_test[ordinal_cols])

# one-hot encoding WITHOUT drop_first: which category is "first" depends on the
# categories present in the test data, so dropping it here could remove a
# different reference category than in the training data
X_test = pd.get_dummies(X_test, columns=nominal_cols)

# align the columns with the training data:
#   - indicator columns missing in the test data are added and filled with 0
#   - columns unknown to the training data (the reference categories dropped
#     there, or categories only seen in the test data) are removed
#   - the column order matches X_train
# the engineered feature columns are created as placeholders here and
# overwritten with their real values below
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# convert the test data to float64 and reset the indices
X_test = X_test.astype('float64')
X_test.reset_index(drop=True, inplace=True)

# scale the corresponding columns
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale]) # NO FIT-TRANSFORM HERE!

# add the engineered features to the test data
X_test["family_size"]  = X_test["sibsp"] + X_test["parch"] + 1
X_test["age_class_interaction"]  = X_test["age"] * X_test["pclass"]

# the model expects exactly the training columns in exactly the training order
assert list(X_test.columns) == list(X_train.columns), "test columns differ from train columns"

X_test

Unnamed: 0,pclass,age,sibsp,parch,fare,class,adult_male,alone,sex_male,embarked_Q,...,who_man,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,family_size,age_class_interaction
0,3.0,2.330656,1.0,1.0,-0.331252,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,6.991967
1,2.0,0.108390,0.0,0.0,-0.422614,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.216780
2,3.0,-0.655514,0.0,0.0,-0.472186,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.966541
3,2.0,-1.627755,0.0,1.0,0.010536,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,-3.255510
4,3.0,-1.072188,1.0,0.0,-0.408336,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,-3.216565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,3.0,-0.863851,0.0,0.0,-0.487587,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-2.591553
175,3.0,-0.447176,0.0,0.0,-0.485662,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.341529
176,3.0,0.594511,1.0,5.0,-0.020507,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,1.783532
177,2.0,-0.863851,0.0,0.0,-0.422614,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.727702
