In [111]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [112]:
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.set_option('display.max_columns', None)

In [113]:
# Read the training CSV once; `raw_train_og` stays untouched as a reference copy
# while `raw_train` is the independent working frame modified by later cells.
raw_train_path = os.path.join('data', 'train.csv')
raw_train_og = pd.read_csv(raw_train_path)
raw_train = raw_train_og.copy()

In [114]:
# Read the test CSV once; `raw_test_og` stays untouched as a reference copy
# while `raw_test` is the independent working frame modified by later cells.
raw_test_path = os.path.join('data', 'test.csv')
raw_test_og = pd.read_csv(raw_test_path)
raw_test = raw_test_og.copy()

In [115]:
# Preview the first five rows of the freshly loaded training frame.
raw_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [116]:
# List each column's dtype and non-null count to see which columns have missing values.
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [117]:
# Keep the target column as its own Series.
# NOTE(review): this is taken before the rows with a missing Embarked value are
# dropped from raw_train in the "Basic cleanup" section, so y_train retains those
# rows. Re-derive it from the cleaned frame before pairing it with the features.
y_train = raw_train['Survived']

In [118]:
# Remove the columns that the rest of the notebook does not use for the training frame.
# errors='ignore' makes the cell safe to re-run: without it a second execution raises
# KeyError because the columns are already gone.
raw_train.drop(columns=['PassengerId', 'Ticket'], inplace=True, errors='ignore')

In [119]:
# Count the missing values in every remaining column.
raw_train.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [120]:
# Check the (rows, columns) dimensions of the frame after the column drop above.
raw_train.shape

(891, 10)

# Filling null Age values

In [121]:
from sklearn.base import BaseEstimator, TransformerMixin

In [122]:
class AgeCleanup(BaseEstimator, TransformerMixin):
    """Fill missing ``Age`` values with the mean age of passengers sharing a title.

    Every passenger is placed in a group according to the first of ``'Master'``,
    ``'Miss'``, ``'Mr.'`` or ``'Mrs.'`` that occurs in ``Name`` (checked in that
    order). Names containing none of them form one additional group. A missing
    age is replaced by the mean age of its group, computed from the frame that is
    passed to ``transform``.
    """

    # Titles searched for in ``Name``; order matters because the first match wins.
    _TITLES = ('Master', 'Miss', 'Mr.', 'Mrs.')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; group means are computed in ``transform``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose missing ``Age`` entries are imputed.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``'Name'`` and ``'Age'``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame with the same columns as ``X``. The input frame is left
            unmodified. Ages in a group that has no known age at all stay missing.
        """
        # Work on a copy so the caller's frame is never mutated.
        X = X.copy()
        names = X['Name']

        # One boolean mask per title. na=False makes a missing name match no title
        # instead of being treated as a hit.
        title_hits = [
            names.str.contains(title, regex=False, na=False).to_numpy(dtype=bool)
            for title in self._TITLES
        ]

        # np.select keeps the first matching title; '' marks "no title found".
        title_group = np.select(title_hits, list(self._TITLES), default='')

        # Per-row mean age of the row's title group (NaN ages are skipped by mean).
        group_mean_age = X['Age'].groupby(title_group).transform('mean')
        X['Age'] = X['Age'].fillna(group_mean_age)

        return X

# Filling null Cabin values

In [123]:
class CabinCleanup(BaseEstimator, TransformerMixin):
    """Fill missing ``Cabin`` values with a randomly drawn cabin letter.

    For every ``Pclass`` the relative frequency of the first character of the
    known ``Cabin`` values is measured on the frame passed to ``transform``.
    Each missing cabin is then replaced by one letter drawn from the distribution
    of the passenger's class.

    Parameters
    ----------
    random_state : None, int or numpy.random.RandomState, default None
        Source of randomness. ``None`` draws from NumPy's global random stream
        (so ``np.random.seed`` still controls the result); an int seeds a private
        ``RandomState``; a ``RandomState`` instance is used as given.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state

    def fit(self, X, y=None):
        """Do nothing and return ``self``; frequencies are computed in ``transform``."""
        return self

    def _random_source(self):
        """Return the object whose ``choice`` method is used for sampling."""
        if self.random_state is None:
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    @staticmethod
    def _letter_distributions(X):
        """Map each Pclass to ``(letters, probabilities)`` from the known cabins."""
        cabin_letter = X['Cabin'].str[:1]
        shares = cabin_letter.groupby(X['Pclass']).value_counts(normalize=True)

        distributions = {}
        for (pclass, letter), share in shares.items():
            letters, probs = distributions.setdefault(pclass, ([], []))
            letters.append(letter)
            probs.append(share)

        # Letter 'T' is never drawn for first class: its probability is shared
        # equally among the other first-class letters. Dividing by the actual
        # number of remaining letters keeps the probabilities summing to one
        # however many letters are present. If 'T' is the only letter there is
        # nothing to share it with, so it is kept.
        first_class = distributions.get(1)
        if first_class is not None:
            letters, probs = first_class
            if 'T' in letters and len(letters) > 1:
                t_share = probs.pop(letters.index('T'))
                letters.remove('T')
                bonus = t_share / len(letters)
                probs[:] = [p + bonus for p in probs]

        return distributions

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing ``Cabin`` entries filled in.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``'Cabin'`` and ``'Pclass'``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; known cabins are unchanged and each missing cabin holds
            a single sampled letter. Rows whose ``Pclass`` has no known cabin at
            all are left missing because there is no distribution to draw from.
        """
        filled = X.copy()
        distributions = self._letter_distributions(filled)
        rng = self._random_source()

        # One draw per missing row, in row order.
        for row in filled.index[filled['Cabin'].isnull()]:
            known = distributions.get(filled.at[row, 'Pclass'])
            if known is None:
                continue
            letters, probs = known
            filled.at[row, 'Cabin'] = rng.choice(letters, p=probs)

        return filled

In [124]:
from sklearn.pipeline import Pipeline

# The cabin step draws random letters from NumPy's global stream, so seed it to
# make a fresh "Restart & Run All" reproduce the same filled values.
SEED = 42
np.random.seed(SEED)

cleanup_pipeline = Pipeline([('age_handle_null', AgeCleanup()),
                             ('cabin_cleanup', CabinCleanup())])

# Fit on the training frame only; the test frame is only transformed.
raw_train = cleanup_pipeline.fit_transform(raw_train)
raw_test = cleanup_pipeline.transform(raw_test)

# Basic cleanup

In [125]:
# Discard the rows without an embarkation port, remove the Name column and keep
# only the first character of every cabin value.
raw_train = (
    raw_train
    .dropna(subset=['Embarked'])
    .drop(columns='Name')
    .assign(Cabin=lambda frame: frame['Cabin'].str[:1])
)

In [126]:
# Column pruning for the test frame. No rows are dropped here, so the frame keeps
# every test passenger. Cabin values are reduced to their first character.
raw_test = (
    raw_test
    .drop(columns=['PassengerId', 'Ticket'])
    .drop(columns='Name')
    .assign(Cabin=lambda frame: frame['Cabin'].str[:1])
)

In [127]:
from sklearn.preprocessing import LabelEncoder
# Single module-level LabelEncoder instance, created once for the notebook session.
label_encoder = LabelEncoder()

In [128]:
class Encoding(BaseEstimator, TransformerMixin):
    """Turn the ``Sex``, ``Cabin`` and ``Embarked`` columns into numeric features.

    * ``Sex``: ``'male'`` becomes 0 and ``'female'`` becomes 1.
    * ``Cabin``: each value is replaced by its position in the sorted list of
      cabin values seen during ``fit``.
    * ``Embarked``: replaced by one indicator column per level seen during
      ``fit``, except the first level in sorted order.

    The levels are learned in ``fit`` so that a frame transformed later receives
    the same codes and the same indicator columns as the frame used for fitting.
    """

    def __init__(self):
        pass

    @staticmethod
    def _sorted_levels(column):
        """Return the distinct non-null values of ``column`` in sorted order."""
        return sorted(column.dropna().unique())

    def fit(self, X, y=None):
        """Remember the levels of ``Cabin`` and ``Embarked`` found in ``X``."""
        self.cabin_classes_ = self._sorted_levels(X['Cabin'])
        self.embarked_classes_ = self._sorted_levels(X['Embarked'])
        return self

    def transform(self, X, y=None):
        """Return an encoded copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``'Sex'``, ``'Cabin'`` and ``'Embarked'``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            ``X`` with ``Sex`` and ``Cabin`` encoded, the indicator columns
            appended and ``Embarked`` removed. A ``Cabin`` value that is missing
            or was not seen during ``fit`` is coded as -1. An ``Embarked`` value
            that is missing or unseen gets zeros in every indicator column.
        """
        # If transform is called without a prior fit, fall back to the levels of
        # the frame being transformed.
        cabin_classes = getattr(self, 'cabin_classes_', None)
        if cabin_classes is None:
            cabin_classes = self._sorted_levels(X['Cabin'])
        embarked_classes = getattr(self, 'embarked_classes_', None)
        if embarked_classes is None:
            embarked_classes = self._sorted_levels(X['Embarked'])

        # Work on a copy so the caller's frame is never mutated.
        X = X.copy()
        X['Sex'] = X['Sex'].replace({'male': 0, 'female': 1})

        # Integer code = position of the value in the learned, sorted level list.
        X['Cabin'] = pd.Categorical(X['Cabin'], categories=cabin_classes).codes.astype('int64')

        # A categorical with fixed levels makes get_dummies emit one column per
        # learned level (minus the first), whichever levels occur in this frame.
        embarked = X['Embarked'].astype(pd.CategoricalDtype(categories=embarked_classes))
        embarked_encoded = pd.get_dummies(embarked, drop_first=True)
        # Use plain column labels rather than a categorical column index.
        embarked_encoded.columns = list(embarked_encoded.columns)

        return X.join(embarked_encoded).drop(columns='Embarked')
        

In [129]:
# Fit the encoding pipeline on the training frame only and reuse the fitted
# pipeline for the test frame instead of fitting it again on test data.
encoding_pipeline = Pipeline([('encoding', Encoding())])
raw_train = encoding_pipeline.fit_transform(raw_train)
raw_test = encoding_pipeline.transform(raw_test)

In [130]:
# Preview the first five rows of the encoded training frame.
raw_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Q,S
0,0,3,0,22.0,1,0,7.25,5,0,1
1,1,1,1,38.0,1,0,71.2833,2,0,0
2,1,3,1,26.0,0,0,7.925,6,0,1
3,1,1,1,35.0,1,0,53.1,2,0,1
4,0,3,0,35.0,0,0,8.05,6,0,1


In [131]:
# Preview the first five rows of the encoded test frame.
raw_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Q,S
0,3,0,34.5,0,0,7.8292,5,1,0
1,3,1,47.0,1,0,7.0,5,0,1
2,2,0,62.0,0,0,9.6875,3,1,0
3,3,0,27.0,0,0,8.6625,6,0,1
4,3,1,22.0,1,1,12.2875,6,0,1


In [132]:
# Write both processed frames to the data directory without the index column.
for processed_frame, csv_name in ((raw_train, 'train_processed.csv'),
                                  (raw_test, 'test_processed.csv')):
    processed_frame.to_csv(os.path.join('data', csv_name), index=False)