In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
titanic_train = pd.read_csv("../Data/train.csv")
titanic_test = pd.read_csv("../Data/test.csv")

In [4]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic_train.isnull().sum() / len(titanic_train) * 100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [6]:
titanic_test.isnull().sum() / len(titanic_test) * 100

PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.574163
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.239234
Cabin          78.229665
Embarked        0.000000
dtype: float64

In [7]:
titanic_train.drop(['Cabin'] , inplace=True , axis = 1)

In [8]:
titanic_test.drop(['Cabin'] , inplace=True , axis = 1)

In [9]:
titanic_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [10]:
titanic_train.interpolate(axis=0 , inplace=True)
titanic_test.interpolate(axis=0 , inplace=True)

In [11]:
titanic_train.columns[titanic_train.dtypes == object]

Index(['Name', 'Sex', 'Ticket', 'Embarked'], dtype='object')

In [12]:
titanic_train['Age'].isnull().sum()

0

In [13]:
titanic_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [14]:
titanic_train['Embarked'].unique()[-1]

nan

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
class PreprocessingTitanic : 
    def __init__(self , df : pd.DataFrame) -> None: 
        self.df = df
        self.df_cols = self.df.columns
        self.preprocessAge()
        self.preprocessUselessColumns()
        self.preprocessEmbarked()
        self.preprocessObjectToFloat()
        # return self.df

    def preprocessAge(self) -> None: 
        for i in range(len(self.df)) :
            if self.df['Sex'][i] == self.df['Sex'].unique()[0] : 
                self.df['Sex'][i] = 0
            else : 
                self.df['Sex'][i] = 1

    def preprocessUselessColumns(self) -> None :
        try : 
            self.df.drop(['Name' , 'Ticket' , 'PassengerId'] , inplace=True , axis = 1)
            self.df_cols = self.df.columns
        except KeyError as k :
            pass

    def preprocessEmbarked(self) -> None :
        imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        imputer.fit(self.df)
        self.df = pd.DataFrame(imputer.transform(self.df) , columns = self.df_cols)
        # self.df_cols = self.df.columns
        # print(self.df.head(5))
        # print("After Embark")
    
    def preprocessObjectToFloat(self) -> None:
        obj_cols = self.df.columns[self.df.dtypes == object]
        # print(obj_cols)
        self.df = self.df.apply(pd.to_numeric , errors='ignore')
        # print(self.df.head(5))
        obj_cols = self.df.columns[self.df.dtypes == object]
        # print(obj_cols)
        self.df = pd.get_dummies(data = self.df , columns=obj_cols , drop_first = True , dtype = float)
        # print(self.df.head(5))
        print(self.df.dtypes)
        



In [41]:
preprocessed_titanic_train = PreprocessingTitanic(titanic_train)
preprocessed_titanic_test = PreprocessingTitanic(titanic_test)

Survived        int64
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked_Q    float64
Embarked_S    float64
dtype: object
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked_Q    float64
Embarked_S    float64
dtype: object


In [40]:
preprocessed_titanic_train.df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.2500,0.0,1.0
1,1,1,1,38.0,1,0,71.2833,0.0,0.0
2,1,3,1,26.0,0,0,7.9250,0.0,1.0
3,1,1,1,35.0,1,0,53.1000,0.0,1.0
4,0,3,1,35.0,0,0,8.0500,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,0.0,1.0
887,1,1,1,19.0,0,0,30.0000,0.0,1.0
888,0,3,1,22.5,1,2,23.4500,0.0,1.0
889,1,1,1,26.0,0,0,30.0000,0.0,0.0


In [46]:
preprocessed_titanic_train.df.to_csv('../Data/Preprocessed_Train.csv' , index=False)
preprocessed_titanic_test.df.to_csv('../Data/Preprocessed_Test.csv' , index=False)