In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("Titanic-Dataset.csv")

In [41]:
df

Unnamed: 0,PassengerId,Name,Ticket,Fare,Embarked,Sex_female,Sex_male,Survived_0,Survived_1,Pclass_1,Pclass_2,Pclass_3,FamilySize,AgeGroup
0,1,"Braund, Mr. Owen Harris",A/5 21171,7.2500,S,0,1,1,0,0,0,1,2,Young Adult
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,71.2833,C,1,0,0,1,1,0,0,2,Adult
2,3,"Heikkinen, Miss. Laina",STON/O2. 3101282,7.9250,S,1,0,0,1,0,0,1,1,Young Adult
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,53.1000,S,1,0,0,1,1,0,0,2,Young Adult
4,5,"Allen, Mr. William Henry",373450,8.0500,S,0,1,1,0,0,0,1,1,Young Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,"Montvila, Rev. Juozas",211536,13.0000,S,0,1,1,0,0,1,0,1,Young Adult
887,888,"Graham, Miss. Margaret Edith",112053,30.0000,S,1,0,0,1,1,0,0,1,Young Adult
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",W./C. 6607,23.4500,S,1,0,1,0,0,0,1,4,Young Adult
889,890,"Behr, Mr. Karl Howell",111369,30.0000,C,0,1,0,1,1,0,0,1,Young Adult


In [7]:
#Before moving on with the data reduction it is important to focus on the preprocessing of the data
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [11]:
# filling the missing value of age
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
df['Age'] = imputer.fit_transform(df[['Age']])

In [13]:
# Converting the age into the int type
df['Age'] = df['Age'].astype(int)

In [15]:
# Droping the cabin number as it is not important
df.drop(columns=['Cabin'],inplace= True)

In [17]:
most_frequent_embarked = df['Embarked'].value_counts().index[0]
df['Embarked'] = df['Embarked'].fillna(most_frequent_embarked)

In [19]:
# One hot encoding for the categorical attributes
df = pd.get_dummies(df,columns=['Sex'],dtype=int)

In [21]:
df = pd.get_dummies(df,columns=['Survived'],dtype=int)
df = pd.get_dummies(df,columns=['Pclass'],dtype=int)

In [23]:
# Feature Engineering for combining the two attribute like age SibSp and Parch
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1 # for 1 to add the passenger himself 

In [29]:
# Droping the Sibsp and the Parch 
df.drop(columns=['SibSp','Parch'],inplace= True)

In [33]:
# Binning of the Age
bins = [0,18,35,60,np.inf]
labels = ['Child','Young Adult','Adult','Senior']
df['AgeGroup'] = pd.cut(df['Age'],bins=bins,labels=labels)

In [39]:
# Droping the age as it is not needed
df.drop(columns=['Age'],inplace=True)

KeyError: "['Age'] not found in axis"

In [61]:
df

Unnamed: 0,Fare,Sex_female,Sex_male,Survived_0,Survived_1,Pclass_1,Pclass_2,Pclass_3,FamilySize,AgeGroup,EmbarkedRegion
0,-0.502445,0,1,1,0,0,0,1,2,Young Adult,Europe
1,0.786845,1,0,0,1,1,0,0,2,Adult,Europe
2,-0.488854,1,0,0,1,0,0,1,1,Young Adult,Europe
3,0.420730,1,0,0,1,1,0,0,2,Young Adult,Europe
4,-0.486337,0,1,1,0,0,0,1,1,Young Adult,Europe
...,...,...,...,...,...,...,...,...,...,...,...
886,-0.386671,0,1,1,0,0,1,0,1,Young Adult,Europe
887,-0.044381,1,0,0,1,1,0,0,1,Young Adult,Europe
888,-0.176263,1,0,1,0,0,0,1,4,Young Adult,Europe
889,-0.044381,0,1,0,1,1,0,0,1,Young Adult,Europe


In [49]:
#Concept Hierarchy Generation for Nominal Data
df['EmbarkedRegion'] = df['Embarked'].map({'C':'Europe','S':'Europe','Q':'Other'})
df.drop(columns=['Embarked'],inplace=True)

In [55]:
# Z-Score Standarziation for the fare
from sklearn.preprocessing import StandardScaler
z_scaler = StandardScaler()
df['Fare'] = z_scaler.fit_transform(df[['Fare']])

In [59]:
# Dimensionally Reduction by removing the unwanted features
# Unwanted Features: Ticket,Name,Passenger id
df.drop(columns=['Ticket','Name','PassengerId'],inplace=True)

In [65]:
# Using the sampling technique to gain the sub-set of the data set used in the data set
df_sampled = df.sample(frac=0.5,random_state=42)
df_sampled

Unnamed: 0,Fare,Sex_female,Sex_male,Survived_0,Survived_1,Pclass_1,Pclass_2,Pclass_3,FamilySize,AgeGroup,EmbarkedRegion
709,-0.341452,0,1,0,1,0,0,1,3,Young Adult,Europe
439,-0.437007,0,1,1,0,0,1,0,1,Young Adult,Europe
840,-0.488854,0,1,1,0,0,0,1,1,Young Adult,Europe
720,0.016023,1,0,0,1,0,1,0,2,Child,Europe
39,-0.422074,1,0,0,1,0,0,1,2,Child,Europe
...,...,...,...,...,...,...,...,...,...,...,...
825,-0.508486,0,1,1,0,0,0,1,1,Young Adult,Other
188,-0.336334,0,1,1,0,0,0,1,3,Adult,Other
271,-0.648422,0,1,0,1,0,0,1,1,Young Adult,Europe
662,-0.133225,0,1,1,0,1,0,0,1,Adult,Europe
