# Assignment 1 : Titanic data preprocessing

### Problem statement

Access an open source dataset “Titanic”. 
Apply pre-processing techniques on the raw dataset

In [101]:
import numpy as np
import pandas as pd


In [102]:
train= pd.read_csv('train.csv')
test= pd.read_csv('test.csv')

In [121]:
print(len(train))
print(len(test))

891
418


#### Combining two Files 


In [103]:
train_len = len(train)
train_df = pd.concat([train, test], axis=0)
train_df = train_df.reset_index(drop=True)


In [122]:
print(len(train_df))

1309


In [104]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [105]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [106]:
train_df.shape

(1309, 12)

#### We drop ‘Cabin’ as it contains too many missing values. 

In [107]:
train_df=train_df.drop(['Cabin'], axis=1)

In [108]:
train_df.shape

(1309, 11)

#### We also drop 'Embarked' and 'Age' as they contains many Nan values. So, replacing them  with their modes 


In [109]:
#modes is the value occuring repeatedly in given set.

In [110]:
#replacing age and embarked with mode of respective cols 

print(train_df.isnull().values.any())
print(train_df.shape)
modes={}
for eachcol in train_df.columns:
    mode_col= train_df[eachcol].mode()[0] #getting mode of column 
    modes[eachcol]= mode_col #storing mode of each feature in dictionary
    
    train_df[eachcol] = train_df[eachcol].replace(np.nan, mode_col)
print(train_df.isnull().values.any())
print(train_df.shape)

True
(1309, 11)
False
(1309, 11)


#### ‘Ticket’ feature is dropped too as we cannot assess anything based on this. 

In [None]:
train_df = train_df.drop(['Ticket'], axis=1)


In [112]:
train_df.shape


(1309, 10)

In [113]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


#### ‘Name’ is dropped too as it is relatively non-standard. It does not contribute directly to survival . 

In [None]:
train_df = train_df.drop(['Name'], axis=1)
train_df.shape

#### We see that there are two categorical features - 'Sex' and 'Embarked'. So, we need to perform label encoding to convert these categorical features to numerical features as this is a binary classication problem.

In [115]:
from sklearn import preprocessing
# label_encoder object 
label_encoder = preprocessing.LabelEncoder()
# Encode labels 
train_df['Sex']= label_encoder.fit_transform(train_df['Sex'])
train_df['Embarked']= label_encoder.fit_transform(train_df['Embarked'])

In [116]:
train_df.head

<bound method NDFrame.head of       PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch      Fare  \
0               1       0.0       3    1  22.0      1      0    7.2500   
1               2       1.0       1    0  38.0      1      0   71.2833   
2               3       1.0       3    0  26.0      0      0    7.9250   
3               4       1.0       1    0  35.0      1      0   53.1000   
4               5       0.0       3    1  35.0      0      0    8.0500   
...           ...       ...     ...  ...   ...    ...    ...       ...   
1304         1305       0.0       3    1  24.0      0      0    8.0500   
1305         1306       0.0       1    0  39.0      0      0  108.9000   
1306         1307       0.0       3    1  38.5      0      0    7.2500   
1307         1308       0.0       3    1  24.0      0      0    8.0500   
1308         1309       0.0       3    1  24.0      1      1   22.3583   

      Embarked  
0            2  
1            0  
2            2  
3            

In [117]:
print(train_df.dtypes)

PassengerId      int64
Survived       float64
Pclass           int64
Sex              int32
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked         int32
dtype: object


#### We will move the target variable, 'Survived,' to the end to prepare our data for training.

In [118]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]

In [119]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,7.25,2
1,2,1.0,1,0,38.0,1,0,71.2833,0
2,3,1.0,3,0,26.0,0,0,7.925,2
3,4,1.0,1,0,35.0,1,0,53.1,2
4,5,0.0,3,1,35.0,0,0,8.05,2


In [120]:
train_df.describe

<bound method NDFrame.describe of       PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch      Fare  \
0               1       0.0       3    1  22.0      1      0    7.2500   
1               2       1.0       1    0  38.0      1      0   71.2833   
2               3       1.0       3    0  26.0      0      0    7.9250   
3               4       1.0       1    0  35.0      1      0   53.1000   
4               5       0.0       3    1  35.0      0      0    8.0500   
...           ...       ...     ...  ...   ...    ...    ...       ...   
1304         1305       0.0       3    1  24.0      0      0    8.0500   
1305         1306       0.0       1    0  39.0      0      0  108.9000   
1306         1307       0.0       3    1  38.5      0      0    7.2500   
1307         1308       0.0       3    1  24.0      0      0    8.0500   
1308         1309       0.0       3    1  24.0      1      1   22.3583   

      Embarked  
0            2  
1            0  
2            2  
3        