# Assignment 1 : Titanic data preprocessing

### Problem statement

In [5]:
import numpy as np
import pandas as pd


In [6]:
train= pd.read_csv('train.csv')
test= pd.read_csv('test.csv')

In [7]:
print(len(train))
print(len(test))

891
418


#### Combining two Files 


In [8]:
train_len = len(train)
df = pd.concat([train, test], axis=0)
df = df.reset_index(drop=True)


In [9]:
print(len(df))

1309


In [10]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [12]:
df.shape

(1309, 12)

#### We drop ‘Cabin’ as it contains too many missing values. 

In [13]:
df=df.drop(['Cabin'], axis=1)

In [14]:
df.shape

(1309, 11)

#### We also drop 'Embarked' and 'Age' as they contains many Nan values. So, replacing them  with their modes 


In [15]:
#modes is the value occuring repeatedly in given set.

In [16]:
#replacing age and embarked with mode of respective cols 

print(df.isnull().values.any())
print(df.shape)
modes={}
for eachcol in df.columns:
    mode_col= df[eachcol].mode()[0] #getting mode of column 
    modes[eachcol]= mode_col #storing mode of each feature in dictionary
    
    df[eachcol] = df[eachcol].replace(np.nan, mode_col)
print(df.isnull().values.any())
print(df.shape)

True
(1309, 11)
False
(1309, 11)


#### ‘Ticket’ feature is dropped too as we cannot assess anything based on this. 

In [17]:
df = df.drop(['Ticket'], axis=1)

In [18]:
df.shape

(1309, 10)

In [19]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


#### ‘Name’ is dropped too as it is relatively non-standard. It does not contribute directly to survival . 

In [20]:
df = df.drop(['Name'], axis=1)
df.shape


(1309, 9)

#### We see that there are two categorical features - 'Sex' and 'Embarked'. So, we need to perform label encoding to convert these categorical features to numerical features as this is a binary classication problem.

In [21]:
from sklearn import preprocessing
# label_encoder object 
label_encoder = preprocessing.LabelEncoder()
# Encode labels 
df['Sex']= label_encoder.fit_transform(df['Sex'])
df['Embarked']= label_encoder.fit_transform(df['Embarked'])

In [22]:
df.head

<bound method NDFrame.head of       PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch      Fare  \
0               1       0.0       3    1  22.0      1      0    7.2500   
1               2       1.0       1    0  38.0      1      0   71.2833   
2               3       1.0       3    0  26.0      0      0    7.9250   
3               4       1.0       1    0  35.0      1      0   53.1000   
4               5       0.0       3    1  35.0      0      0    8.0500   
...           ...       ...     ...  ...   ...    ...    ...       ...   
1304         1305       0.0       3    1  24.0      0      0    8.0500   
1305         1306       0.0       1    0  39.0      0      0  108.9000   
1306         1307       0.0       3    1  38.5      0      0    7.2500   
1307         1308       0.0       3    1  24.0      0      0    8.0500   
1308         1309       0.0       3    1  24.0      1      1   22.3583   

      Embarked  
0            2  
1            0  
2            2  
3            

In [23]:
print(df.dtypes)

PassengerId      int64
Survived       float64
Pclass           int64
Sex              int32
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked         int32
dtype: object


#### We will move the target variable, 'Survived,' to the end to prepare our data for training.

In [24]:
X_train = df.drop("Survived", axis=1)
Y_train = df["Survived"]

In [25]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,7.25,2
1,2,1.0,1,0,38.0,1,0,71.2833,0
2,3,1.0,3,0,26.0,0,0,7.925,2
3,4,1.0,1,0,35.0,1,0,53.1,2
4,5,0.0,3,1,35.0,0,0,8.05,2


In [26]:
df.describe

<bound method NDFrame.describe of       PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch      Fare  \
0               1       0.0       3    1  22.0      1      0    7.2500   
1               2       1.0       1    0  38.0      1      0   71.2833   
2               3       1.0       3    0  26.0      0      0    7.9250   
3               4       1.0       1    0  35.0      1      0   53.1000   
4               5       0.0       3    1  35.0      0      0    8.0500   
...           ...       ...     ...  ...   ...    ...    ...       ...   
1304         1305       0.0       3    1  24.0      0      0    8.0500   
1305         1306       0.0       1    0  39.0      0      0  108.9000   
1306         1307       0.0       3    1  38.5      0      0    7.2500   
1307         1308       0.0       3    1  24.0      0      0    8.0500   
1308         1309       0.0       3    1  24.0      1      1   22.3583   

      Embarked  
0            2  
1            0  
2            2  
3        

# Assignment 2 : Spliting the preprocessed dataset in train and test

### Spliting the dataframe according to given number of rows in original dataset

In [46]:
print(len(df))

1309


In [30]:
train = df.iloc[:train_len :]
test = df.iloc[train_len: :]

In [31]:
print(len(train))

891


In [32]:
print(len(test))

418


In [33]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,7.25,2
1,2,1.0,1,0,38.0,1,0,71.2833,0
2,3,1.0,3,0,26.0,0,0,7.925,2
3,4,1.0,1,0,35.0,1,0,53.1,2
4,5,0.0,3,1,35.0,0,0,8.05,2
5,6,0.0,3,1,24.0,0,0,8.4583,1
6,7,0.0,1,1,54.0,0,0,51.8625,2
7,8,0.0,3,1,2.0,3,1,21.075,2
8,9,1.0,3,0,27.0,0,2,11.1333,2
9,10,1.0,2,0,14.0,1,0,30.0708,0


In [34]:
train.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
881,882,0.0,3,1,33.0,0,0,7.8958,2
882,883,0.0,3,0,22.0,0,0,10.5167,2
883,884,0.0,2,1,28.0,0,0,10.5,2
884,885,0.0,3,1,25.0,0,0,7.05,2
885,886,0.0,3,0,39.0,0,5,29.125,1
886,887,0.0,2,1,27.0,0,0,13.0,2
887,888,1.0,1,0,19.0,0,0,30.0,2
888,889,0.0,3,0,24.0,1,2,23.45,2
889,890,1.0,1,1,26.0,0,0,30.0,0
890,891,0.0,3,1,32.0,0,0,7.75,1


In [35]:
test.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
891,892,0.0,3,1,34.5,0,0,7.8292,1
892,893,0.0,3,0,47.0,1,0,7.0,2
893,894,0.0,2,1,62.0,0,0,9.6875,1
894,895,0.0,3,1,27.0,0,0,8.6625,2
895,896,0.0,3,0,22.0,1,1,12.2875,2
896,897,0.0,3,1,14.0,0,0,9.225,2
897,898,0.0,3,0,30.0,0,0,7.6292,1
898,899,0.0,2,1,26.0,1,1,29.0,2
899,900,0.0,3,0,18.0,0,0,7.2292,0
900,901,0.0,3,1,21.0,2,0,24.15,2


In [36]:
test.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1299,1300,0.0,3,0,24.0,0,0,7.7208,1
1300,1301,0.0,3,0,3.0,1,1,13.775,2
1301,1302,0.0,3,0,24.0,0,0,7.75,1
1302,1303,0.0,1,0,37.0,1,0,90.0,1
1303,1304,0.0,3,0,28.0,0,0,7.775,2
1304,1305,0.0,3,1,24.0,0,0,8.05,2
1305,1306,0.0,1,0,39.0,0,0,108.9,0
1306,1307,0.0,3,1,38.5,0,0,7.25,2
1307,1308,0.0,3,1,24.0,0,0,8.05,2
1308,1309,0.0,3,1,24.0,1,1,22.3583,0


### Train input split

In [43]:
X = train.drop(columns=['PassengerId', 'Survived'], axis=1)
Y = train['Survived']

In [44]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [45]:
Y.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64