In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_data = pd.read_csv('train.csv')
train = train_data.copy()

test_data = pd.read_csv('test.csv')
test = test_data.copy()

In [3]:
print(train.columns)
print( train.shape)

print(test.columns)
print( test.shape)

Index(['traveller_ID', 'ticket_class', 'name', 'sex', 'age',
       'Siblings_spouses', 'Parchil', 'TickNum', 'fare', 'cabin', 'embarked',
       'MedBoat', 'Survived'],
      dtype='object')
(916, 13)
Index(['traveller_ID', 'ticket_class', 'name', 'sex', 'age',
       'Siblings_spouses', 'Parchil', 'TickNum', 'fare', 'cabin', 'embarked',
       'MedBoat'],
      dtype='object')
(393, 12)


In [4]:
datasets = [train, test]
for dataset in datasets:
    train_desc_cat = datasets[0].describe(include =['object'])
    test_desc_cat = datasets[1].describe(include =['object'])
    train_desc_num = datasets[0].describe()
    test_desc_num = datasets[1].describe()


In [5]:
train_desc_num

Unnamed: 0,traveller_ID,ticket_class,age,Siblings_spouses,Parchil,fare,Survived
count,916.0,916.0,729.0,916.0,916.0,915.0,916.0
mean,656.248908,2.305677,29.102309,0.539301,0.386463,33.709221,0.361354
std,381.176191,0.841811,13.866954,1.082188,0.893933,52.840656,0.480655
min,0.0,1.0,0.1667,0.0,0.0,0.0,0.0
25%,328.25,2.0,21.0,0.0,0.0,7.925,0.0
50%,670.0,3.0,28.0,0.0,0.0,14.5,0.0
75%,974.25,3.0,37.0,1.0,0.0,31.275,1.0
max,1308.0,3.0,80.0,8.0,9.0,512.3292,1.0


In [6]:
test_desc_num

Unnamed: 0,traveller_ID,ticket_class,age,Siblings_spouses,Parchil,fare
count,393.0,393.0,317.0,393.0,393.0,393.0
mean,648.75827,2.26972,31.672187,0.40458,0.381679,32.332188
std,370.986997,0.829017,15.472486,0.934967,0.796594,49.199401
min,10.0,1.0,0.4167,0.0,0.0,0.0
25%,327.0,2.0,22.0,0.0,0.0,7.8958
50%,618.0,3.0,30.0,0.0,0.0,13.0
75%,986.0,3.0,40.5,1.0,0.0,31.275
max,1301.0,3.0,74.0,8.0,6.0,512.3292


In [7]:
train_desc_cat

Unnamed: 0,name,sex,TickNum,cabin,embarked,MedBoat
count,916,916,916,204,915,327
unique,915,2,694,146,3,24
top,"Connolly, Miss. Kate",male,CA. 2343,B57 B59 B63 B66,S,13
freq,2,592,9,5,638,30


In [8]:
test_desc_cat

Unnamed: 0,name,sex,TickNum,cabin,embarked,MedBoat
count,393,393,393,91,392,159
unique,393,2,345,76,3,23
top,"Sage, Mr. Douglas Bullen",male,110413,B96 B98,S,C
freq,1,251,3,2,276,16


# Handling Missing Values

In [9]:
for dataset in datasets:
    train_missing_val = datasets[0].isnull().sum()
    test_missing_val = datasets[1].isnull().sum()

In [10]:
train_missing_val

traveller_ID          0
ticket_class          0
name                  0
sex                   0
age                 187
Siblings_spouses      0
Parchil               0
TickNum               0
fare                  1
cabin               712
embarked              1
MedBoat             589
Survived              0
dtype: int64

In [11]:
#Numerical
train['age'].fillna(train['age'].mean(), inplace = True)
train['fare'].fillna(train['fare'].mean(), inplace = True)

#Categorical
train['embarked'].fillna('S', inplace = True)

#randomly input values from the missing data in the following variables 
train['MedBoat'] = train['MedBoat'].fillna(0)

cab_arr = train['cabin'][train['cabin'].notnull()].values  #extracts all values in the column that are not null
fill_cab = pd.Series(np.random.choice(cab_arr, 1712))
train['cabin'] = train['cabin'].fillna(fill_cab)

train.isnull().sum()

traveller_ID        0
ticket_class        0
name                0
sex                 0
age                 0
Siblings_spouses    0
Parchil             0
TickNum             0
fare                0
cabin               0
embarked            0
MedBoat             0
Survived            0
dtype: int64

In [12]:

test_missing_val

traveller_ID          0
ticket_class          0
name                  0
sex                   0
age                  76
Siblings_spouses      0
Parchil               0
TickNum               0
fare                  0
cabin               302
embarked              1
MedBoat             234
dtype: int64

In [13]:
#Numerical
test['age'].fillna(test['age'].mean(), inplace = True)

#Categorical
test['embarked'].fillna('S', inplace = True)

#randomly input values from the missing data in the following variables 

cab_arr_test = test['cabin'][test['cabin'].notnull()].values  #extracts all values in the column that are not null
fill_cab_test = pd.Series(np.random.choice(cab_arr_test, 402))
test['cabin'] = test['cabin'].fillna(fill_cab_test)

test['MedBoat'] = test['MedBoat'].fillna(0)

test.isnull().sum()

traveller_ID        0
ticket_class        0
name                0
sex                 0
age                 0
Siblings_spouses    0
Parchil             0
TickNum             0
fare                0
cabin               0
embarked            0
MedBoat             0
dtype: int64

##  Creating New Features

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 13 columns):
traveller_ID        916 non-null int64
ticket_class        916 non-null int64
name                916 non-null object
sex                 916 non-null object
age                 916 non-null float64
Siblings_spouses    916 non-null int64
Parchil             916 non-null int64
TickNum             916 non-null object
fare                916 non-null float64
cabin               916 non-null object
embarked            916 non-null object
MedBoat             916 non-null object
Survived            916 non-null int64
dtypes: float64(2), int64(5), object(6)
memory usage: 93.1+ KB


In [15]:
# Merge Siblings_spouses and Parchil
datasets = [train, test]
for dataset in datasets:
    dataset['relatives'] = dataset['Siblings_spouses'] + dataset['Parchil']
    dataset.loc[dataset['relatives'] > 0, 'alone'] = 'False'
    dataset.loc[dataset['relatives'] == 0, 'alone'] = 'True'

train = train.drop(['Siblings_spouses','Parchil'], axis = 1)
test = test.drop(['Siblings_spouses','Parchil'], axis = 1)

train['alone'].value_counts()

True     538
False    378
Name: alone, dtype: int64

In [16]:
train.columns

Index(['traveller_ID', 'ticket_class', 'name', 'sex', 'age', 'TickNum', 'fare',
       'cabin', 'embarked', 'MedBoat', 'Survived', 'relatives', 'alone'],
      dtype='object')

In [17]:
test.columns

Index(['traveller_ID', 'ticket_class', 'name', 'sex', 'age', 'TickNum', 'fare',
       'cabin', 'embarked', 'MedBoat', 'relatives', 'alone'],
      dtype='object')

In [18]:
import re
datasets = [train,test]
for dataset in datasets:
    dataset['deck'] = dataset['cabin'].apply(lambda x: re.match(r"[a-zA-Z]+", x).group()) #create a new column 'deck'
    
#drop the cabin feature along the column
train = train.drop(['cabin'], axis =1)
test = test.drop(['cabin'], axis =1)

In [19]:
datasets = [train,test]
for dataset in datasets:
    dataset['fare'] = dataset['fare'].astype(int)

In [20]:
#Build a new feature from the name feature called title

datasets = [train,test]
for dataset in datasets:
    dataset['title'] = dataset['name'].str.extract(r"([A-Za-z]+)\.",expand=False)
    dataset['title'] = dataset['title'].replace('Mlle','Miss')
    dataset['title'] = dataset['title'].replace('Ms','Miss')
    dataset['title'] = dataset['title'].replace('Mme','Mrs')
    dataset['title'] = dataset['title'].apply(lambda x: 'Mr' if x=='Mr' else('Mrs' if x=='Mrs' else('Master' if x=='Master' else('Miss' if x=='Miss' else ('Rare')))))

train = train.drop(['name'], axis=1)
test = test.drop(['name'], axis=1)

In [21]:
# replace all instances where value of MedBoat is given with 1 and others as 0 
datasets = [train,test]
for dataset in datasets:
    dataset['MedBoat'] = dataset['MedBoat'].apply(lambda x: 0 if x==0 else 1)
train.MedBoat.value_counts()

0    589
1    327
Name: MedBoat, dtype: int64

In [22]:
train.Survived.value_counts()

0    585
1    331
Name: Survived, dtype: int64

In [23]:
train.head(7)

Unnamed: 0,traveller_ID,ticket_class,sex,age,TickNum,fare,embarked,MedBoat,Survived,relatives,alone,deck,title
0,1214,3,male,29.102309,315037,8,S,0,0,0,True,B,Mr
1,677,3,male,26.0,349224,7,S,0,0,0,True,B,Mr
2,534,2,female,19.0,250655,26,S,1,1,0,True,B,Miss
3,1174,3,female,29.102309,CA. 2343,69,S,0,0,10,False,B,Miss
4,864,3,female,28.0,347086,7,S,0,0,0,True,B,Miss
5,895,3,female,1.0,347742,11,S,1,1,2,False,A,Miss
6,813,3,male,29.102309,SOTON/O.Q. 3101314,7,S,0,0,0,True,C,Mr


In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 13 columns):
traveller_ID    916 non-null int64
ticket_class    916 non-null int64
sex             916 non-null object
age             916 non-null float64
TickNum         916 non-null object
fare            916 non-null int32
embarked        916 non-null object
MedBoat         916 non-null int64
Survived        916 non-null int64
relatives       916 non-null int64
alone           916 non-null object
deck            916 non-null object
title           916 non-null object
dtypes: float64(1), int32(1), int64(5), object(6)
memory usage: 89.5+ KB


In [25]:
#creating categories for age
datasets = [train,test]
for dataset in datasets:
    dataset['age'] = dataset['age'].apply(lambda x: 'baby' if x <= 4 else('child' if (x>4 and x<=12) else('teen' if (x>12 and x<=19) else('middle aged' if (x>19 and x<=59) else 'aged'))))
    

In [26]:
#Creating a new feature fare per person
atasets = [train,test]
for dataset in datasets:
    dataset['fare per person'] = dataset['fare'] / (dataset['relatives'] + 1)
    dataset['fare per person'] = dataset['fare per person'].astype(int)

In [27]:
train.head()

Unnamed: 0,traveller_ID,ticket_class,sex,age,TickNum,fare,embarked,MedBoat,Survived,relatives,alone,deck,title,fare per person
0,1214,3,male,middle aged,315037,8,S,0,0,0,True,B,Mr,8
1,677,3,male,middle aged,349224,7,S,0,0,0,True,B,Mr,7
2,534,2,female,teen,250655,26,S,1,1,0,True,B,Miss,26
3,1174,3,female,middle aged,CA. 2343,69,S,0,0,10,False,B,Miss,6
4,864,3,female,middle aged,347086,7,S,0,0,0,True,B,Miss,7


In [28]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 14 columns):
traveller_ID       916 non-null int64
ticket_class       916 non-null int64
sex                916 non-null object
age                916 non-null object
TickNum            916 non-null object
fare               916 non-null int32
embarked           916 non-null object
MedBoat            916 non-null int64
Survived           916 non-null int64
relatives          916 non-null int64
alone              916 non-null object
deck               916 non-null object
title              916 non-null object
fare per person    916 non-null int32
dtypes: int32(2), int64(5), object(7)
memory usage: 93.1+ KB


In [29]:
train.to_csv('newTrain.csv')
test.to_csv('newTest.csv')

# Encoding of categorical variables


In [30]:
train.describe(include=['object'])

Unnamed: 0,sex,age,TickNum,embarked,alone,deck,title
count,916,916,916,916,916,916,916
unique,2,5,694,3,2,8,5
top,male,middle aged,CA. 2343,S,True,C,Mr
freq,592,735,9,639,538,282,530


In [31]:
# encode those with two unique categories
sex ={'male':0, 'female':1}
alone = {'False':0, 'True':1}

datasets = [train,test]
for dataset in datasets:
    dataset['sex'] = dataset['sex'].map(sex)
    dataset['alone'] = dataset['alone'].map(alone)

In [32]:
train_X = train.drop(['Survived','TickNum','traveller_ID'], axis=1)
Y = train['Survived']

test_X = test.copy()
test_passengerID = test_X['traveller_ID']
test_X=test.drop(['TickNum','traveller_ID'], axis=1)

print(test_X.shape)
print(train_X.shape)

(393, 11)
(916, 11)


In [33]:
print(train_X.shape)
print(test_X.shape)

(916, 11)
(393, 11)


train_X = train_X[:,1:]
print(train_X.shape)

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 14 columns):
traveller_ID       916 non-null int64
ticket_class       916 non-null int64
sex                916 non-null int64
age                916 non-null object
TickNum            916 non-null object
fare               916 non-null int32
embarked           916 non-null object
MedBoat            916 non-null int64
Survived           916 non-null int64
relatives          916 non-null int64
alone              916 non-null int64
deck               916 non-null object
title              916 non-null object
fare per person    916 non-null int32
dtypes: int32(2), int64(7), object(5)
memory usage: 93.1+ KB


In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [36]:
train["deck"] = le.fit_transform(train["deck"])
train["title"] = le.fit_transform(train["title"])
train["age"] = le.fit_transform(train["age"])
train["TickNum"] = le.fit_transform(train["TickNum"])
train["embarked"] = le.fit_transform(train["embarked"])


In [37]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 14 columns):
traveller_ID       916 non-null int64
ticket_class       916 non-null int64
sex                916 non-null int64
age                916 non-null int32
TickNum            916 non-null int32
fare               916 non-null int32
embarked           916 non-null int32
MedBoat            916 non-null int64
Survived           916 non-null int64
relatives          916 non-null int64
alone              916 non-null int64
deck               916 non-null int32
title              916 non-null int32
fare per person    916 non-null int32
dtypes: int32(7), int64(7)
memory usage: 75.2 KB


In [38]:
test["deck"] = le.fit_transform(test["deck"])
test["title"] = le.fit_transform(test["title"])
test["age"] = le.fit_transform(test["age"])
test["TickNum"] = le.fit_transform(test["TickNum"])
test["embarked"] = le.fit_transform(test["embarked"])



In [39]:
X =train[['embarked', 'age','deck', 'title', 'ticket_class',"relatives","alone","MedBoat","fare per person","fare","sex", ]]
Y = train['Survived']
     

In [40]:
X.columns

Index(['embarked', 'age', 'deck', 'title', 'ticket_class', 'relatives',
       'alone', 'MedBoat', 'fare per person', 'fare', 'sex'],
      dtype='object')

In [41]:
test_X.columns

Index(['ticket_class', 'sex', 'age', 'fare', 'embarked', 'MedBoat',
       'relatives', 'alone', 'deck', 'title', 'fare per person'],
      dtype='object')

In [42]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393 entries, 0 to 392
Data columns (total 13 columns):
traveller_ID       393 non-null int64
ticket_class       393 non-null float64
sex                393 non-null int64
age                393 non-null int32
TickNum            393 non-null int32
fare               393 non-null int32
embarked           393 non-null int32
MedBoat            393 non-null int64
relatives          393 non-null float64
alone              393 non-null int64
deck               393 non-null int32
title              393 non-null int32
fare per person    393 non-null int32
dtypes: float64(2), int32(7), int64(4)
memory usage: 29.2 KB


# Normalising the independent Variable

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state =0)

In [44]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X_train,Y_train)

In [45]:
Y_pred = rf.predict(X_test)

In [46]:
from sklearn.metrics import f1_score, confusion_matrix

cm = confusion_matrix(Y_test, Y_pred)

f1= f1_score(Y_test, Y_pred)

In [47]:
cm

array([[170,   4],
       [  2,  99]], dtype=int64)

In [48]:
f1

0.9705882352941178

In [49]:
test_X["age"] = le.fit_transform(test_X["age"])
test_X["deck"] = le.fit_transform(test_X["deck"])
test_X["embarked"] = le.fit_transform(test_X["embarked"])
test_X["title"] = le.fit_transform(test_X["title"])

In [50]:
test_X.head()

Unnamed: 0,ticket_class,sex,age,fare,embarked,MedBoat,relatives,alone,deck,title,fare per person
0,3.0,0,3,7,2,0,0.0,1,2,2,7
1,3.0,0,3,15,0,1,2.0,0,1,2,5
2,3.0,0,3,7,2,0,0.0,1,3,2,7
3,3.0,0,3,8,2,0,0.0,1,2,2,8
4,3.0,0,3,7,1,0,0.0,1,2,2,7


In [51]:
test.head()

Unnamed: 0,traveller_ID,ticket_class,sex,age,TickNum,fare,embarked,MedBoat,relatives,alone,deck,title,fare per person
0,1148,3.0,0,3,332,7,2,0,0.0,1,2,2,7
1,1049,3.0,0,3,96,15,0,1,2.0,0,1,2,5
2,982,3.0,0,3,187,7,2,0,0.0,1,3,2,7
3,808,3.0,0,3,255,8,2,0,0.0,1,2,2,8
4,1195,3.0,0,3,229,7,1,0,0.0,1,2,2,7


In [52]:
Y_final = rf.predict(test_X)
Y_final.shape

(393,)

In [53]:
result = pd.DataFrame(data = Y_final,columns=['Survived'])
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393 entries, 0 to 392
Data columns (total 1 columns):
Survived    393 non-null int64
dtypes: int64(1)
memory usage: 3.1 KB


In [54]:
test_passengerID = pd.DataFrame(data = test_passengerID,columns=['traveller_ID'])
test_passengerID.head()

Unnamed: 0,traveller_ID
0,1148
1,1049
2,982
3,808
4,1195


In [55]:
submission = pd.concat([test_passengerID,result],axis=1)

In [56]:
submission.head()

Unnamed: 0,traveller_ID,Survived
0,1148,1
1,1049,0
2,982,1
3,808,1
4,1195,1


In [57]:
submission['Survived'].value_counts()

1    252
0    141
Name: Survived, dtype: int64