#IMPORTING ALL THE NECESSARY LIBRARIES

In [0]:
# linear algebra
import numpy as np 
import sklearn

# data processing
import pandas as pd 


# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing

### installing seaborn

In [0]:
!pip install -U seaborn

Requirement already up-to-date: seaborn in /usr/local/lib/python3.6/dist-packages (0.9.0)


### Loading train & test data

In [0]:
train_df1=pd.read_csv("train.csv")
test_df=pd.read_csv("test.csv")


In [0]:
print(train_df1.isnull().count())

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            891
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          891
Embarked       891
dtype: int64


### finding median

In [0]:
median= train_df1.median()
print(median)

PassengerId    446.0000
Survived         0.0000
Pclass           3.0000
Age             28.0000
SibSp            0.0000
Parch            0.0000
Fare            14.4542
dtype: float64


### checking no.of null values in the dataset

In [0]:
null_counts = train_df1.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

Cabin       687
Age         177
Embarked      2
dtype: int64

In [0]:
train_df1['Age'].fillna(train_df1['Age'].median(), inplace = True)

In [0]:
train_df1.drop(['Cabin'],axis = 1,inplace= True)
train_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 76.6+ KB


In [0]:
train_df1['Embarked'].fillna(train_df1['Embarked'].mode()[0], inplace = True)

### label encoding

In [0]:
train_df1['Embarked_Code'] = sklearn.preprocessing.LabelEncoder().fit_transform(train_df1['Embarked'])

In [0]:
train_df1.drop('Embarked',axis=1,inplace=True)

### One Hot Encoding

In [0]:
from sklearn.preprocessing import OneHotEncoder
Y_train = np.asarray(train_df1[['Embarked_Code']])

x=OneHotEncoder(sparse=False).fit_transform(Y_train)
train_df1["S"]=x[:,0]
train_df1["Q"]=x[:,1]
train_df1["C"]=x[:,2]



In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


### Extracting the Title of each passenger

In [0]:
train_df1['Title'] = train_df1['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

In [0]:
min = 10
initials = (train_df1['Title'].value_counts() < min)

train_df1['Title'] = train_df1['Title'].apply(lambda x: 'Misc' if initials.loc[x] == True else x)
print(train_df1['Title'].value_counts())

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64


In [0]:
port={"male":1,"female":0}
train_df1['Sex']=train_df1['Sex'].map(port)

In [0]:
port={"Mr":1,"Miss":0,"Mrs":2,"Master":4,"Misc":5}
train_df1['Title']=train_df1['Title'].map(port)

In [0]:
train_df1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Code,S,Q,C,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,2,0.0,0.0,1.0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,0,1.0,0.0,0.0,2
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,2,0.0,0.0,1.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,2,0.0,0.0,1.0,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,2,0.0,0.0,1.0,1
5,6,0,3,"Moran, Mr. James",1,28.0,0,0,330877,8.4583,1,0.0,1.0,0.0,1
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,2,0.0,0.0,1.0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.0750,2,0.0,0.0,1.0,4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,347742,11.1333,2,0.0,0.0,1.0,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,237736,30.0708,0,1.0,0.0,0.0,2


In [0]:
from pandas import Series,DataFrame

In [0]:
train_df1.drop('Ticket',axis=1,inplace=True)

## Visualizing the dataset after Encoding is done.

In [0]:
train_df1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_Code,S,Q,C,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.2500,2,0.0,0.0,1.0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,0,1.0,0.0,0.0,2
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.9250,2,0.0,0.0,1.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1000,2,0.0,0.0,1.0,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.0500,2,0.0,0.0,1.0,1
5,6,0,3,"Moran, Mr. James",1,28.0,0,0,8.4583,1,0.0,1.0,0.0,1
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,51.8625,2,0.0,0.0,1.0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,21.0750,2,0.0,0.0,1.0,4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,11.1333,2,0.0,0.0,1.0,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,30.0708,0,1.0,0.0,0.0,2


##Creating New Features

In [0]:
train_df1['AGE_SEX']=train_df1['Sex']*train_df1['Age']

In [0]:
train_df1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_Code,S,Q,C,Title,AGE_SEX
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.2500,2,0.0,0.0,1.0,1,22.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,0,1.0,0.0,0.0,2,0.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.9250,2,0.0,0.0,1.0,0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1000,2,0.0,0.0,1.0,2,0.0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.0500,2,0.0,0.0,1.0,1,35.0
5,6,0,3,"Moran, Mr. James",1,28.0,0,0,8.4583,1,0.0,1.0,0.0,1,28.0
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,51.8625,2,0.0,0.0,1.0,1,54.0
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,21.0750,2,0.0,0.0,1.0,4,2.0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,11.1333,2,0.0,0.0,1.0,2,0.0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,30.0708,0,1.0,0.0,0.0,2,0.0


## Dropping Unecessary features

In [0]:
train_df1.drop('Name',axis=1,inplace=True)

In [0]:
train_df1['Sib_Parch']=train_df1['SibSp']*train_df1['Parch']

In [0]:
train_df1

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Code,S,Q,C,Title,AGE_SEX,Sib_Parch
0,1,0,3,1,22.0,1,0,7.2500,2,0.0,0.0,1.0,1,22.0,0
1,2,1,1,0,38.0,1,0,71.2833,0,1.0,0.0,0.0,2,0.0,0
2,3,1,3,0,26.0,0,0,7.9250,2,0.0,0.0,1.0,0,0.0,0
3,4,1,1,0,35.0,1,0,53.1000,2,0.0,0.0,1.0,2,0.0,0
4,5,0,3,1,35.0,0,0,8.0500,2,0.0,0.0,1.0,1,35.0,0
5,6,0,3,1,28.0,0,0,8.4583,1,0.0,1.0,0.0,1,28.0,0
6,7,0,1,1,54.0,0,0,51.8625,2,0.0,0.0,1.0,1,54.0,0
7,8,0,3,1,2.0,3,1,21.0750,2,0.0,0.0,1.0,4,2.0,3
8,9,1,3,0,27.0,0,2,11.1333,2,0.0,0.0,1.0,2,0.0,0
9,10,1,2,0,14.0,1,0,30.0708,0,1.0,0.0,0.0,2,0.0,0


In [0]:
train_df1['Pclass_fare']=train_df1['Pclass']*train_df1['Fare']

In [0]:
y_train1=train_df1["Survived"]
y_train1

0      0
1      1
2      1
3      1
4      0
5      0
6      0
7      0
8      1
9      1
10     1
11     1
12     0
13     0
14     0
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     1
23     1
24     0
25     1
26     0
27     0
28     1
29     0
      ..
861    0
862    1
863    0
864    0
865    1
866    1
867    0
868    0
869    1
870    0
871    1
872    0
873    0
874    1
875    1
876    0
877    0
878    0
879    1
880    1
881    0
882    0
883    0
884    0
885    0
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [0]:
X_train1=train_df1.drop('Survived',axis=1,inplace=True)
X_train1=train_df1

In [0]:
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
PassengerId      891 non-null int64
Pclass           891 non-null int64
Sex              891 non-null int64
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
Embarked_Code    891 non-null int64
S                891 non-null float64
Q                891 non-null float64
C                891 non-null float64
Title            891 non-null int64
AGE_SEX          891 non-null float64
Sib_Parch        891 non-null int64
Pclass_fare      891 non-null float64
A_S3             891 non-null float64
A_S4             891 non-null float64
dtypes: float64(9), int64(8)
memory usage: 118.4 KB


In [0]:
train_df1['A_S3']=(train_df1['Age']*train_df1['Age'])+(train_df1['Sex']*train_df1['Sex'])

In [0]:
train_df1['A_S4']=(train_df1['Age']*train_df1['Age'])+train_df1['Sex']

In [0]:
train_df1['A_S5']=(train_df1['AGE_SEX']*train_df1['Sex'])+train_df['Age']

In [0]:
train_df1.drop('A_S5',axis=1,inplace=True)

In [0]:
y_train1=train_df["Survived"]
y_train1.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [0]:
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
PassengerId      891 non-null int64
Pclass           891 non-null int64
Sex              891 non-null int64
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
Embarked_Code    891 non-null int64
S                891 non-null float64
Q                891 non-null float64
C                891 non-null float64
Title            891 non-null int64
AGE_SEX          891 non-null float64
Sib_Parch        891 non-null int64
Pclass_fare      891 non-null float64
A_S3             891 non-null float64
A_S4             891 non-null float64
dtypes: float64(9), int64(8)
memory usage: 118.4 KB


In [0]:
median= test_df.median()
print(median)

PassengerId    1100.5000
Pclass            3.0000
Age              27.0000
SibSp             0.0000
Parch             0.0000
Fare             14.4542
dtype: float64


In [0]:
null_counts = test_df.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

Cabin    327
Age       86
Fare       1
dtype: int64

In [0]:
test_df['Age'].fillna(test_df['Age'].median(), inplace = True)

In [0]:
test_df['Fare'].fillna(test_df['Fare'].median(), inplace = True)

In [0]:
test_df.drop(['Cabin'],axis = 1,inplace= True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 32.7+ KB


In [0]:
test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace = True)

In [0]:
test_df['Embarked_Code'] = sklearn.preprocessing.LabelEncoder().fit_transform(test_df['Embarked'])

In [0]:
test_df.drop('Embarked',axis=1,inplace=True)

In [0]:
from sklearn.preprocessing import OneHotEncoder
Y0 = np.asarray(test_df[['Embarked_Code']])

x1=OneHotEncoder(sparse=False).fit_transform(Y0)
test_df["S"]=x1[:,0]
test_df["Q"]=x1[:,1]
test_df["C"]=x1[:,2]



In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [0]:
test_df['Title'] = test_df['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

In [0]:
min = 10
initials = (test_df['Title'].value_counts() < min)

test_df['Title'] = test_df['Title'].apply(lambda x: 'Misc' if initials.loc[x] == True else x)
print(test_df['Title'].value_counts())

Mr        240
Miss       78
Mrs        72
Master     21
Misc        7
Name: Title, dtype: int64


In [0]:
port={"Mr":1,"Miss":0,"Mrs":2,"Master":4,"Misc":5}
test_df['Title']=test_df['Title'].map(port)

In [0]:
test_df.drop('Ticket',axis=1,inplace=True)

In [0]:
port={"male":1,"female":0}
test_df['Sex']=test_df['Sex'].map(port)

In [0]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_Code,S,Q,C,Title
0,892,3,"Kelly, Mr. James",1,34.5,0,0,7.8292,1,0.0,1.0,0.0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,7.0000,2,0.0,0.0,1.0,2
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,9.6875,1,0.0,1.0,0.0,1
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,8.6625,2,0.0,0.0,1.0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,12.2875,2,0.0,0.0,1.0,2
5,897,3,"Svensson, Mr. Johan Cervin",1,14.0,0,0,9.2250,2,0.0,0.0,1.0,1
6,898,3,"Connolly, Miss. Kate",0,30.0,0,0,7.6292,1,0.0,1.0,0.0,0
7,899,2,"Caldwell, Mr. Albert Francis",1,26.0,1,1,29.0000,2,0.0,0.0,1.0,1
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",0,18.0,0,0,7.2292,0,1.0,0.0,0.0,2
9,901,3,"Davies, Mr. John Samuel",1,21.0,2,0,24.1500,2,0.0,0.0,1.0,1


In [0]:
test_df['AGE_SEX']=test_df['Sex']*test_df['Age']

In [0]:
test_df['Pclass_fare']=test_df['Pclass']*test_df['Fare']

In [0]:
test_df['Sb_Parch']=test_df['SibSp']*test_df['Parch']

In [0]:
test_df['A_S3']=(test_df['Age']*test_df['Age'])+(test_df['Sex']*test_df['Sex'])

In [0]:
test_df['A_S4']=(test_df['Age']*test_df['Age'])+test_df['Sex']

In [0]:
test_df.drop('Name',axis=1,inplace=True)

In [0]:
y_test=gen_df['Survived']
y_test

0      0
1      1
2      0
3      0
4      1
5      0
6      1
7      0
8      1
9      0
10     0
11     0
12     1
13     0
14     1
15     1
16     0
17     0
18     1
19     1
20     0
21     0
22     1
23     0
24     1
25     0
26     1
27     0
28     0
29     0
      ..
388    0
389    0
390    0
391    1
392    0
393    0
394    0
395    1
396    0
397    1
398    0
399    0
400    1
401    0
402    1
403    0
404    0
405    0
406    0
407    0
408    1
409    1
410    1
411    1
412    1
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

In [0]:
X_test= test_df

In [0]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 17 columns):
PassengerId      418 non-null int64
Pclass           418 non-null int64
Sex              418 non-null int64
Age              418 non-null float64
SibSp            418 non-null int64
Parch            418 non-null int64
Fare             418 non-null float64
Embarked_Code    418 non-null int64
S                418 non-null float64
Q                418 non-null float64
C                418 non-null float64
Title            418 non-null int64
AGE_SEX          418 non-null float64
Pclass_fare      418 non-null float64
Sb_Parch         418 non-null int64
A_S3             418 non-null float64
A_S4             418 non-null float64
dtypes: float64(9), int64(8)
memory usage: 55.6 KB


In [0]:
clc=RandomForestClassifier(n_estimators=100, max_depth=3,random_state=64)
clc.fit(X_train1, y_train1)
clc.score(X_test,y_test)

0.9784688995215312

In [0]:
|