In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train['isCabin'] = df_train['Cabin'].isna().apply(lambda X:0 if X else 1)
df_train['Title'] = df_train['Name'].apply(lambda X:X.split('.')[0].split(' ')[1])
titles = df_train['Title'].value_counts().index[:6]
df_train['Title'] = df_train['Title'].apply(lambda X:X if X in titles else 'other')

In [36]:
df_train['Title'].value_counts()


Mr        502
Miss      179
Mrs       121
Master     40
other      36
Dr          7
Rev         6
Name: Title, dtype: int64

In [5]:
numeric_feature= list(df_train.select_dtypes(exclude='object').columns)
numeric_feature

['PassengerId',
 'Survived',
 'Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'isCabin']

In [6]:
numeric_feature.remove('PassengerId')
numeric_feature.remove('Survived')

In [7]:
cat_feature= list(df_train.select_dtypes(include='object').columns)
cat_feature

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']

In [8]:
cat_feature.remove('Name')
cat_feature.remove('Ticket')
cat_feature.remove('Cabin')

In [9]:
imputer=SimpleImputer(strategy='median')
imputer.fit(df_train[numeric_feature])

SimpleImputer(strategy='median')

In [10]:
df_train[numeric_feature]=imputer.transform(df_train[numeric_feature])

In [11]:
df_train[numeric_feature].isnull().sum()

Pclass     0
Age        0
SibSp      0
Parch      0
Fare       0
isCabin    0
dtype: int64

In [12]:
imputer=SimpleImputer(strategy='most_frequent')
imputer.fit(df_train[cat_feature])
df_train[cat_feature]=imputer.transform(df_train[cat_feature])

In [13]:
scaler= MinMaxScaler()
scaler.fit(df_train[numeric_feature])

MinMaxScaler()

In [14]:
df_train[numeric_feature]=scaler.transform(df_train[numeric_feature])

In [15]:
ohe= OneHotEncoder(handle_unknown='ignore')
ohe.fit(df_train[cat_feature])
ohe.transform(df_train[cat_feature]).toarray()

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [16]:
cat_onecode=ohe.get_feature_names()

In [17]:
train=pd.DataFrame(data=ohe.transform(df_train[cat_feature]).toarray(),columns=cat_onecode)

In [18]:
df_test


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [19]:
df_test['isCabin'] = df_test['Cabin'].isna().apply(lambda X:0 if X else 1)
df_test['Title'] = df_test['Name'].apply(lambda X:X.split('.')[0].split(' ')[1])
titles = df_test['Title'].value_counts().index[:6]
df_test['Title'] = df_test['Title'].apply(lambda X:X if X in titles else 'other')

In [21]:
df_test['Title'].value_counts()

Mr        234
Miss       77
Mrs        70
Master     19
other      12
y           4
Col         2
Name: Title, dtype: int64

In [22]:
numeric_feature= list(df_test.select_dtypes(exclude='object').columns)

In [23]:
numeric_feature.remove('PassengerId')

In [24]:
cat_feature= list(df_test.select_dtypes(include='object').columns)
cat_feature

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']

In [25]:
cat_feature.remove('Name')
cat_feature.remove('Ticket')
cat_feature.remove('Cabin')

In [26]:
imputer.fit(df_test[numeric_feature])
df_test[numeric_feature]=imputer.transform(df_test[numeric_feature])

In [27]:
imputer.fit(df_test[cat_feature])
df_test[cat_feature]=imputer.transform(df_test[cat_feature])

In [28]:
scaler.fit(df_test[numeric_feature])

MinMaxScaler()

In [29]:
df_test[numeric_feature]=scaler.transform(df_test[numeric_feature])

In [30]:
ohe.fit(df_test[cat_feature])
ohe.transform(df_test[cat_feature]).toarray()
cat_onecode=ohe.get_feature_names()
test=pd.DataFrame(data=ohe.transform(df_test[cat_feature]).toarray(),columns=cat_onecode)

In [31]:
train.shape,test.shape

((891, 12), (418, 12))

In [32]:
final_train=pd.concat([df_train[numeric_feature],train],axis=1)
final_test=pd.concat([df_test[numeric_feature],test],axis=1)

In [33]:
final_train.shape,final_test.shape
x=final_train
y=df_train['Survived']

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
parametrs = {'n_estimators':[10,50,100,200],'max_depth':[2,3,4,5,6,7], 'min_samples_leaf':[1,2,3,4,5]}
grid_cv = GridSearchCV(RandomForestClassifier(random_state=100), param_grid=parametrs, verbose=5)
grid_cv.fit(final_train,df_train['Survived'])

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END max_depth=2, min_samples_leaf=1, n_estimators=10;, score=0.816 total time=   0.1s
[CV 2/5] END max_depth=2, min_samples_leaf=1, n_estimators=10;, score=0.815 total time=   0.0s
[CV 3/5] END max_depth=2, min_samples_leaf=1, n_estimators=10;, score=0.792 total time=   0.0s
[CV 4/5] END max_depth=2, min_samples_leaf=1, n_estimators=10;, score=0.753 total time=   0.0s
[CV 5/5] END max_depth=2, min_samples_leaf=1, n_estimators=10;, score=0.787 total time=   0.0s
[CV 1/5] END max_depth=2, min_samples_leaf=1, n_estimators=50;, score=0.799 total time=   0.0s
[CV 2/5] END max_depth=2, min_samples_leaf=1, n_estimators=50;, score=0.803 total time=   0.0s
[CV 3/5] END max_depth=2, min_samples_leaf=1, n_estimators=50;, score=0.787 total time=   0.0s
[CV 4/5] END max_depth=2, min_samples_leaf=1, n_estimators=50;, score=0.753 total time=   0.0s
[CV 5/5] END max_depth=2, min_samples_leaf=1, n_estimators=50;, score=0.787 total 

[CV 4/5] END max_depth=2, min_samples_leaf=5, n_estimators=50;, score=0.753 total time=   0.0s
[CV 5/5] END max_depth=2, min_samples_leaf=5, n_estimators=50;, score=0.787 total time=   0.0s
[CV 1/5] END max_depth=2, min_samples_leaf=5, n_estimators=100;, score=0.799 total time=   0.3s
[CV 2/5] END max_depth=2, min_samples_leaf=5, n_estimators=100;, score=0.815 total time=   0.2s
[CV 3/5] END max_depth=2, min_samples_leaf=5, n_estimators=100;, score=0.792 total time=   0.1s
[CV 4/5] END max_depth=2, min_samples_leaf=5, n_estimators=100;, score=0.753 total time=   0.1s
[CV 5/5] END max_depth=2, min_samples_leaf=5, n_estimators=100;, score=0.787 total time=   0.1s
[CV 1/5] END max_depth=2, min_samples_leaf=5, n_estimators=200;, score=0.799 total time=   0.3s
[CV 2/5] END max_depth=2, min_samples_leaf=5, n_estimators=200;, score=0.809 total time=   0.3s
[CV 3/5] END max_depth=2, min_samples_leaf=5, n_estimators=200;, score=0.792 total time=   0.3s
[CV 4/5] END max_depth=2, min_samples_leaf

[CV 5/5] END max_depth=3, min_samples_leaf=4, n_estimators=100;, score=0.803 total time=   0.1s
[CV 1/5] END max_depth=3, min_samples_leaf=4, n_estimators=200;, score=0.821 total time=   0.3s
[CV 2/5] END max_depth=3, min_samples_leaf=4, n_estimators=200;, score=0.815 total time=   0.3s
[CV 3/5] END max_depth=3, min_samples_leaf=4, n_estimators=200;, score=0.787 total time=   0.3s
[CV 4/5] END max_depth=3, min_samples_leaf=4, n_estimators=200;, score=0.775 total time=   0.3s
[CV 5/5] END max_depth=3, min_samples_leaf=4, n_estimators=200;, score=0.809 total time=   0.3s
[CV 1/5] END max_depth=3, min_samples_leaf=5, n_estimators=10;, score=0.827 total time=   0.0s
[CV 2/5] END max_depth=3, min_samples_leaf=5, n_estimators=10;, score=0.820 total time=   0.0s
[CV 3/5] END max_depth=3, min_samples_leaf=5, n_estimators=10;, score=0.809 total time=   0.0s
[CV 4/5] END max_depth=3, min_samples_leaf=5, n_estimators=10;, score=0.775 total time=   0.0s
[CV 5/5] END max_depth=3, min_samples_leaf=5

[CV 2/5] END max_depth=4, min_samples_leaf=4, n_estimators=50;, score=0.815 total time=   0.0s
[CV 3/5] END max_depth=4, min_samples_leaf=4, n_estimators=50;, score=0.820 total time=   0.0s
[CV 4/5] END max_depth=4, min_samples_leaf=4, n_estimators=50;, score=0.781 total time=   0.0s
[CV 5/5] END max_depth=4, min_samples_leaf=4, n_estimators=50;, score=0.831 total time=   0.0s
[CV 1/5] END max_depth=4, min_samples_leaf=4, n_estimators=100;, score=0.832 total time=   0.1s
[CV 2/5] END max_depth=4, min_samples_leaf=4, n_estimators=100;, score=0.815 total time=   0.1s
[CV 3/5] END max_depth=4, min_samples_leaf=4, n_estimators=100;, score=0.820 total time=   0.1s
[CV 4/5] END max_depth=4, min_samples_leaf=4, n_estimators=100;, score=0.781 total time=   0.1s
[CV 5/5] END max_depth=4, min_samples_leaf=4, n_estimators=100;, score=0.826 total time=   0.1s
[CV 1/5] END max_depth=4, min_samples_leaf=4, n_estimators=200;, score=0.832 total time=   0.5s
[CV 2/5] END max_depth=4, min_samples_leaf=4

[CV 3/5] END max_depth=5, min_samples_leaf=3, n_estimators=100;, score=0.826 total time=   0.3s
[CV 4/5] END max_depth=5, min_samples_leaf=3, n_estimators=100;, score=0.787 total time=   0.2s
[CV 5/5] END max_depth=5, min_samples_leaf=3, n_estimators=100;, score=0.865 total time=   0.1s
[CV 1/5] END max_depth=5, min_samples_leaf=3, n_estimators=200;, score=0.827 total time=   0.5s
[CV 2/5] END max_depth=5, min_samples_leaf=3, n_estimators=200;, score=0.820 total time=   0.3s
[CV 3/5] END max_depth=5, min_samples_leaf=3, n_estimators=200;, score=0.826 total time=   0.3s
[CV 4/5] END max_depth=5, min_samples_leaf=3, n_estimators=200;, score=0.781 total time=   0.3s
[CV 5/5] END max_depth=5, min_samples_leaf=3, n_estimators=200;, score=0.865 total time=   0.3s
[CV 1/5] END max_depth=5, min_samples_leaf=4, n_estimators=10;, score=0.827 total time=   0.0s
[CV 2/5] END max_depth=5, min_samples_leaf=4, n_estimators=10;, score=0.820 total time=   0.0s
[CV 3/5] END max_depth=5, min_samples_leaf

[CV 4/5] END max_depth=6, min_samples_leaf=2, n_estimators=200;, score=0.787 total time=   0.3s
[CV 5/5] END max_depth=6, min_samples_leaf=2, n_estimators=200;, score=0.865 total time=   0.3s
[CV 1/5] END max_depth=6, min_samples_leaf=3, n_estimators=10;, score=0.810 total time=   0.0s
[CV 2/5] END max_depth=6, min_samples_leaf=3, n_estimators=10;, score=0.815 total time=   0.0s
[CV 3/5] END max_depth=6, min_samples_leaf=3, n_estimators=10;, score=0.803 total time=   0.0s
[CV 4/5] END max_depth=6, min_samples_leaf=3, n_estimators=10;, score=0.781 total time=   0.0s
[CV 5/5] END max_depth=6, min_samples_leaf=3, n_estimators=10;, score=0.848 total time=   0.0s
[CV 1/5] END max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.816 total time=   0.0s
[CV 2/5] END max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.831 total time=   0.0s
[CV 3/5] END max_depth=6, min_samples_leaf=3, n_estimators=50;, score=0.831 total time=   0.0s
[CV 4/5] END max_depth=6, min_samples_leaf=3, n_

[CV 5/5] END max_depth=7, min_samples_leaf=2, n_estimators=10;, score=0.860 total time=   0.0s
[CV 1/5] END max_depth=7, min_samples_leaf=2, n_estimators=50;, score=0.827 total time=   0.1s
[CV 2/5] END max_depth=7, min_samples_leaf=2, n_estimators=50;, score=0.820 total time=   0.0s
[CV 3/5] END max_depth=7, min_samples_leaf=2, n_estimators=50;, score=0.826 total time=   0.0s
[CV 4/5] END max_depth=7, min_samples_leaf=2, n_estimators=50;, score=0.798 total time=   0.0s
[CV 5/5] END max_depth=7, min_samples_leaf=2, n_estimators=50;, score=0.876 total time=   0.0s
[CV 1/5] END max_depth=7, min_samples_leaf=2, n_estimators=100;, score=0.821 total time=   0.1s
[CV 2/5] END max_depth=7, min_samples_leaf=2, n_estimators=100;, score=0.820 total time=   0.1s
[CV 3/5] END max_depth=7, min_samples_leaf=2, n_estimators=100;, score=0.837 total time=   0.1s
[CV 4/5] END max_depth=7, min_samples_leaf=2, n_estimators=100;, score=0.787 total time=   0.1s
[CV 5/5] END max_depth=7, min_samples_leaf=2, 

GridSearchCV(estimator=RandomForestClassifier(random_state=100),
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'n_estimators': [10, 50, 100, 200]},
             verbose=5)

In [33]:
grid_cv.best_estimator_

RandomForestClassifier(max_depth=5, random_state=100)

In [43]:
model= RandomForestClassifier(criterion='entropy',n_estimators=100,max_depth=6,random_state=100)

In [44]:
model.fit(final_train,df_train['Survived'])

RandomForestClassifier(criterion='entropy', max_depth=6, random_state=100)

In [45]:
yp=model.predict(final_test)

In [46]:
yp

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [47]:
df_test['Survived']=yp

In [48]:
df_test[['PassengerId','Survived']]

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [49]:
df_test[['PassengerId','Survived']].to_csv('SUBMISSION03.csv',index=False)