In [3]:
import pandas as pd
import numpy as np

In [4]:
titanic = pd.read_csv('train.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic_sur = titanic[titanic['Survived']==1]
print(titanic.shape[0])
titanic.Embarked.fillna('S',inplace=True)
titanic.isnull().sum(axis=0)
# np.sum(titanic_sur.Cabin.isna())/np.sum(titanic.Cabin.isna())

891


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=43)
for train_idx,test_idx in split.split(titanic,titanic['Pclass']):
    strat_train_set = titanic.loc[train_idx]
    strat_test_set = titanic.loc[test_idx]

In [7]:
train_set = strat_train_set.drop('Survived',axis=1)
train_labels = strat_train_set.Survived
test_set = strat_test_set.drop('Survived',axis=1)
test_labels = strat_test_set.Survived

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6
class combined(BaseEstimator,TransformerMixin):
    def __init__(self):
        return
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        if self.add_bpr:
            bedrooms_per_room = X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X,rooms_per_houseold,population_per_household,bedrooms_per_room]
        return np.c_[X,rooms_per_houseold,population_per_household]


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('std_scaler',StandardScaler())
])

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
num_attribs = ["Pclass","Age","SibSp","Parch","Fare"]
cat_attribs = ["Sex","Embarked"]
full_pipeline = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(), cat_attribs),
])
train_prepared = full_pipeline.fit_transform(train_set)

In [11]:
train_prepared[:2]

array([[ 0.8282276 , -2.16601659, -0.47512962,  2.0087859 , -0.33425239,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ],
       [-0.36791449,  1.69853865, -0.47512962, -0.46570469, -0.3896487 ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ]])

In [12]:
from scipy import stats
stats.describe(train_prepared)

DescribeResult(nobs=712, minmax=(array([-1.56405658, -2.20996643, -0.47512962, -0.46570469, -0.65231498,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ]), array([0.8282276 , 3.82025525, 6.83929819, 6.95776708, 9.69934676,
       1.        , 1.        , 1.        , 1.        , 1.        ])), mean=array([ 2.36546114e-16,  1.92768710e-17, -8.12396342e-17,  8.99717255e-17,
        3.29792373e-17,  3.46910112e-01,  6.53089888e-01,  1.95224719e-01,
        9.12921348e-02,  7.13483146e-01]), variance=array([1.00140647, 1.00140647, 1.00140647, 1.00140647, 1.00140647,
       0.22688214, 0.22688214, 0.157333  , 0.08307456, 0.20471246]), skewness=array([-0.62714894,  0.52534146,  3.59211155,  2.805821  ,  4.52092953,
        0.64325259, -0.64325259,  1.53781828,  2.8380116 , -0.94433621]), kurtosis=array([-1.28311576,  0.90959069, 16.88119   , 10.17822253, 29.69184508,
       -1.58622611, -1.58622611,  0.36488506,  6.05430983, -1.10822912]))

In [13]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(train_prepared, train_labels)

SGDClassifier(random_state=42)

In [14]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(train_prepared,train_labels)


RandomForestClassifier(random_state=42)

In [37]:
param_grid = {
'n_estimators': [100, 150, 200],
'max_depth': [10, 12,15,18],
'min_samples_leaf': [4,5,6,7],
}

In [38]:
from sklearn.model_selection import RandomizedSearchCV

cv = RandomizedSearchCV(forest_clf,param_grid,n_iter=100,cv=10,n_jobs=-1)
# cv.best_estimator_.best_params_

In [39]:
cv.fit(train_prepared,train_labels)
print(cv.best_score_,cv.best_params_)

0.8216353677621283 {'n_estimators': 150, 'min_samples_leaf': 5, 'max_depth': 10}


In [42]:
from sklearn.metrics import precision_score,recall_score
test_prepared = full_pipeline.transform(test_set)
sur_prediction = cv.predict(test_prepared)
print(precision_score(test_labels,sur_prediction),recall_score(test_labels,sur_prediction))


0.9047619047619048 0.8382352941176471


In [41]:
titanic_labels = titanic.Survived
titanic_prepared = full_pipeline.transform(titanic)
cv.fit(titanic_prepared,titanic_labels)

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': [10, 12, 15, 18],
                                        'min_samples_leaf': [4, 5, 6, 7],
                                        'n_estimators': [100, 150, 200]})

In [44]:
submit_set = pd.read_csv('test.csv')
submit_set = full_pipeline.transform(submit_set)
submit_predictions = cv.predict(submit_set)