In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
train_processed_path = os.path.join('data','train_processed.csv')
test_processed_path = os.path.join('data','test_processed.csv')
train = pd.read_csv(train_processed_path)
test = pd.read_csv(test_processed_path)

In [3]:
test['Fare'].fillna(test['Fare'].mean(), inplace=True)

In [4]:
y = train['Survived']

In [5]:
X = train.drop(columns='Survived')

In [6]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Q,S
0,3,0,22.0,1,0,7.25,5,0,1
1,1,1,38.0,1,0,71.2833,2,0,0
2,3,1,26.0,0,0,7.925,6,0,1
3,1,1,35.0,1,0,53.1,2,0,1
4,3,0,35.0,0,0,8.05,6,0,1


In [7]:
y.value_counts(normalize=True)

0    0.617548
1    0.382452
Name: Survived, dtype: float64

In [65]:
# imports for modelling
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Standard scaler

In [9]:
std_scaler = StandardScaler()

In [10]:
X = std_scaler.fit_transform(X)

# SGD classifier

In [11]:
sgd_clf = SGDClassifier(random_state = 42)
sgd_clf.fit(X, y)

SGDClassifier(random_state=42)

In [12]:
cross_val_score(sgd_clf,X,y,cv=5,scoring='accuracy').mean()

0.7604265854123023

# Decision tree

In [13]:
dec_clf = DecisionTreeClassifier(random_state=42)

In [14]:
cross_val_score(dec_clf, X, y, cv=5, scoring='accuracy').mean()

0.7626928204151591

# Random forest

In [59]:
rnf_clf = RandomForestClassifier(max_depth=20,random_state=42,n_estimators=200,warm_start=True, min_samples_split=2, min_samples_leaf=2)

In [60]:
cross_val_score(rnf_clf, X, y, cv=5, scoring='accuracy').mean()

0.8369453437440487

In [17]:
rnf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': True}

In [18]:
n_estimators = np.arange(100,1100,100)
max_features = ['auto','sqrt']
max_depth = np.arange(10,55,5)
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,4]
bootstrap = [True, False]

In [19]:
random_grid = {
    'n_estimators':n_estimators,
    #'max_features':max_features,
    'max_depth':max_depth,
    #'bootstrap':bootstrap
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf
}

In [20]:
rf_random = RandomizedSearchCV(estimator=rnf_clf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=1)

In [None]:
#rf_random.fit(X, y)

In [None]:
#rf_random.best_params_

# Support vector machine

In [28]:
svm_clf = SVC(random_state=42, probability=True, kernel='rbf',C=1, gamma=0.1)

In [29]:
cross_val_score(svm_clf, X, y, cv=5, scoring='accuracy').mean()

0.8290674792103092

In [23]:
svm_clf.get_params()

{'C': 100,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [24]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

In [25]:
svc_random=RandomizedSearchCV(estimator=svm_clf, param_distributions=param_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=1)

In [26]:
#svc_random.fit(X, y)



Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.1s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.1s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.1s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.1s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.1s
[CV] END .....................C=0.1, gamma=0.001

RandomizedSearchCV(cv=3,
                   estimator=SVC(C=100, probability=True, random_state=42),
                   n_iter=100, n_jobs=1,
                   param_distributions={'C': [0.1, 1, 10, 100, 1000],
                                        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                                        'kernel': ['rbf']},
                   random_state=42, verbose=2)

In [27]:
#svc_random.best_params_

{'kernel': 'rbf', 'gamma': 0.1, 'C': 1}

# Logistic regression

In [30]:
lgr_clf = LogisticRegression(random_state=42)

In [31]:
cross_val_score(lgr_clf, X, y, cv=5, scoring='accuracy').mean()

0.787443661524789

# Voting classifier

In [32]:
voting_clf = VotingClassifier(
    estimators=[('dt',dec_clf),('rnf',rnf_clf),('svc',svm_clf),('lgr',lgr_clf)],
    voting='soft'
)

In [33]:
cross_val_score(voting_clf, X, y, cv=5, scoring='accuracy').mean()

0.8256776487018346

# Bagging classifier

In [80]:
bag_clf = BaggingClassifier(
    lgr_clf, n_estimators=250, max_samples=100, bootstrap=True, n_jobs=1
)

In [81]:
cross_val_score(bag_clf, X, y, cv=5, scoring='accuracy', verbose=2).mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   1.4s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] END .................................................... total time=   1.1s
[CV] END .................................................... total time=   1.1s
[CV] END .................................................... total time=   1.0s
[CV] END .................................................... total time=   1.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.6s finished


0.7896972005332318

# Different predictions

In [34]:
voting_clf.fit(X, y)

VotingClassifier(estimators=[('dt', DecisionTreeClassifier(random_state=42)),
                             ('rnf',
                              RandomForestClassifier(max_depth=20,
                                                     min_samples_leaf=2,
                                                     min_samples_split=10,
                                                     n_estimators=200,
                                                     random_state=42,
                                                     warm_start=True)),
                             ('svc',
                              SVC(C=1, gamma=0.1, probability=True,
                                  random_state=42)),
                             ('lgr', LogisticRegression(random_state=42))],
                 voting='soft')

In [35]:
voting_clf_pred = voting_clf.predict(test)

In [61]:
rnf_clf.fit(X, y)

RandomForestClassifier(max_depth=20, min_samples_leaf=2, n_estimators=200,
                       random_state=42, warm_start=True)

In [62]:
rnf_clf_pred = rnf_clf.predict(test)

In [73]:
bag_clf.fit(X, y)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                  max_samples=100, n_estimators=250, n_jobs=1)

In [74]:
bag_clf_pred = bag_clf.predict(test)

# Finalizing

In [53]:
sample_pred = pd.read_csv(os.path.join('data','gender_submission.csv'))

In [54]:
sample_pred.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [55]:
my_submission = pd.DataFrame()

In [56]:
my_submission['PassengerId'] = sample_pred['PassengerId']

In [75]:
my_submission['Survived'] = bag_clf_pred

In [58]:
my_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [44]:
my_submission.to_csv(os.path.join('data','my_submission.csv'), index=False)

In [76]:
my_submission.Survived.value_counts(normalize=True)

0    0.990431
1    0.009569
Name: Survived, dtype: float64