In [1]:
import pandas as pd

## Reading the dataset

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest
0,1,1,Allen Miss. Elisabeth Walton,female,29.0,0,0,24160,211.3375,B5,S,2.0,,St Louis MO
1,1,1,Allison Master. Hudson Trevor,male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,Montreal PQ / Chesterville ON
2,1,0,Allison Miss. Helen Loraine,female,2.0,1,2,113781,151.55,C22 C26,S,,,Montreal PQ / Chesterville ON
3,1,0,Allison Mr. Hudson Joshua Creighton,male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,Montreal PQ / Chesterville ON
4,1,0,Allison Mrs. Hudson J C (Bessie Waldo Daniels),female,25.0,1,2,113781,151.55,C22 C26,S,,,Montreal PQ / Chesterville ON


## Dropping Unwanted Columns

In [4]:
colsToDrop = ['name','cabin','boat','body','home_dest','ticket']

In [5]:
df.drop(colsToDrop, inplace=True,axis=1)

df = df.drop(colsToDrop, axis=1)

In [6]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S


## Changing the datatypes

In [7]:
df.dtypes

pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
dtype: object

In [8]:
df['pclass'] = df['pclass'].astype('category')
#df.pclass = df.pclass.astype('category')

In [9]:
df['sex'] = df['sex'].astype('category')

In [10]:
df['embarked'] = df['embarked'].astype('category')

In [11]:
df['survived'] = df['survived'].astype('category')

In [12]:
df.dtypes

pclass      category
survived    category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

In [13]:
df.dtypes

pclass      category
survived    category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

## Handling Missing Information

In [13]:
df.isnull().sum()

pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [10]:
df.sibsp.isnull().sum()

0

In [14]:
for col in df:
    if df[col].isnull().sum() > 0:
        if str(df[col].dtype) == 'category':
            df[col] = df[col].fillna(value = df[col].mode()[0])
        else:
            print(col)
            df[col] = df[col].fillna(value = df[col].mean())

age
fare


In [15]:
df.isnull().sum().sum()

0

In [None]:
df.fillna()

## Seperating Target Column

In [16]:
y = df['survived']

In [17]:
X = df.drop(['survived'],axis = 1)

In [18]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,1,female,29.0,0,0,211.3375,S
1,1,male,0.9167,1,2,151.55,S
2,1,female,2.0,1,2,151.55,S
3,1,male,30.0,1,2,151.55,S
4,1,female,25.0,1,2,151.55,S


In [19]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, int64): [0, 1]

In [27]:
X.dtypes

pclass      category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

## Converting Categorical to Numeric

In [28]:
X.dtypes

pclass      category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

In [20]:
X = pd.get_dummies(X,drop_first=True)

In [21]:
X.head()

Unnamed: 0,age,sibsp,parch,fare,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S
0,29.0,0,0,211.3375,0,0,0,0,1
1,0.9167,1,2,151.55,0,0,1,0,1
2,2.0,1,2,151.55,0,0,0,0,1
3,30.0,1,2,151.55,0,0,1,0,1
4,25.0,1,2,151.55,0,0,0,0,1


## Train Test Split

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test  = train_test_split(X,y,test_size=0.3, random_state=100)

In [24]:
X.shape

(1309, 9)

In [25]:
X_train.shape

(916, 9)

In [18]:
X_test.shape

(393, 9)

## Model Building

In [26]:
from sklearn.svm import SVC

In [42]:
clf = SVC(C=100,kernel='linear', probability=True)

In [43]:
clf.fit(X_train,y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
preds = clf.predict(X_test)

In [38]:
from sklearn.metrics import classification_report

In [45]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.80      0.82      0.81       253
          1       0.66      0.63      0.64       140

avg / total       0.75      0.75      0.75       393



In [1]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.ensemble import AdaBoostClassifier

clf = MLPClassifier(hidden_layer_sizes=(15,7),activation='relu',learning_rate='adaptive', alpha=0.5,max_iter=1000)

In [50]:
clf= DecisionTreeClassifier(criterion='gini',max_depth=4,)

In [51]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [52]:
preds = clf.predict(X_test)

In [53]:
preds

array([1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0,

preds = pd.DataFrame(preds)

preds.columns = ['Pred_0','Pred_1']

preds.head()

In [48]:
from sklearn.metrics import classification_report,confusion_matrix

In [49]:
confusion_matrix(y_test,preds)

array([[201,  43],
       [ 39, 110]], dtype=int64)

In [54]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.83      0.89      0.86       244
          1       0.80      0.70      0.75       149

avg / total       0.82      0.82      0.82       393



In [37]:
clf = RandomForestClassifier()#n_estimators=10,max_depth=

In [57]:
clf = AdaBoostClassifier(n_estimators=50)

In [72]:
clf = GradientBoostingClassifier(n_estimators=1000,verbose=True,max_depth=4,min_samples_leaf=5)

In [58]:
clf.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

## Visualize the tree

In [55]:
with open("decisiontree.dot", 'w') as f:
    f = export_graphviz(clf, out_file=f,feature_names=X.columns.values,filled=True, rounded=True,special_characters=True,class_names=['0','1'], proportion=True)

type the following code in the command line

dot -Tpng decisiontree.dot -o outfile.png

## Measuring Model Performance

In [74]:
from sklearn.metrics import confusion_matrix,recall_score, precision_score

In [75]:
preds = clf.predict(X_test)

In [76]:
confusion_matrix(y_test,preds)

array([[203,  28],
       [ 55, 107]])

In [42]:
recall_score(y_test,preds,pos_label=1)

0.65432098765432101

In [43]:
clf.feature_importances_

array([ 0.24720297,  0.30288213,  0.01457821,  0.0645454 ,  0.24465364,
        0.02105507,  0.00767393,  0.007664  ,  0.00527658,  0.00492541,
        0.0228685 ,  0.0122918 ,  0.00964291,  0.01197513,  0.02276431])

In [44]:
dict(zip(X.columns.values,clf.feature_importances_))

{'age': 0.24720296873525655,
 'embarked_Q': 0.011975134406695235,
 'embarked_S': 0.02276430742300144,
 'fare': 0.30288213408199416,
 'parch_1': 0.02286850162938666,
 'parch_2': 0.012291797766073878,
 'parch_>2': 0.0096429145703076907,
 'pclass_2.0': 0.014578205986743786,
 'pclass_3.0': 0.06454540268792347,
 'sex_male': 0.24465363727830258,
 'sibsp_1': 0.021055072251611281,
 'sibsp_2': 0.0076739297009471089,
 'sibsp_3': 0.0076640025448835008,
 'sibsp_4': 0.0052765848600943143,
 'sibsp_>4': 0.0049254060767784285}

In [77]:
from sklearn.model_selection import cross_val_score

In [89]:
clf = RandomForestClassifier(n_estimators=10,min_samples_leaf=5,max_depth=4,class_weight='balanced')#n_estimators=10,max_depth=

In [90]:
cross_val_score(clf,X,y,cv=4,scoring='recall')

array([ 0.936,  0.792,  0.704,  0.248])

## Bagging

In [27]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report

In [114]:
clf = BaggingClassifier(n_estimators=150,max_samples=0.5,max_features=0.5)

In [115]:
clf.fit(X_train,y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=0.5, max_samples=0.5,
         n_estimators=150, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [116]:
preds = clf.predict(X_test)

In [117]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.82      0.89      0.85       244
          1       0.79      0.68      0.73       149

avg / total       0.81      0.81      0.81       393



In [118]:
print(classification_report(y_train,clf.predict(X_train)))

             precision    recall  f1-score   support

          0       0.88      0.98      0.93       565
          1       0.96      0.78      0.86       351

avg / total       0.91      0.90      0.90       916



In [161]:
X.shape

(1309, 9)

## RandomForest

In [120]:
from sklearn.ensemble import RandomForestClassifier

In [185]:
clf = RandomForestClassifier(max_depth=8,min_samples_leaf=5,class_weight='balanced')

In [193]:
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=8, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [187]:
preds = clf.predict(X_test)

In [188]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.83      0.86      0.84       248
          1       0.74      0.69      0.71       145

avg / total       0.79      0.80      0.79       393



In [189]:
print(classification_report(y_train,clf.predict(X_train)))

             precision    recall  f1-score   support

          0       0.87      0.89      0.88       561
          1       0.83      0.79      0.81       355

avg / total       0.85      0.85      0.85       916



In [191]:
list(zip(X.columns.values,clf.feature_importances_))

[('age', 0.129214446807071),
 ('sibsp', 0.036340486683710191),
 ('parch', 0.037401194610135767),
 ('fare', 0.23842383550519775),
 ('pclass_2', 0.011331716189983989),
 ('pclass_3', 0.062893116869657192),
 ('sex_male', 0.4426793245630174),
 ('embarked_Q', 0.010142404507467551),
 ('embarked_S', 0.031573474263759151)]

In [194]:
list(zip(X.columns.values,clf.feature_importances_))

[('age', 0.10450997840171014),
 ('sibsp', 0.039873623148793225),
 ('parch', 0.044285573585112092),
 ('fare', 0.20073542868796185),
 ('pclass_2', 0.02572834822047549),
 ('pclass_3', 0.062500801594725056),
 ('sex_male', 0.50045534190123853),
 ('embarked_Q', 0.0054330399110245229),
 ('embarked_S', 0.016477864548959054)]

In [195]:
from sklearn.linear_model import LogisticRegression

In [196]:
clf =LogisticRegression()

In [197]:
clf2 = BaggingClassifier(base_estimator=clf)

In [199]:
clf2.fit(X_train,y_train)

BaggingClassifier(base_estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

## Boosting

In [54]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [35]:
baseclf = DecisionTreeClassifier(max_depth=3)

In [36]:
clf = AdaBoostClassifier(base_estimator=baseclf,n_estimators=50 )

In [37]:
clf.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [38]:
preds = clf.predict(X_test)

In [39]:
from sklearn.metrics import classification_report

In [40]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.85      0.81      0.83       253
          1       0.68      0.75      0.71       140

avg / total       0.79      0.79      0.79       393



In [43]:
list(zip(clf.feature_importances_,X_train.columns))

[(0.33034454221444631, 'age'),
 (0.033710530882419494, 'sibsp'),
 (0.045384893296309299, 'parch'),
 (0.48000380253744018, 'fare'),
 (0.01673080734502376, 'pclass_2'),
 (0.0063300284941519135, 'pclass_3'),
 (0.065537347051008069, 'sex_male'),
 (0.0, 'embarked_Q'),
 (0.021958048179201176, 'embarked_S')]

## GBM

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
clf = GradientBoostingClassifier(verbose=1,n_estimators=3000,max_depth=5,min_samples_leaf=6)

In [57]:
clf.fit(X_train,y_train)

      Iter       Train Loss   Remaining Time 
         1           1.2470            7.13s
         2           1.1727            7.50s
         3           1.1117            7.02s
         4           1.0609            6.76s
         5           1.0178            6.59s
         6           0.9809            6.50s
         7           0.9468            6.43s
         8           0.9171            6.36s
         9           0.8930            6.31s
        10           0.8693            6.32s
        20           0.7317            6.11s
        30           0.6687            6.74s
        40           0.6253            6.89s
        50           0.5861            7.08s
        60           0.5583            7.11s
        70           0.5335            7.03s
        80           0.4978            7.12s
        90           0.4750            7.18s
       100           0.4552            7.22s
       200           0.2993            7.22s
       300           0.2192            7.06s
       40

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=6, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=3000,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

In [58]:
preds = clf.predict(X_test)

In [50]:
from sklearn.metrics import classification_report

In [51]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.84      0.77      0.81       253
          1       0.64      0.74      0.69       140

avg / total       0.77      0.76      0.76       393



In [61]:
dict(zip(X_train.columns,clf.feature_importances_))

{'age': 0.39423696196153968,
 'embarked_Q': 0.0052322164897885826,
 'embarked_S': 0.012263224724236096,
 'fare': 0.5002820837532711,
 'parch': 0.018670611336492001,
 'pclass_2': 0.010381771578313478,
 'pclass_3': 0.0089125009149509226,
 'sex_male': 0.022155389779737255,
 'sibsp': 0.027865239461670493}

## XGBoost

In [1]:
from xgboost import XGBClassifier

In [2]:
clf = XGBClassifier()

In [19]:
clf.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [22]:
preds = clf.predict(X_test)

In [21]:
from sklearn.metrics import classification_report

In [24]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.84      0.89      0.87       253
          1       0.78      0.70      0.74       140

avg / total       0.82      0.82      0.82       393

