In [1]:
import pandas as pd

from prepare_data import (
    explained_var,
    explanatory_vars,
    encode_variables,
    fill_gaps,
)

training_data = pd.read_csv('data/train.csv')
training_data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
233,234,1,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,31.3875,,S
304,305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S
457,458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S
546,547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19.0,1,0,2908,26.0,,S
566,567,0,3,"Stoytcheff, Mr. Ilia",male,19.0,0,0,349205,7.8958,,S


In [2]:

training_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
fill_gaps(training_data)
training_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [4]:
encode_variables(training_data)
training_data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
19,20,1,3,"Masselmani, Mrs. Fatima",0,28.0,0,0,2649,7.225,,0
357,358,0,2,"Funk, Miss. Annie Clemmer",0,38.0,0,0,237671,13.0,,2
865,866,1,2,"Bystrom, Mrs. (Karolina)",0,42.0,0,0,236852,13.0,,2
705,706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",1,39.0,0,0,250655,26.0,,2
65,66,1,3,"Moubarek, Master. Gerios",1,28.0,1,1,2661,15.2458,,0


In [5]:
training_data = training_data[explanatory_vars+[explained_var]]
training_data.sample(5)

Unnamed: 0,Age,Embarked,Parch,Sex,SibSp,Survived
625,61.0,2,0,1,0,0
96,71.0,0,0,1,0,0
521,22.0,2,0,1,0,0
230,35.0,2,0,0,1,1
70,32.0,2,0,1,0,0


In [6]:
training_data.isna().sum()

Age         0
Embarked    0
Parch       0
Sex         0
SibSp       0
Survived    0
dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
num_test = 0.20
X_all = training_data[explanatory_vars]
y_all = training_data[explained_var]
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

In [8]:
X_train.describe()


Unnamed: 0,Age,Embarked,Parch,Sex,SibSp
count,712.0,712.0,712.0,712.0,712.0
mean,29.189494,1.522472,0.401685,0.640449,0.54073
std,13.155819,0.805781,0.824312,0.480206,1.1256
min,0.42,0.0,0.0,0.0,0.0
25%,22.0,1.0,0.0,0.0,0.0
50%,28.0,2.0,0.0,1.0,0.0
75%,35.0,2.0,0.25,1.0,1.0
max,80.0,2.0,6.0,1.0,8.0


In [9]:
X_test.describe()

Unnamed: 0,Age,Embarked,Parch,Sex,SibSp
count,179.0,179.0,179.0,179.0,179.0
mean,30.046089,1.592179,0.301676,0.675978,0.452514
std,12.475691,0.731433,0.725574,0.469321,1.006569
min,0.75,0.0,0.0,0.0,0.0
25%,23.5,1.0,0.0,0.0,0.0
50%,28.0,2.0,0.0,1.0,0.0
75%,36.0,2.0,0.0,1.0,1.0
max,74.0,2.0,5.0,1.0,8.0


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Random Forest 
rf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {
  'n_estimators': [4, 6, 9], 
  'max_features': ['log2', 'sqrt','auto'], 
  'criterion': ['entropy', 'gini'],
  'max_depth': [2, 3, 5, 10], 
  'min_samples_split': [2, 3, 5],
  'min_samples_leaf': [1,5,8]
}

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(rf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the rf to the best combination of parameters
rf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=3, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=4,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
predictions = rf.predict(X_test)
print(accuracy_score(y_test, predictions))


0.7541899441340782


In [12]:
X_test_kaggle = pd.read_csv('data/test.csv')

In [13]:
X_test_kaggle = X_test_kaggle[explanatory_vars]

In [14]:
fill_gaps(X_test_kaggle)
encode_variables(X_test_kaggle)
X_test_kaggle.sample(5)

Unnamed: 0,Age,Embarked,Parch,Sex,SibSp
101,27.0,2,0,1,1
404,43.0,0,0,1,1
26,22.0,0,1,0,0
80,6.0,0,1,1,1
265,27.0,2,0,1,0


In [15]:
rf.predict(X_test_kaggle)  # accuracy is 0.77990

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [16]:
from sklearn.externals import joblib

joblib.dump(rf, 'models/random_forest.pkl')



['models/random_forest.pkl']

In [17]:
from sklearn import neighbors, datasets
knn = neighbors.KNeighborsClassifier()

In [18]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [19]:
parameters = {
  'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
  'leaf_size': list(range(10, 50, 5)), 
  'metric': ['minkowski', 'euclidean'], 
  'p': [1, 2, 3], 
  'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10],
  'weights': ['uniform', 'distance']
}

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(knn, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the rf to the best combination of parameters
knn = grid_obj.best_estimator_



In [20]:
knn.fit(X_train, y_train)
joblib.dump(knn, 'models/knn.pkl')

['models/knn.pkl']

In [21]:
predictions = knn.predict(X_test)
print(accuracy_score(y_test, predictions))

0.7262569832402235


In [22]:
knn.predict(X_test_kaggle)  # accuracy is 0.61722

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,

In [23]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

logreg.fit(X_train, y_train)
predictions = logreg.predict(X_test)
print(accuracy_score(y_test, predictions))

0.7877094972067039




In [24]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
logreg.predict(X_test_kaggle) # 0.77033

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [26]:
joblib.dump(logreg, 'models/logistic_regression.pkl')

['models/logistic_regression.pkl']