In [None]:
# Importing the needed modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model, tree, model_selection, ensemble, svm
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [None]:
# Load the data: read the train and test CSV files into two DataFrames
titanic_data_train, titanic_data_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./datasets/train.csv", "./datasets/test.csv")
)
# Preview the first rows of the training frame
titanic_data_train.head()

In [None]:
# Print pandas' summary of the training frame
titanic_data_train.info()

In [None]:
# Print pandas' summary of the test frame
titanic_data_test.info()

In [3]:
# Name and Ticket are not used as features further down, so remove them from both frames
unused_text_columns = ["Name", "Ticket"]
titanic_data_train = titanic_data_train.drop(columns=unused_text_columns)
titanic_data_test = titanic_data_test.drop(columns=unused_text_columns)

In [4]:
# Impute the empty Age and Fare cells with the mean of the train and test medians.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and does not modify the frame under copy-on-write.
age_median = (titanic_data_train["Age"].median() + titanic_data_test["Age"].median()) / 2
titanic_data_train["Age"] = titanic_data_train["Age"].fillna(age_median)
titanic_data_test["Age"] = titanic_data_test["Age"].fillna(age_median)

# Missing fares must be filled with the fare median (the age median was used here before)
fare_median = (titanic_data_train["Fare"].median() + titanic_data_test["Fare"].median()) / 2
titanic_data_train["Fare"] = titanic_data_train["Fare"].fillna(fare_median)
titanic_data_test["Fare"] = titanic_data_test["Fare"].fillna(fare_median)

In [5]:
# Encode Embarked from S, C, Q to 0, 1, 2.
# Empty cells are filled with "S" first, in BOTH frames (previously only the
# training frame was filled). Mapping through a dict and casting to int gives a
# numeric column; the former .loc overwrites left the column with object dtype.
# Any value other than S/C/Q becomes NaN and makes astype(int) raise, so an
# unexpected port code is reported here instead of at model-fitting time.
embarked_codes = {"S": 0, "C": 1, "Q": 2}

titanic_data_train["Embarked"] = (
    titanic_data_train["Embarked"].fillna("S").map(embarked_codes).astype(int)
)
titanic_data_test["Embarked"] = (
    titanic_data_test["Embarked"].fillna("S").map(embarked_codes).astype(int)
)

In [6]:
# Encode Sex as integers: male -> 0, female -> 1
# (the previous comment stated the opposite of what the code does).
# Mapping through a dict and casting to int yields a numeric column; the former
# .loc overwrites left the column with object dtype. A value other than
# "male"/"female" becomes NaN and makes astype(int) raise.
sex_codes = {"male": 0, "female": 1}

titanic_data_train["Sex"] = titanic_data_train["Sex"].map(sex_codes).astype(int)
titanic_data_test["Sex"] = titanic_data_test["Sex"].map(sex_codes).astype(int)


In [7]:
# Add a "relatives" column to both frames: the sum of SibSp and Parch
for frame in (titanic_data_train, titanic_data_test):
    frame["relatives"] = frame["SibSp"].add(frame["Parch"])

In [8]:
# Replace the float Fare with an integer band 0-5.
# Bands are right-inclusive: <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2,
# (31, 99] -> 3, (99, 250] -> 4, >250 -> 5.
fare_edges = [-np.inf, 7.91, 14.454, 31, 99, 250, np.inf]
for frame in (titanic_data_train, titanic_data_test):
    # labels=False returns the positional index of the band each fare falls into
    fare_band = pd.cut(frame["Fare"], bins=fare_edges, labels=False)
    frame["Fare"] = fare_band.astype(int)

In [9]:
# Replace Age with an integer band 0-6, computed on the age truncated to whole years.
# Bands are right-inclusive: <=11 -> 0, (11, 18] -> 1, (18, 22] -> 2, (22, 27] -> 3,
# (27, 33] -> 4, (33, 40] -> 5, >40 -> 6.
# NOTE(review): the original rules mapped both (40, 66] and >66 to 6, so every age
# above 40 shares one band; that is kept as-is here - confirm whether a separate
# band for >66 was intended.
age_edges = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]
for frame in (titanic_data_train, titanic_data_test):
    whole_years = frame["Age"].astype(int)
    # labels=False returns the positional index of the band each age falls into
    frame["Age"] = pd.cut(whole_years, bins=age_edges, labels=False).astype(int)

In [None]:
# Remove the Cabin column from both frames
titanic_data_train, titanic_data_test = (
    frame.drop(columns="Cabin")
    for frame in (titanic_data_train, titanic_data_test)
)

In [10]:
# Create an additional interaction feature in both frames:
# Age_Class = Age * Pclass
for frame in (titanic_data_train, titanic_data_test):
    frame["Age_Class"] = frame["Age"].mul(frame["Pclass"])

In [None]:
# Preview both engineered frames. A notebook cell only renders its LAST bare
# expression, so the training preview was silently discarded before; display()
# (provided by the IPython kernel) renders both.
display(titanic_data_train.head(), titanic_data_test.head())

In [11]:
# Columns fed to every model, and the name of the label column
features = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Embarked", "Age_Class", "relatives"]
target = ["Survived"]

# Feature matrices as NumPy arrays, columns in the order listed in `features`
titanic_data_train_features = titanic_data_train.loc[:, features].to_numpy()
titanic_data_test_features = titanic_data_test.loc[:, features].to_numpy()

# 1-D label array for the training rows
titanic_data_train_target = titanic_data_train.loc[:, "Survived"].to_numpy()

In [12]:
# helper function to write a prediction to a CSV file
def write_prediction(prediction, name):
    """Write ``prediction`` to the CSV file ``name``.

    The file gets two columns: ``PassengerId`` (taken from the notebook-level
    ``titanic_data_test`` frame and used as the index) and ``Survived``
    (the values of ``prediction``).

    Parameters
    ----------
    prediction : array-like
        One value per row of ``titanic_data_test``, in the same row order.
    name : str
        Path of the CSV file to create. Missing parent folders are created.
    """
    from pathlib import Path  # local import keeps the helper self-contained

    # to_csv does not create missing folders, so make sure the target folder
    # (e.g. ./predictions) exists before writing.
    Path(name).parent.mkdir(parents=True, exist_ok=True)

    PassengerId = np.array(titanic_data_test["PassengerId"]).astype(int)
    solution = pd.DataFrame(prediction, PassengerId, columns = ["Survived"])
    solution.to_csv(name, index_label = ["PassengerId"])

In [13]:
# helper function to print two lists side by side
def print_lists(first_list, second_list):
    """Print the items of both lists pairwise, one space-separated pair per line.

    Pairing stops at the end of the shorter list.
    """
    for pair in zip(first_list, second_list):
        print(*pair)

In [None]:
# Decision Tree: fit on the full training set, report, export and predict
decision_tree = tree.DecisionTreeClassifier(random_state=1, min_samples_split=2, max_depth=7)
decision_tree.fit(titanic_data_train_features, titanic_data_train_target)

# Importance of each feature, printed next to its name
tree_importances = list(decision_tree.feature_importances_)
print_lists(features, tree_importances)

# Score measured on the same rows the tree was fitted on
tree_train_score = decision_tree.score(titanic_data_train_features, titanic_data_train_target)
print("Score: " + str(tree_train_score))

# Dump the fitted tree in Graphviz .dot format
tree.export_graphviz(decision_tree, out_file="./decision_tree.dot", feature_names=features)

# Predict the test rows and write them to a CSV file
prediction = decision_tree.predict(titanic_data_test_features)
write_prediction(prediction, "./decision_tree.csv")

In [None]:
# Grid search over decision-tree depth and split size with 3-fold cross-validation
dtree_grid = tree.DecisionTreeClassifier(random_state=1)
param_grid = {
    "max_depth": [5, 6, 7, 8, 9],
    "min_samples_split": [2, 3, 4, 6, 7],
}
grid = GridSearchCV(dtree_grid, param_grid, cv=3, n_jobs=-1)
grid_result = grid.fit(titanic_data_train_features, titanic_data_train_target)

# Best cross-validated score and the parameters that produced it
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

In [None]:
# Random Forest
# max_features="sqrt" replaces the former "auto": for classifiers "auto" was an
# alias of "sqrt" and it has been removed from current scikit-learn releases,
# where passing it raises an error.
random_forest = ensemble.RandomForestClassifier(criterion = "gini",
                                       min_samples_leaf = 1,
                                       min_samples_split = 10,
                                       n_estimators=100,
                                       max_features='sqrt',
                                       oob_score=True,
                                       random_state=1,
                                       n_jobs=-1)
random_forest = random_forest.fit(titanic_data_train_features, titanic_data_train_target)
print_lists(features, list(random_forest.feature_importances_))
# Score on the same rows the forest was fitted on
print("Score: " + str(random_forest.score(titanic_data_train_features, titanic_data_train_target)))
# oob_score=True makes the forest compute an out-of-bag estimate; report it
# instead of leaving it unused.
print("OOB score: " + str(random_forest.oob_score_))
prediction_forest = random_forest.predict(titanic_data_test_features)
write_prediction(prediction_forest, "./predictions/random_forest.csv")

In [None]:
# Grid search over forest size and depth with 3-fold cross-validation.
# The base forest is seeded (random_state=1, as in the other models of this
# notebook) so the reported best score/parameters are reproducible; it also gets
# its own name instead of sharing `forest_grid` with the search object.
forest_estimator = ensemble.RandomForestClassifier(random_state=1)
n_estimators_grid = [10, 50, 200, 1000, 2500]
max_depth_grid = [5,6,7]
param_grid = dict(n_estimators = n_estimators_grid, max_depth= max_depth_grid)
forest_grid = GridSearchCV(estimator=forest_estimator, param_grid=param_grid, cv = 3, n_jobs=-1)
forest_grid_result = forest_grid.fit(titanic_data_train_features, titanic_data_train_target)
print("Best: %f using %s" % (forest_grid_result.best_score_, forest_grid_result.best_params_))

In [None]:
# Support Vector Machine
# Survived is a class label, so use the classifier (SVC). The former SVR is a
# regressor: its score() is R^2 rather than accuracy and its predictions are
# continuous values, which were written to the CSV as if they were labels.
support_vector_machine = svm.SVC(kernel='linear')
support_vector_machine.fit(titanic_data_train_features, titanic_data_train_target)
# Accuracy on the same rows the model was fitted on
print("Score: " + str(support_vector_machine.score(titanic_data_train_features, titanic_data_train_target)))
support_vector_machine_prediction = support_vector_machine.predict(titanic_data_test_features)
write_prediction(support_vector_machine_prediction, "./predictions/support_vector_machine.csv")

In [None]:
# Logistic Regression: grid search over the dual formulation and max_iter
# The solver is pinned to liblinear because the grid contains dual=True, and the
# dual formulation is only implemented for the l2 penalty with that solver; with
# the library's current default solver every dual=True candidate fails to fit.
lr = LogisticRegression(penalty='l2', solver='liblinear')
dual_grid=[True,False]
max_iter_grid=[200, 300, 400]
lr_param_grid = dict(dual=dual_grid, max_iter=max_iter_grid)
lr_grid = GridSearchCV(estimator=lr, param_grid=lr_param_grid, cv = 3, n_jobs=-1)
lr_grid_result = lr_grid.fit(titanic_data_train_features, titanic_data_train_target)
print("Best: %f using %s" % (lr_grid_result.best_score_, lr_grid_result.best_params_))

# Predict the test rows with the search's best estimator and write them out
lr_prediction = lr_grid_result.predict(titanic_data_test_features)
write_prediction(lr_prediction, "./predictions/logistic_regression_grid.csv")


In [None]:
# Logistic Regression with default hyper-parameters (no grid search)
lr = LogisticRegression(penalty='l2')
lr_result = lr.fit(titanic_data_train_features, titanic_data_train_target)

# Score on the same rows the model was fitted on
print("Score: " + str(lr_result.score(titanic_data_train_features, titanic_data_train_target)))
# Predict with the model fitted in THIS cell. The grid-search model from the
# previous cell was used here before, so logistic_regression.csv did not contain
# this model's predictions and the cell failed unless the grid search had run.
lr_prediction = lr_result.predict(titanic_data_test_features)
write_prediction(lr_prediction, "./predictions/logistic_regression.csv")


In [None]:
# KNN: k-nearest-neighbours classifier with the library's default settings
knn = KNeighborsClassifier()
knn.fit(titanic_data_train_features, titanic_data_train_target)

# Score measured on the same rows the model was fitted on
knn_train_score = knn.score(titanic_data_train_features, titanic_data_train_target)
print("Score: " + str(knn_train_score))

# Predict the test rows and write them to a CSV file
knn_prediction = knn.predict(titanic_data_test_features)
write_prediction(knn_prediction, "./predictions/knn.csv")

In [14]:
# XGBoost: grid search over the hyper-parameters below with 3-fold cross-validation
xgb_hyperparameters = dict(
    learning_rate=[0.05, 0.15],
    max_depth=[3, 5, 15],
    min_child_weight=[1, 3, 5],
    gamma=[0.2, 0.4],
    colsample_bytree=[0.3, 0.5],
)
xgb = XGBClassifier()
xgb_grid = GridSearchCV(xgb, xgb_hyperparameters, cv=3)
xgb_grid_result = xgb_grid.fit(titanic_data_train_features, titanic_data_train_target)

# Best cross-validated score and the parameters that produced it
xgb_best_summary = (xgb_grid_result.best_score_, xgb_grid_result.best_params_)
print("Best: %f using %s" % xgb_best_summary)

Best: 0.828283 using {'colsample_bytree': 0.5, 'gamma': 0.2, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 3}


In [19]:
# XGBoost refit with fixed hyper-parameters (the values printed by the grid-search cell)
xgb_fixed_params = {
    "colsample_bytree": 0.5,
    "gamma": 0.2,
    "learning_rate": 0.05,
    "max_depth": 3,
    "min_child_weight": 3,
}
xgb = XGBClassifier(**xgb_fixed_params)
xgb.fit(titanic_data_train_features, titanic_data_train_target)

# Score measured on the same rows the model was fitted on
xgb_train_score = xgb.score(titanic_data_train_features, titanic_data_train_target)
print("Score: " + str(xgb_train_score))

# Predict the test rows and write them to a CSV file
xgb_prediction = xgb.predict(titanic_data_test_features)
write_prediction(xgb_prediction, "./predictions/xgb.csv")

Score: 0.8473625140291807


In [None]:
# 3-fold cross-validated predictions of the random forest on the training rows,
# followed by their confusion matrix against the true labels
predictions = cross_val_predict(
    random_forest,
    X=titanic_data_train_features,
    y=titanic_data_train_target,
    cv=3,
)
confusion_matrix(y_true=titanic_data_train_target, y_pred=predictions)

In [None]:
# Precision and recall of the cross-validated predictions against the true labels
cv_precision = precision_score(titanic_data_train_target, predictions)
cv_recall = recall_score(titanic_data_train_target, predictions)
print("Precision:", cv_precision)
print("Recall:", cv_recall)