In [73]:
# Importing the needed modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model, tree, model_selection, ensemble, svm
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [2]:
# Load the data
titanic_data_train = pd.read_csv("train.csv")
titanic_data_test = pd.read_csv("test.csv")
titanic_data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic_data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
titanic_data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
titanic_data_train = titanic_data_train.drop(["Name","Ticket"], axis=1) 
titanic_data_test = titanic_data_test.drop(["Name","Ticket"], axis=1) 
titanic_data_train.head()
titanic_data_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,7.8292,,Q
1,893,3,female,47.0,1,0,7.0,,S
2,894,2,male,62.0,0,0,9.6875,,Q
3,895,3,male,27.0,0,0,8.6625,,S
4,896,3,female,22.0,1,1,12.2875,,S


In [6]:
#Impute the Empty Ages with median values
age_median = (titanic_data_train["Age"].median() + titanic_data_test["Age"].median())/2
titanic_data_train["Age"].fillna(age_median, inplace=True) 
titanic_data_test["Age"].fillna(age_median, inplace=True) 

fare_median = (titanic_data_train["Fare"].median() + titanic_data_test["Fare"].median())/2
titanic_data_train["Fare"].fillna(age_median, inplace=True) 
titanic_data_test["Fare"].fillna(age_median, inplace=True) 

In [7]:
#Encode the Embarked from S, C, Q to 0, 1 , 2
#fill the empty with S

titanic_data_train["Embarked"].fillna("S", inplace=True)
titanic_data_train.loc[titanic_data_train["Embarked"] == "S", "Embarked"] = 0
titanic_data_train.loc[titanic_data_train["Embarked"] == "C", "Embarked"] = 1
titanic_data_train.loc[titanic_data_train["Embarked"] == "Q", "Embarked"] = 2

titanic_data_test.loc[titanic_data_test["Embarked"] == "S", "Embarked"] = 0
titanic_data_test.loc[titanic_data_test["Embarked"] == "C", "Embarked"] = 1
titanic_data_test.loc[titanic_data_test["Embarked"] == "Q", "Embarked"] = 2

In [8]:
#Encode the female to 0 and male to 1
titanic_data_train.loc[titanic_data_train["Sex"] =="male", "Sex"] = 0
titanic_data_train.loc[titanic_data_train["Sex"] =="female", "Sex"] = 1

titanic_data_test.loc[titanic_data_test["Sex"] =="male", "Sex"] = 0
titanic_data_test.loc[titanic_data_test["Sex"] =="female", "Sex"] = 1


In [9]:
#Remove the Cabin numbers from the Data
titanic_data_train = titanic_data_train.drop('Cabin', axis=1)
titanic_data_test = titanic_data_test.drop('Cabin', axis=1)

In [10]:
features = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]
target = ["Survived"]

titanic_data_train_features = titanic_data_train[features].values
titanic_data_test_features = titanic_data_test[features].values

titanic_data_train_target = titanic_data_train["Survived"].values

In [11]:
# helper function to print write prediction
def write_prediction(prediction, name):
    PassengerId = np.array(titanic_data_test["PassengerId"]).astype(int)
    solution = pd.DataFrame(prediction, PassengerId, columns = ["Survived"])
    solution.to_csv(name, index_label = ["PassengerId"])

In [12]:
# helper function to print two list with nice view
def print_lists(first_list, second_list):
    for i, j in zip (first_list, second_list):
        print(i , j)

In [32]:
# Decision Tree
decision_tree = tree.DecisionTreeClassifier(
    max_depth = 7,
    min_samples_split = 2,
    random_state = 1)
decision_tree = decision_tree.fit(titanic_data_train_features, titanic_data_train_target)

print_lists(features, list(decision_tree.feature_importances_))
print("Score: " + str(decision_tree.score(titanic_data_train_features, titanic_data_train_target)))
tree.export_graphviz(decision_tree, feature_names=features, out_file="./decision_tree.dot")
prediction = decision_tree.predict(titanic_data_test_features)
write_prediction(prediction, "./decision_tree.csv")

Pclass 0.159915828314649
Age 0.14960708482898344
Sex 0.470987491327831
Fare 0.14388903138357645
SibSp 0.056135843616143176
Parch 0.013881350758776695
Embarked 0.005583369770040022
Score: 0.8832772166105499


In [33]:
dtree_grid = tree.DecisionTreeClassifier(random_state = 1)
max_depth_grid = [5, 6, 7 ,8, 9]
min_samples_split_grid = [2, 3, 4, 6, 7]
param_grid = dict(max_depth = max_depth_grid, min_samples_split= min_samples_split_grid)
grid = GridSearchCV(estimator=dtree_grid, param_grid=param_grid, cv = 3, n_jobs=-1)
grid_result = grid.fit(titanic_data_train_features, titanic_data_train_target)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.809203 using {'max_depth': 6, 'min_samples_split': 4}


In [39]:
# Random Forest
forest = ensemble.RandomForestClassifier(
    max_depth = 7,
    min_samples_split = 4,
    n_estimators = 1000,
    random_state = 1,
    n_jobs = -1
)
forest = forest.fit(titanic_data_train_features, titanic_data_train_target)
print_lists(features, list(forest.feature_importances_))
print("Score: " + str(forest.score(titanic_data_train_features, titanic_data_train_target)))
prediction_forest = forest.predict(titanic_data_test_features)
write_prediction(prediction_forest, "./random_forest.csv")

Pclass 0.12600499406620502
Age 0.14560408593574958
Sex 0.4022495789273548
Fare 0.19800176188588509
SibSp 0.054664999287126446
Parch 0.04046247998908714
Embarked 0.033012099908591976
Score: 0.8843995510662177


In [36]:
forest_grid = ensemble.RandomForestClassifier()
n_estimators_grid = [10, 50, 200, 1000, 2500, 5000]
max_depth_grid = [5,6,7,8,9]
param_grid = dict(n_estimators = n_estimators_grid, max_depth= max_depth_grid)
forest_grid = GridSearchCV(estimator=forest_grid, param_grid=param_grid, cv = 3, n_jobs=-1)
forest_grid_result = forest_grid.fit(titanic_data_train_features, titanic_data_train_target)
print("Best: %f using %s" % (forest_grid_result.best_score_, forest_grid_result.best_params_))

Best: 0.827160 using {'max_depth': 7, 'n_estimators': 50}


In [20]:
# Support Vector Machine
support_vector_machine = svm.SVR(kernel='linear')
support_vector_machine.fit(titanic_data_train_features, titanic_data_train_target)
print("Score: " + str(support_vector_machine.score(titanic_data_train_features, titanic_data_train_target)))
support_vector_machine_prediction = support_vector_machine.predict(titanic_data_test_features)
write_prediction(support_vector_machine_prediction, "./support_vector_machine.csv")

Score: 0.24255126879148092


In [65]:
#Logistic Regression
lr = LogisticRegression(penalty='l2')
dual_grid=[True,False]
max_iter_grid=[200, 300, 400]
lr_param_grid = dict(dual=dual_grid, max_iter=max_iter_grid)
lr_grid = GridSearchCV(estimator=lr, param_grid=lr_param_grid, cv = 3, n_jobs=-1)
lr_grid_result = lr_grid.fit(titanic_data_train_features, titanic_data_train_target)
print("Best: %f using %s" % (lr_grid_result.best_score_, lr_grid_result.best_params_))

#print("Score: " + str(support_vector_machine.score(titanic_data_train_features, titanic_data_train_target)))
lr_prediction = lr_grid_result.predict(titanic_data_test_features)
write_prediction(lr_prediction, "./logistic_regression_grid.csv")


Best: 0.791246 using {'dual': True, 'max_iter': 300}




In [68]:
#Logistic Regression
lr = LogisticRegression(penalty='l2')
lr_result = lr.fit(titanic_data_train_features, titanic_data_train_target)

print("Score: " + str(lr_result.score(titanic_data_train_features, titanic_data_train_target)))
lr_prediction = lr_grid_result.predict(titanic_data_test_features)
write_prediction(lr_prediction, "./logistic_regression.csv")


Score: 0.7991021324354658




In [72]:
# KNN
knn = KNeighborsClassifier()
knn = knn.fit(titanic_data_train_features, titanic_data_train_target)

print("Score: " + str(knn.score(titanic_data_train_features, titanic_data_train_target)))
knn_prediction = knn.predict(titanic_data_test_features)
write_prediction(knn_prediction, "./knn.csv")

Score: 0.797979797979798


In [79]:
#XGboost
xgb = XGBClassifier()
xgb = xgb.fit(titanic_data_train_features, titanic_data_train_target)
print("Score: " + str(xgb.score(titanic_data_train_features, titanic_data_train_target)))
xgb_prediction = xgb.predict(titanic_data_test_features)
write_prediction(xgb_prediction, "./xgb.csv")

Score: 0.8731762065095399
