In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import math

In [2]:
# Read the labelled training split and the unlabelled test split from the
# working directory.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [3]:
# Summary statistics of the training frame; kept as a bare last expression so
# the notebook renders the resulting table as the cell output.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Preview the first rows of the raw training data (all columns, before any
# dropping or encoding).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Partition the training rows by outcome: Survived == 1 versus Survived == 0.
survived, died = (train[train["Survived"] == flag] for flag in (1, 0))

In [6]:
# Summary statistics restricted to the passengers with Survived == 1.
survived.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,342.0,342.0,342.0,290.0,342.0,342.0,342.0
mean,444.368421,1.0,1.950292,28.34369,0.473684,0.464912,48.395408
std,252.35884,0.0,0.863321,14.950952,0.708688,0.771712,66.596998
min,2.0,1.0,1.0,0.42,0.0,0.0,0.0
25%,250.75,1.0,1.0,19.0,0.0,0.0,12.475
50%,439.5,1.0,2.0,28.0,0.0,0.0,26.0
75%,651.5,1.0,3.0,36.0,1.0,1.0,57.0
max,890.0,1.0,3.0,80.0,4.0,5.0,512.3292


In [7]:
# Summary statistics restricted to the passengers with Survived == 0.
died.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,549.0,549.0,549.0,424.0,549.0,549.0,549.0
mean,447.016393,0.0,2.531876,30.626179,0.553734,0.32969,22.117887
std,260.640469,0.0,0.735805,14.17211,1.288399,0.823166,31.388207
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,211.0,0.0,2.0,21.0,0.0,0.0,7.8542
50%,455.0,0.0,3.0,28.0,0.0,0.0,10.5
75%,675.0,0.0,3.0,39.0,1.0,0.0,26.0
max,891.0,0.0,3.0,74.0,8.0,6.0,263.0


In [8]:
def _encode_categoricals(frame, unused_columns):
    """Drop ``unused_columns`` and replace the Embarked/Sex labels with integer codes.

    Embarked: S -> 1, C -> 2, Q -> 3; Sex: male -> 1, female -> 2. Values that
    match none of the labels (for example missing entries) are left untouched.
    Returns a new frame; the frame passed in is not modified.
    """
    encoded = frame.drop(columns=unused_columns)
    label_sets = (("Embarked", ("S", "C", "Q")), ("Sex", ("male", "female")))
    for column, labels in label_sets:
        for code, label in enumerate(labels, start=1):
            # np.where keeps the current value wherever the label does not match.
            encoded[column] = np.where(encoded[column] == label, code, encoded[column])
    return encoded


train = _encode_categoricals(train, ["Cabin", "Ticket", "Name", "PassengerId"])

# Keep the ids aside before they are dropped; the submission files need them.
passenger_ids = test["PassengerId"]
test = _encode_categoricals(test, ["Cabin", "Name", "Ticket", "PassengerId"])

In [9]:
# Imputation values are estimated on the training split only and then applied
# unchanged to both splits, so train and test are preprocessed identically.
mean_age = np.mean(train["Age"])
mean_fare = np.mean(train["Fare"])

# Training split: fill missing ages, then drop the rows with no Embarked value.
train["Age"] = train["Age"].fillna(mean_age)
train = train.dropna(subset=["Embarked"])

# Test split: use exactly the same fill values as the training split. (The
# test ages were previously filled with round(mean_age), which gave the two
# splits different imputed values for the same feature.)
test["Age"] = test["Age"].fillna(mean_age)
test["Fare"] = test["Fare"].fillna(mean_fare)

In [10]:
# Rebuild the per-outcome subsets from the current state of ``train`` so they
# reflect the cleaned frame rather than the raw one.
outcome = train["Survived"]
survived = train[outcome == 1]
died = train[outcome == 0]

In [11]:
# Target vector and feature matrix for the scikit-learn models; the test frame
# is used as-is for prediction.
y_train = train["Survived"]
X_train = train.drop("Survived", axis=1)
x_test = test

# Decision tree classifier

In [12]:
# Depth- and leaf-size-limited entropy tree. random_state is pinned so that
# ties between equally good splits are broken the same way on every run and
# the fitted tree (and the submission built from it) is reproducible.
tree = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=10,
    criterion="entropy",
    random_state=42,
)
tree.fit(X_train, y_train)

# NOTE: everything below is measured on the training data itself, so these
# figures describe the fit, not the generalisation error.
train_predict = tree.predict(X_train)
train_errs = np.sum(train_predict != y_train)
conf_mat = confusion_matrix(y_train, train_predict)
classification_err_percent = train_errs / len(y_train) * 100
print(conf_mat)
print(f"Classification error percentage: {classification_err_percent} %")
print(tree.score(X_train, y_train))

[[510  39]
 [ 87 253]]
Classification error percentage: 14.173228346456693 %
0.8582677165354331


In [13]:
# Predict the test split with the decision tree and write the submission file.
# ``passenger_ids`` becomes a two-column frame here; later cells overwrite its
# "Survived" column with other models' predictions.
predictions = tree.predict(x_test)
passenger_ids = pd.DataFrame(passenger_ids).assign(Survived=predictions)
passenger_ids.to_csv("submission.csv", index=False)

# Naive Bayes

In [14]:
# Class sizes and the corresponding prior probabilities for Naive Bayes.
n_survived = len(survived)
n_died = len(died)
n_total = len(train)
prob_survived = n_survived / n_total
prob_died = n_died / n_total

In [15]:
# Three independent empty dicts. NOTE(review): nothing in the visible notebook
# reads these before the names are rebound inside the Naive Bayes prediction
# loop, so they look like leftovers - confirm before relying on them.
pclass, sex, embarked = {}, {}, {}

In [16]:
# Frequency table of every discrete feature, computed separately for the
# survivors (suffix _s) and the non-survivors (suffix _d).
pclass_counts_s, pclass_counts_d = (
    group["Pclass"].value_counts() for group in (survived, died)
)
sex_counts_s, sex_counts_d = (
    group["Sex"].value_counts() for group in (survived, died)
)
embarked_counts_s, embarked_counts_d = (
    group["Embarked"].value_counts() for group in (survived, died)
)
sibsp_counts_s, sibsp_counts_d = (
    group["SibSp"].value_counts() for group in (survived, died)
)
parch_counts_s, parch_counts_d = (
    group["Parch"].value_counts() for group in (survived, died)
)

In [17]:
# One empty likelihood table per (feature, outcome) pair; the tables are filled
# in by the following cells. Key order is kept as pclass, embarked, sex.
probabilities = {
    "pclass_survived": {},
    "pclass_died": {},
    "embarked_survived": {},
    "embarked_died": {},
    "sex_survived": {},
    "sex_died": {},
}

In [18]:
# Pclass and the encoded Embarked column both use the codes 1, 2 and 3.
# Each likelihood is the share of the class that has that code; the tables are
# keyed by the stringified code.
three_codes = range(1, 4)
probabilities["pclass_survived"].update(
    {str(code): pclass_counts_s[code] / n_survived for code in three_codes}
)
probabilities["pclass_died"].update(
    {str(code): pclass_counts_d[code] / n_died for code in three_codes}
)
probabilities["embarked_survived"].update(
    {str(code): embarked_counts_s[code] / n_survived for code in three_codes}
)
probabilities["embarked_died"].update(
    {str(code): embarked_counts_d[code] / n_died for code in three_codes}
)

In [19]:
# The encoded Sex column uses the codes 1 (male) and 2 (female); store the
# share of each class that has each code, keyed by the stringified code.
sex_codes = range(1, 3)
probabilities["sex_survived"].update(
    {str(code): sex_counts_s[code] / n_survived for code in sex_codes}
)
probabilities["sex_died"].update(
    {str(code): sex_counts_d[code] / n_died for code in sex_codes}
)

Treating `SibSp` and `Parch` as categorical features (this variant is commented out below)

In [20]:
# Disabled: in-place sorting of the SibSp/Parch frequency tables by category
# value. The whole cell is commented out and therefore has no effect.
# sibsp_counts_d.sort_index(inplace=True)
# sibsp_counts_s.sort_index(inplace=True)
# parch_counts_s.sort_index(inplace=True)
# parch_counts_d.sort_index(inplace=True)

In [21]:
# Disabled alternative: per-category likelihood tables for Parch and SibSp,
# built from the value counts the same way as the Pclass/Embarked/Sex tables.
# The whole cell is commented out and therefore has no effect.
# probabilities["parch_survived"] = {}
# probabilities["parch_died"] = {}
# probabilities["sibsp_survived"] = {}
# probabilities["sibsp_died"] = {}

# for i in range(len(parch_counts_d)):
#     probabilities["parch_died"][str(parch_counts_d.index[i])] = parch_counts_d[parch_counts_d.index[i]] / n_died

# NOTE(review): the key below uses the fixed position index[4] while the value
# uses index[i]; the three sibling loops use index[i] for both, so this looks
# like a typo that would keep overwriting a single key - fix before re-enabling.
# for i in range(len(parch_counts_s)):
#     probabilities["parch_survived"][str(parch_counts_s.index[4])] = parch_counts_s[parch_counts_s.index[i]] / n_survived

# for i in range(len(sibsp_counts_d)):
#     probabilities["sibsp_died"][str(sibsp_counts_d.index[i])] = sibsp_counts_d[sibsp_counts_d.index[i]] / n_died

# for i in range(len(sibsp_counts_s)):
#     probabilities["sibsp_survived"][str(sibsp_counts_s.index[i])] = sibsp_counts_s[sibsp_counts_s.index[i]] / n_survived


Treating `SibSp` and `Parch` as numerical features

In [22]:
# Per-class mean and population standard deviation (ddof=0, which is what
# np.std uses by default) of the Parch and SibSp columns.
survived_mean_parch = survived["Parch"].mean()
died_mean_parch = died["Parch"].mean()
survived_std_parch = survived["Parch"].std(ddof=0)
died_std_parch = died["Parch"].std(ddof=0)

survived_mean_sibsp = survived["SibSp"].mean()
died_mean_sibsp = died["SibSp"].mean()
survived_std_sibsp = survived["SibSp"].std(ddof=0)
died_std_sibsp = died["SibSp"].std(ddof=0)

In [23]:
def get_probability(mean, std, x):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Parameters:
        mean: centre of the distribution.
        std: standard deviation of the distribution; must be strictly positive.
        x: point at which the density is evaluated.

    Returns:
        The density ``1 / (sqrt(2*pi) * std) * e ** (-(x - mean)**2 / (2 * std**2))``.

    Raises:
        ValueError: if ``std`` is zero, negative or NaN. Such a value previously
            surfaced as a ZeroDivisionError or as a silent NaN/negative result.
    """
    # np.asarray lets the check work for plain numbers and numpy values alike;
    # a NaN compares False and is therefore rejected as well.
    if not np.all(np.asarray(std) > 0):
        raise ValueError(f"std must be strictly positive, got {std!r}")
    exponent = -((x - mean) ** 2) / (2 * (std ** 2))
    normaliser = 1 / (math.sqrt(2 * math.pi) * std)
    return normaliser * math.e ** exponent

In [24]:
# Per-class mean and population standard deviation (ddof=0, which is what
# np.std uses by default) of the Age and Fare columns.
survived_mean_age = survived["Age"].mean()
died_mean_age = died["Age"].mean()
survived_std_age = survived["Age"].std(ddof=0)
died_std_age = died["Age"].std(ddof=0)

survived_mean_fare = survived["Fare"].mean()
died_mean_fare = died["Fare"].mean()
survived_std_fare = survived["Fare"].std(ddof=0)
died_std_fare = died["Fare"].std(ddof=0)

In [25]:
# Dump the categorical likelihood tables; the inner keys are the stringified
# category codes used when the tables were filled.
print(probabilities)

{'pclass_survived': {'1': 0.3941176470588235, '2': 0.25588235294117645, '3': 0.35}, 'pclass_died': {'1': 0.14571948998178508, '2': 0.1766848816029144, '3': 0.6775956284153005}, 'embarked_survived': {'1': 0.638235294117647, '2': 0.2735294117647059, '3': 0.08823529411764706}, 'embarked_died': {'1': 0.7777777777777778, '2': 0.1366120218579235, '3': 0.08561020036429873}, 'sex_survived': {'1': 0.3205882352941177, '2': 0.6794117647058824}, 'sex_died': {'1': 0.8524590163934426, '2': 0.14754098360655737}}


In [26]:
# Naive Bayes prediction for every test passenger.
#
# The score of a class is its prior multiplied by the likelihoods of Embarked,
# Fare, Sex, Pclass and Parch. Age and SibSp are intentionally left out of the
# product (the all-features variant was already disabled), so their
# likelihoods are no longer computed at all.
nb_predictions = []
for passenger in test.itertuples(index=False):
    # Iterating over the rows directly avoids label-based test.loc[i] lookups,
    # which only work while the frame has a default 0..n-1 index.
    # int() normalises the code before it is stringified, so the table lookup
    # does not depend on the value arriving as 3 rather than 3.0.
    pclass_key = str(int(passenger.Pclass))
    sex_key = str(int(passenger.Sex))
    embarked_key = str(int(passenger.Embarked))
    fare = passenger.Fare
    parch = passenger.Parch

    probability_survived = (
        prob_survived
        * probabilities["embarked_survived"][embarked_key]
        * get_probability(survived_mean_fare, survived_std_fare, fare)
        * probabilities["sex_survived"][sex_key]
        * probabilities["pclass_survived"][pclass_key]
        * get_probability(survived_mean_parch, survived_std_parch, parch)
    )
    probability_died = (
        prob_died
        * probabilities["embarked_died"][embarked_key]
        * get_probability(died_mean_fare, died_std_fare, fare)
        * probabilities["pclass_died"][pclass_key]
        * probabilities["sex_died"][sex_key]
        * get_probability(died_mean_parch, died_std_parch, parch)
    )

    # Exactly one prediction per passenger. A tie (or a NaN score) used to
    # append nothing, leaving the list shorter than the test frame; such rows
    # now fall back to 0 (did not survive).
    nb_predictions.append(int(probability_survived > probability_died))


In [27]:
# Swap the Naive Bayes predictions into the "Survived" column of the
# submission frame and write it out.
passenger_ids = passenger_ids.assign(Survived=nb_predictions)
passenger_ids.to_csv("submission_nb.csv", index=False)

# Random Forest

In [28]:
# Random forest of 100 unpruned trees. The model is stochastic (bootstrap
# samples and random feature subsets), so random_state is pinned to make the
# fitted forest, the printed metrics and the submission reproducible.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=3,
    random_state=42,
)
rf.fit(X_train, y_train)

rf_train_preds = rf.predict(X_train)
rf_predictions = rf.predict(x_test)

# NOTE: the metrics below are measured on the training data itself, so they
# describe the fit, not the generalisation error.
rf_train_errs = np.sum(rf_train_preds != y_train)  # computed but not displayed here
rf_cm = confusion_matrix(y_train, rf_train_preds)
print(rf_cm)
print(rf.score(X_train, y_train))

[[542   7]
 [ 15 325]]
0.9752530933633295


In [29]:
# Swap the random-forest predictions into the "Survived" column of the
# submission frame and write it out.
passenger_ids = passenger_ids.assign(Survived=rf_predictions)
passenger_ids.to_csv("submission_rf.csv", index=False)