In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


In [2]:
# importing libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# importing datasets
train_dataset = pd.read_csv("/kaggle/input/titanic/train.csv")
test_dataset = pd.read_csv("/kaggle/input/titanic/test.csv")
y_test_dataset = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")


In [4]:
# creating train/test and x/y splits
x_train = train_dataset.iloc[:, [4, 6, 7]].values
y_train = train_dataset.iloc[:, 1].values
x_test = test_dataset.iloc[:, [3, 5, 6]].values
y_test = y_test_dataset.iloc[:, 1].values

In [5]:
# labeling genders
le1 = LabelEncoder()
x_train[:, 0] = le1.fit_transform(x_train[:, 0])
x_test[:, 0] = le1.transform(x_test[:, 0])

In [6]:
# dealing with nan data
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train_imp = imp.fit_transform(x_train)
x_test_imp = imp.fit_transform(x_test)

In [7]:
# feature scaling
sc = StandardScaler()
x_train_imp = sc.fit_transform(x_train_imp)
x_test_imp = sc.fit_transform(x_test_imp)

In [8]:
# ----------------------------- RANDOM FOREST CLASSIFIER ---------------
# training using entropy criterion
rfc = RandomForestClassifier(n_estimators=100, criterion='entropy',max_depth=5, random_state=1, n_jobs=-1)
rfc.fit(x_train, y_train)

# predicting results
y_pred_rfc = rfc.predict(x_test)

# visualising confusion matrix & computing accuracy score
print('\nRandom Forest Classifier (entropy) results:')
print('confusion matrix = \n' + str(confusion_matrix(y_test, y_pred_rfc)))
print('accuracy = ' + str(accuracy_score(y_test, y_pred_rfc)))

# training using gini criterion
rfc2 = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=5, random_state=1, n_jobs=-1)
rfc2.fit(x_train_imp, y_train)

# predicting results
y_pred_rfc2 = rfc2.predict(x_test_imp)

# visualising confusion matrix & computing accuracy score
print('\nRandom Forest Classifier (gini) results:')
print('confusion matrix = \n' + str(confusion_matrix(y_test, y_pred_rfc2)))
print('accuracy = ' + str(accuracy_score(y_test, y_pred_rfc2)))


Random Forest Classifier (entropy) results:
confusion matrix = 
[[266   0]
 [  7 145]]
accuracy = 0.9832535885167464

Random Forest Classifier (gini) results:
confusion matrix = 
[[266   0]
 [  5 147]]
accuracy = 0.9880382775119617


In [9]:
# ----------------------------- DECISION TREE CLASSIFIER ---------------
# training using entropy criterion
dtc = DecisionTreeClassifier(criterion = 'entropy')
dtc.fit(x_train_imp, y_train)

# predicting results
y_pred_dtc = dtc.predict(x_test_imp)

# visualising confusion matrix & computing accuracy score
print('\nDecision Tree Classifier (entropy) results:')
print('confusion matrix = \n' + str(confusion_matrix(y_test, y_pred_dtc)))
print('accuracy = ' + str(accuracy_score(y_test, y_pred_dtc)))

# training using gini criterion
dtc2 = DecisionTreeClassifier(criterion = 'gini')
dtc2.fit(x_train_imp, y_train)

# predicting results
y_pred_dtc2 = dtc2.predict(x_test_imp)

# visualising confusion matrix & computing accuracy score
print('\nDecision Tree Classifier (gini) results:')
print('confusion matrix = \n' + str(confusion_matrix(y_test, y_pred_dtc2)))
print('accuracy = ' + str(accuracy_score(y_test, y_pred_dtc2)))


Decision Tree Classifier (entropy) results:
confusion matrix = 
[[261   5]
 [  7 145]]
accuracy = 0.9712918660287081

Decision Tree Classifier (gini) results:
confusion matrix = 
[[261   5]
 [  7 145]]
accuracy = 0.9712918660287081


In [10]:
# ----------------------------- K-NN CLASSIFIER ---------------
# training
distance=['manhattan','cosine','euclidean','minkowski']
for j in range(4):
  print(distance[j])
  for i in range(5,16):
    #Setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=i,metric=distance[j])
    
    #Fit the model
    knn.fit(x_train_imp, y_train)
    # Predicting the test set results
    y_pred_knn = knn.predict(x_test_imp)
    print('accuracy of ' + distance[j] +' for '+ str(i) +' neighbor = ' + str(accuracy_score(y_test, y_pred_knn)))
    

#highest accuracy was of manhanton distance at the neighbor count of 10
knn = KNeighborsClassifier(n_neighbors=10,metric='manhattan')
knn.fit(x_train_imp, y_train)
y_pred_knn = knn.predict(x_test_imp)


# visualising confusion matrix & computing accuracy score
print('\nK-Nearest Neighbours results:')
print('confusion matrix = \n' + str(confusion_matrix(y_test, y_pred_knn)))
print('accuracy = ' + str(accuracy_score(y_test, y_pred_knn)))

manhattan
accuracy of manhattan for 5 neighbor = 0.8516746411483254
accuracy of manhattan for 6 neighbor = 0.8349282296650717
accuracy of manhattan for 7 neighbor = 0.8588516746411483
accuracy of manhattan for 8 neighbor = 0.9712918660287081
accuracy of manhattan for 9 neighbor = 0.9688995215311005
accuracy of manhattan for 10 neighbor = 0.9832535885167464
accuracy of manhattan for 11 neighbor = 0.9688995215311005
accuracy of manhattan for 12 neighbor = 0.9808612440191388
accuracy of manhattan for 13 neighbor = 0.9401913875598086
accuracy of manhattan for 14 neighbor = 0.9784688995215312
accuracy of manhattan for 15 neighbor = 0.9784688995215312
cosine
accuracy of cosine for 5 neighbor = 0.9569377990430622
accuracy of cosine for 6 neighbor = 0.9641148325358851
accuracy of cosine for 7 neighbor = 0.9641148325358851
accuracy of cosine for 8 neighbor = 0.9688995215311005
accuracy of cosine for 9 neighbor = 0.9593301435406698
accuracy of cosine for 10 neighbor = 0.9688995215311005
accuracy

In [11]:
# ----------------------------- EXPORTING BEST RESULT ---------------
#best result was produced by RANDOM FOREST CLASSIFIER at criterion='entropy'

output = pd.DataFrame({'PassengerId': test_dataset.PassengerId, 'Survived': y_pred_rfc2 })
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
submition = pd.read_csv("my_submission.csv")
submition.head()

#result = []
#for i in range(892, 1310):
#    row = [i, y_pred_knn[i - 892]]
#    result.append(row)
#import csv
#with open('titanic_result_knn.csv', "w+") as csv_file:
#    csvWriter = csv.writer(csv_file, delimiter = ',')
#    csvWriter.writerows(result)

Your submission was successfully saved!


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
