In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Base NN Model to understand the algorithm

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the Titanic training and test splits from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv("/kaggle/input/titanic/train.csv"),
    pd.read_csv("/kaggle/input/titanic/test.csv"),
)

In [None]:
# Peek at the first five rows of the training data.
train_data.head()

In [None]:
# Correlation matrix over the numeric columns only. At this point the frame
# still contains string columns (e.g. Sex and Embarked are encoded further
# down), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0,
# so those columns are excluded explicitly.
cmatrix = train_data.select_dtypes(include='number').corr()

# Correlation of each numeric column with the target, strongest positive first.
cmatrix['Survived'].sort_values(ascending=False)

The results show that Fare and Pclass are somewhat correlated with Survived, so they are potential candidates for modeling.

In [None]:
 # Mean survival rate for each sex.
 sns.barplot(x='Sex', y='Survived', data=train_data)

In [None]:
 # Mean survival rate for each port of embarkation.
 sns.barplot(x='Embarked', y='Survived', data=train_data)

In [None]:
# Mean survival rate for each passenger class.
sns.barplot(x='Pclass', y='Survived', data=train_data)

In [None]:
# Survival rate by sex, split into one bar per passenger class.
sns.barplot(data=train_data, x='Sex', y='Survived', hue='Pclass').set(
    ylabel="Survival Rate",
    title="Survival as function of Pclass and Sex",
)
plt.show()

In [None]:
# Survival rate by port of embarkation, split into one bar per passenger class.
sns.barplot(data=train_data, x='Embarked', y='Survived', hue='Pclass').set(
    ylabel="Survival Rate",
    title="Survival as function of Embarked Port",
)
plt.show()

In [None]:
# Count the missing values in each column of the training set.
print("NaN values in the DataFrame:")
train_data.isnull().sum()

In [None]:
# Count the missing values in each column of the test set.
print("NaN values in the DataFrame:")
test_data.isnull().sum()

In [None]:
# Convert the categorical variables to numeric model inputs and fill in
# missing values.

from sklearn.preprocessing import LabelEncoder

# One encoder instance, re-fit per column on the TRAINING data only. The test
# column is then transformed with that same fit, so a given category always
# maps to the same integer code in both frames (and an unseen test category
# raises instead of silently getting an unrelated code).
labelencoder = LabelEncoder()

# Embarked: missing values are filled with 'S' before encoding.
train_data['Embarked'] = labelencoder.fit_transform(train_data['Embarked'].fillna('S'))
test_data['Embarked'] = labelencoder.transform(test_data['Embarked'].fillna('S'))

# Sex: missing values are filled with 'female' before encoding.
train_data['Sex'] = labelencoder.fit_transform(train_data['Sex'].fillna('female'))
test_data['Sex'] = labelencoder.transform(test_data['Sex'].fillna('female'))

# Fare is a continuous quantity: impute missing test fares with the training
# mean and keep the values as they are. Label-encoding it would replace every
# fare with an arbitrary rank code.
fare_mean = train_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(fare_mean)

In [None]:
# Drop the columns judged to carry little information about the target.
# PassengerId stays on the test frame because the submission file needs it.
train_data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age'],
    inplace=True,
)
test_data.drop(
    columns=['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age'],
    inplace=True,
)

In [None]:
# Summary statistics of the columns that remain after encoding and dropping.
train_data.describe()

# Nearest Neighbor - Base model using two prominent features: Sex and Pclass

In [None]:
# Leave-one-out k-nearest-neighbour classifier on the two features Sex and Pclass.
def predict_nn(df, k=1):
  """Predict ``Survived`` for every row of ``df`` from its ``k`` nearest other rows.

  For each row, the row itself is left out, the squared Euclidean distance
  in (Sex, Pclass) space to every remaining row is computed, and the most
  common ``Survived`` value among the ``k`` closest rows is the prediction.
  Rows at equal distance are taken in their original order; if several
  ``Survived`` values are equally common, the smallest one is used.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain numeric ``Sex``, ``Pclass`` and ``Survived`` columns.
      It is not modified. Any index is accepted. At least two rows are
      needed for a row to have a neighbour.
  k : int, default 1
      Number of neighbours that vote.

  Returns
  -------
  numpy.ndarray
      One prediction per row of ``df``, in row order (empty for an empty frame).

  Raises
  ------
  ValueError
      If ``k`` is smaller than 1.
  """
  if k < 1:
    raise ValueError("k must be at least 1")

  # Re-number the rows 0..n-1 so that the label dropped below is always the
  # row at position i, whatever index the caller's frame carries.
  data = df.reset_index(drop=True)

  preds = []
  for i in range(len(data)):
    row = data.iloc[i]
    others = data.drop(index=i)
    dist_sqr = (others['Sex'] - row['Sex'])**2 + (others['Pclass'] - row['Pclass'])**2
    nearest_neighbor = others.assign(dist_sqr=dist_sqr).nsmallest(k, 'dist_sqr')
    # mode() can return several values on a tie; always keep the first so
    # every row contributes exactly one scalar prediction.
    preds.append(nearest_neighbor['Survived'].mode().iloc[0])
  return np.array(preds)

In [None]:
# Score predictions against the ground truth.
def accuracy(predictions, actual):
  """Return the fraction of entries in ``predictions`` that equal ``actual``.

  The two inputs are compared element-wise and the number of matches is
  divided by ``len(actual)``.
  """
  hits = predictions == actual
  n_total = len(actual)
  return np.sum(hits) / n_total

In [None]:
# Leave-one-out predictions on the training data; k=5 was settled on by trial and error.
predictions_nn = predict_nn(train_data, k=5)

In [None]:
# Fraction of training rows whose prediction matches the recorded outcome.
accuracy(predictions_nn, train_data['Survived'])

In [None]:
# Two-frame variant: learn from one dataset and predict for another.
# Note: this re-uses the name predict_nn and therefore replaces the
# single-frame (leave-one-out) version defined earlier in the notebook.

def predict_nn(df1, df2, k=1):
  """Predict ``Survived`` for every row of ``df2`` from its ``k`` nearest rows in ``df1``.

  For each row of ``df2`` the squared Euclidean distance in (Sex, Pclass)
  space to every row of ``df1`` is computed, and the most common
  ``Survived`` value among the ``k`` closest rows is the prediction. Rows
  at equal distance are taken in their original order; if several
  ``Survived`` values are equally common, the smallest one is used.

  Parameters
  ----------
  df1 : pandas.DataFrame
      Reference (training) data with numeric ``Sex``, ``Pclass`` and
      ``Survived`` columns. It is not modified.
  df2 : pandas.DataFrame
      Rows to predict for; needs numeric ``Sex`` and ``Pclass`` columns.
  k : int, default 1
      Number of neighbours that vote.

  Returns
  -------
  numpy.ndarray
      One prediction per row of ``df2``, in row order (empty if ``df2`` is empty).

  Raises
  ------
  ValueError
      If ``k`` is smaller than 1.
  """
  if k < 1:
    raise ValueError("k must be at least 1")

  # Only these columns are needed; selecting them once keeps the per-row
  # working copy small and leaves the caller's frame free of helper columns.
  reference = df1[['Sex', 'Pclass', 'Survived']]

  preds = []
  for i in range(len(df2)):
    row = df2.iloc[i]
    dist_sqr = (reference['Sex'] - row['Sex'])**2 + (reference['Pclass'] - row['Pclass'])**2
    nearest_neighbor = reference.assign(dist_sqr=dist_sqr).nsmallest(k, 'dist_sqr')
    # mode() can return several values on a tie; always keep the first so
    # every row contributes exactly one scalar prediction.
    preds.append(nearest_neighbor['Survived'].mode().iloc[0])
  return np.array(preds)

In [None]:
# Predict the test-set outcomes from the training data.
# NOTE(review): k=3 here, while the training-set evaluation above used k=5 — confirm this is intended.
predictions_nn = predict_nn(train_data, test_data, k=3)

In [None]:
# Assemble the submission table: PassengerId first, then the predicted outcome.
prediction = pd.DataFrame({'Survived': predictions_nn})
prediction.insert(loc=0, column='PassengerId', value=test_data['PassengerId'])
prediction.head()

In [None]:
# Write the predictions to a comma-separated submission file with a header row and no index column.
prediction.to_csv('submission.csv', index=False)