In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load the Titanic training and test splits from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

In [4]:
# Preview the first five rows of the training data.
train_data.head()

In [5]:
# Correlation matrix over the numeric columns only. train_data still holds
# text columns at this point, and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0, so the numeric dtypes are selected explicitly.
cmatrix = train_data.select_dtypes(include='number').corr()
# Rank every numeric column by its correlation with the target.
cmatrix['Survived'].sort_values(ascending=False)

The results show that Fare and Pclass are somewhat correlated with Survived, so they are potential candidates for modeling.

In [None]:
 sns.barplot(data=train_data, x='Sex', y='Survived', )

In [None]:
 sns.barplot(data=train_data, x='Embarked', y='Survived', )

In [None]:
# Survival rate by passenger class (each bar is the mean of Survived within
# the group).
sns.barplot(data=train_data, x='Pclass', y='Survived')
plt.ylabel("Survival Rate")
plt.title("Survival as function of Pclass")
plt.show()

In [None]:
# Survival rate by sex, split by passenger class.
sex_class_ax = sns.barplot(data=train_data, x='Sex', y='Survived', hue='Pclass')
sex_class_ax.set_ylabel("Survival Rate")
sex_class_ax.set_title("Survival as function of Pclass and Sex")
plt.show()

In [None]:
# Survival rate by port of embarkation, split by passenger class.
port_class_ax = sns.barplot(data=train_data, x='Embarked', y='Survived', hue='Pclass')
port_class_ax.set_ylabel("Survival Rate")
port_class_ax.set_title("Survival as function of Embarked Port")
plt.show()

In [7]:
# Count the missing values in each column of the training set.
print("NaN values in the DataFrame:")
train_data.isnull().sum()

In [8]:
# Count the missing values in each column of the test set.
print("NaN values in the DataFrame:")
test_data.isnull().sum()

In [6]:
# Convert the categorical variables to numeric model inputs and fill in
# missing values.

from sklearn.preprocessing import LabelEncoder

# One encoder instance, re-fitted for each categorical column in turn.
labelencoder = LabelEncoder()

# For each column: fill missing entries with a fixed category, learn the
# label -> integer mapping on the training set, then apply that SAME mapping
# to the test set. Re-fitting on the test set could assign different integers
# to the same category if the two sets did not contain identical categories.
for column, fill_value in (('Embarked', 'S'), ('Sex', 'female')):
    train_data[column] = labelencoder.fit_transform(train_data[column].fillna(fill_value))
    test_data[column] = labelencoder.transform(test_data[column].fillna(fill_value))

# Mean substitution for missing test fares, using the training-set mean.
# Fare is a continuous feature and must stay on its original scale: label
# encoding it would replace each fare with a rank index in the test set only,
# making it incomparable with the raw training fares.
fare_mean = train_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(fare_mean)

In [9]:
# Drop the columns judged to carry little information about the target.
# The test set keeps PassengerId because the submission file needs it.
low_info_columns = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age']
train_data = train_data.drop(columns=['PassengerId'] + low_info_columns)
test_data = test_data.drop(columns=low_info_columns)

In [10]:
# Summary statistics for the training data after the preprocessing above.
train_data.describe()

In [12]:
# Select features and target by column name rather than by position, so the
# split does not silently depend on the column order left by earlier cells.
feature_columns = ['Pclass', 'Sex', 'Fare', 'Embarked']
X_train = train_data[feature_columns].values
Y_train = train_data['Survived'].values

In [13]:
# Sanity-check the array shapes: feature matrix first, then target vector.
print(X_train.shape, Y_train.shape, sep="\n")

In [14]:
# Pick the four model features by name. A positional slice only lines up here
# because PassengerId happens to occupy the slot that Survived occupies in the
# training frame.
X_test = test_data[['Pclass', 'Sex', 'Fare', 'Embarked']].values
print(X_test.shape)

In [15]:
# Standardise the features: learn the scaling parameters from the training
# matrix only, then apply the same transformation to both matrices.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Sweep k = 1..25 and score each value with 5-fold cross-validation.
# Scoring on the rows the model was fitted on is misleading for KNN: with
# k=1 every training point is its own nearest neighbour, so training accuracy
# is near-perfect and says nothing about how well that k generalises.
k_range = range(1, 26)
scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train,
        Y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    for k in k_range
]

In [17]:
# Accuracy as a function of k, with labelled axes so the figure stands alone.
plt.plot(k_range, scores)
plt.xlabel("Number of neighbors (k)")
plt.ylabel("Accuracy")
plt.title("KNN accuracy vs. k")
plt.show()

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_predict

# Final model: KNN with k=4 and Euclidean distance (Minkowski with p=2),
# fitted on every training row so it can be used to score the test file.
classifier = KNeighborsClassifier(n_neighbors = 4, metric = 'minkowski', p = 2)
classifier.fit(X_train, Y_train)

# Evaluate with out-of-fold predictions: each training row is predicted by a
# copy of the model that never saw it. Predicting the same rows the model was
# fitted on would give an optimistic confusion matrix and accuracy.
# cross_val_predict works on clones, so `classifier` stays fitted on all rows.
y_pred = cross_val_predict(classifier, X_train, Y_train, cv=5)

# Making the Confusion Matrix
cm = confusion_matrix(Y_train, y_pred)
ac = accuracy_score(Y_train, y_pred)

In [20]:
# Show the confusion matrix: rows are the true classes, columns the predicted ones.
print(cm)

In [21]:
# Show the accuracy score stored in `ac`.
print(ac)

# Titanic Test File scoring

In [22]:
# Score the test file with the fitted classifier and pair every prediction
# with its passenger id (PassengerId first, Survived second).
y_pred = classifier.predict(X_test)
prediction = test_data[['PassengerId']].assign(Survived=y_pred)
prediction.head()

In [23]:
# Write the submission file: comma-separated, with a header row and no index column.
prediction.to_csv('submission.csv', index=False)