In [2]:
import numpy as np
from scipy import stats
import pandas as pd

class KNearestNeighbours:
    def __init__(self,k,regression=True):
        self.k = k
        self.regression = regression

    def fit(self,X_train,Y_train):
        self.X_train = X_train
        self.Y_train = Y_train
        
    def predict(self,X_test):
        y_pred = []
        for i in range(X_test.shape[0]):
            distances = []
            for j in range(self.X_train.shape[0]):
                d =(np.sum(abs(X_test.iloc[i,:] - self.X_train.iloc[j,:])))
                distances.append((d, self.Y_train[j])) 
            distances = sorted(distances)
            
            neighbors = []
            for item in range(self.k):
                neighbors.append(distances[item][1])
            if self.regression:
                y_pred.append(np.mean(neighbors))
            else:
                y_pred.append(stats.mode(neighbors)[0][0])
        return y_pred

In [3]:
df = pd.read_csv('D:/210968186/tested.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
# Dropping identification columns
df = df.drop(['PassengerId','Name','Cabin','Ticket'],axis=1)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0000,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...,...
413,0,3,male,,0,0,8.0500,S
414,1,1,female,39.0,0,0,108.9000,C
415,0,3,male,38.5,0,0,7.2500,S
416,0,3,male,,0,0,8.0500,S


In [5]:
# Dropping empty rows
df = df.dropna()

In [6]:
# Categorical to numeric
df = df.replace(to_replace={'male':1,'female':-1,'Q':3,'S':2,'C':1})
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,34.5,0,0,7.8292,3
1,1,3,-1,47.0,1,0,7.0000,2
2,0,2,1,62.0,0,0,9.6875,3
3,0,3,1,27.0,0,0,8.6625,2
4,1,3,-1,22.0,1,1,12.2875,2
...,...,...,...,...,...,...,...,...
409,1,3,-1,3.0,1,1,13.7750,2
411,1,1,-1,37.0,1,0,90.0000,3
412,1,3,-1,28.0,0,0,7.7750,2
414,1,1,-1,39.0,0,0,108.9000,1


In [7]:
# Normalizing
df = df/df.max()
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,1.000000,1.0,0.453947,0.000,0.000000,0.015282,1.000000
1,1.0,1.000000,-1.0,0.618421,0.125,0.000000,0.013663,0.666667
2,0.0,0.666667,1.0,0.815789,0.000,0.000000,0.018909,1.000000
3,0.0,1.000000,1.0,0.355263,0.000,0.000000,0.016908,0.666667
4,1.0,1.000000,-1.0,0.289474,0.125,0.166667,0.023984,0.666667
...,...,...,...,...,...,...,...,...
409,1.0,1.000000,-1.0,0.039474,0.125,0.166667,0.026887,0.666667
411,1.0,0.333333,-1.0,0.486842,0.125,0.000000,0.175668,1.000000
412,1.0,1.000000,-1.0,0.368421,0.000,0.000000,0.015176,0.666667
414,1.0,0.333333,-1.0,0.513158,0.000,0.000000,0.212559,0.333333


In [8]:
print("Mean:",df['Fare'].mean(),"Std:",df['Fare'].std())
df['Fare'].loc[np.logical_or(df['Fare'] < df['Fare'].mean() - 2*df['Fare'].std(),df['Fare'] > df['Fare'].mean() + 2*df['Fare'].std())] = df['Fare'].median()
print("Mean:",df['Fare'].mean(),"Std:",df['Fare'].std())

Mean: 0.07999170711171297 Std: 0.11951018646906962
Mean: 0.053115704285424115 Std: 0.055780205363981976


In [9]:
print("Mean:",df['Age'].mean(),"Std:",df['Age'].std())
df['Age'].loc[np.logical_or(df['Age'] < df['Age'].mean() - 2*df['Age'].std(),df['Age'] > df['Age'].mean() + 2*df['Age'].std())] = df['Age'].median()
print("Mean:",df['Age'].mean(),"Std:",df['Age'].std())

Mean: 0.39712195897598945 Std: 0.18558648151054752
Mean: 0.3853553824137381 Std: 0.15059417096543018


In [11]:
# Splitting test train
from sklearn.model_selection import train_test_split

Y = df['Survived']
X = df.drop(['Survived'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [12]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn import metrics
r2_max = 0
best_model = KNearestNeighbours(0)
for k in range(100,-1,-5):
    model = KNearestNeighbours(k,False)
    model.fit(X_train,y_train)
    Y_pred = model.predict(X_test)
    r2 = metrics.accuracy_score(y_test,Y_pred)*100
    if(r2>r2_max):
        r2_max = r2
        best_model = model
    print('k:',k)
#    print('R2 score:',r2)
#     print('RMSE score',np.sqrt(mean_squared_error(y_test,Y_pred)))
    print("accuracy score:",metrics.accuracy_score(y_test,Y_pred)*100)
    print('\n----')
    print('\n')

  y_pred.append(stats.mode(neighbors)[0][0])


k: 100
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 95
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 90
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 85
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 80
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 75
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 70
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 65
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 60
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 55
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 50
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 45
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 40
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 35
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 30
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 25
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 20
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 15
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 10
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


k: 5
accuracy score: 100.0

----




  y_pred.append(stats.mode(neighbors)[0][0])


IndexError: index 0 is out of bounds for axis 0 with size 0

In [12]:
print('best R2 score:',r2_max)
print('best K:',best_model.k)

[0.8346561363565081,
 0.8360508342728119,
 0.8282682527070588,
 0.8311494188400576,
 0.8289221039823463,
 0.8258444602115848,
 0.8336571307117878,
 0.8365798970097815,
 0.8328142729620525,
 0.8320818112995246,
 0.8290086582187705,
 0.8283029071387019,
 0.8245972323594467,
 0.8229254000783793,
 0.8300186885869046,
 0.8285291049953051,
 0.8253318292545165,
 0.8284569234004987,
 0.8239668288540914,
 0.8266882929192502,
 0.8272986473802506,
 0.8220121623883494,
 0.8331325051114944,
 0.8292802215627448,
 0.8261843065912345,
 0.8250333942361457,
 0.8259975108291601,
 0.8338730006633853,
 0.82622803904421,
 0.8290086582187705,
 0.8284934606950863,
 0.824838353675208,
 0.8280572948065923,
 0.840657335908206,
 0.8347741986860021,
 0.8302712811792862,
 0.8265937155763678,
 0.8239039419852157,
 0.8249300077032609,
 0.8285557326827476,
 0.8236845880774223,
 0.8247953555057427,
 0.8258546486083517,
 0.8227295090098244,
 0.8305798679724471,
 0.8280956251271122,
 0.8267344025886881,
 0.82708479625595

0.8968253968253969

0.8333333333333334