In [199]:
# Python Version 3.10.6
import numpy as np # Numpy 1.24.2
import pandas as pd # Pandas 1.5.3
from collections import Counter # 
import matplotlib.pyplot as plot # Matplotlib 3.7.0

In [200]:
# Load the survey CSV. The file's own header line (row 0) is skipped and
# replaced by the explicit column names below.
# NOTE(review): the last name ends with a trailing space; it is kept exactly
# as-is so any lookup by that label keeps working.
column_names = [
    'Unhappy/Happy',
    'City Services Availability',
    'Housing Cost',
    'Quality of schools',
    'Community trust in local police',
    'Community Maintenance',
    'Availability of Community Room ',
]

df = pd.read_csv('HappinessData-1.csv', names=column_names, skiprows=[0])
df


Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
0,1,5,3,3.0,3.0,5,3
1,0,5,3,3.0,3.0,5,3
2,0,4,3,3.0,3.0,4,4
3,0,4,1,3.0,4.0,4,5
4,1,3,2,4.0,4.0,4,5
...,...,...,...,...,...,...,...
135,1,4,3,3.0,3.0,3,4
136,0,3,4,3.0,3.0,2,3
137,1,3,3,3.0,5.0,5,5
138,1,3,3,1.0,3.0,3,4


In [201]:
# Missing-value check: build a per-row mask that is True wherever any column
# holds NaN, then show only those rows.
has_nan = df.isnull().any(axis=1)
nan_row = df[has_nan]
nan_row

Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
5,1,3,2,4.0,,4,5
18,1,5,3,4.0,,4,5
27,1,4,3,,4.0,3,4
31,1,5,3,,5.0,4,5


In [202]:
# Per-column mean and median, used to choose a fill value for the NaN cells
column_means = df.mean(numeric_only=True)
column_medians = df.median(numeric_only=True)

print(column_means)
print("\n")
print(column_medians)

Unhappy/Happy                      0.535714
City Services Availability         4.321429
Housing Cost                       2.542857
Quality of schools                 3.253623
Community trust in local police    3.695652
Community Maintenance              3.600000
Availability of Community Room     4.221429
dtype: float64


Unhappy/Happy                      1.0
City Services Availability         5.0
Housing Cost                       3.0
Quality of schools                 3.0
Community trust in local police    4.0
Community Maintenance              4.0
Availability of Community Room     4.0
dtype: float64


In [203]:
# Fill each NaN with the median of its column. The column medians are whole
# numbers, matching the integer-valued scores already in the data, whereas the
# column means are fractional.
# numeric_only=True keeps this consistent with the median computed when the
# column statistics were printed.

df = df.fillna(df.median(numeric_only=True))

# Sanity check: after filling, no row should contain a NaN (expect an empty frame)
nan_row2 = df[df.isnull().any(axis=1)]
nan_row2

Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room


In [204]:
# Duplicate detection: duplicated() marks every row that repeats an earlier
# one (the first occurrence is not marked). List the marked rows before
# removing them.

is_repeat = df.duplicated()
duplicateRows = df[is_repeat]
duplicateRows

Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
5,1,3,2,4.0,4.0,4,5
9,1,5,3,4.0,3.0,4,5
23,1,5,1,3.0,3.0,4,4
24,1,5,1,3.0,3.0,4,4
26,1,5,2,4.0,3.0,4,5
36,1,5,2,4.0,4.0,5,5
39,1,4,1,3.0,4.0,4,4
45,1,4,3,3.0,4.0,3,4
64,0,4,1,3.0,4.0,4,3
75,1,5,5,5.0,5.0,5,5


In [205]:
# Remove repeated rows, modifying df in place (first occurrences are kept).
# Row labels are not renumbered, so the index has gaps afterwards.
df.drop_duplicates(inplace=True)
df

Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
0,1,5,3,3.0,3.0,5,3
1,0,5,3,3.0,3.0,5,3
2,0,4,3,3.0,3.0,4,4
3,0,4,1,3.0,4.0,4,5
4,1,3,2,4.0,4.0,4,5
...,...,...,...,...,...,...,...
134,1,5,1,2.0,5.0,2,4
135,1,4,3,3.0,3.0,3,4
136,0,3,4,3.0,3.0,2,3
137,1,3,3,3.0,5.0,5,5


In [206]:
# The specification wants the class label ('Unhappy/Happy') as the last column.
# Keep a handle on that column before it is removed from df.
class_label = df.loc[:, 'Unhappy/Happy']
class_label

0      1
1      0
2      0
3      0
4      1
      ..
134    1
135    1
136    0
137    1
138    1
Name: Unhappy/Happy, Length: 124, dtype: int64

In [207]:
# Remove the class label column from df in place (axis=1 selects columns),
# leaving only the feature columns.
df.drop('Unhappy/Happy', axis=1, inplace=True)
df

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
0,5,3,3.0,3.0,5,3
1,5,3,3.0,3.0,5,3
2,4,3,3.0,3.0,4,4
3,4,1,3.0,4.0,4,5
4,3,2,4.0,4.0,4,5
...,...,...,...,...,...,...
134,5,1,2.0,5.0,2,4
135,4,3,3.0,3.0,3,4
136,3,4,3.0,3.0,2,3
137,3,3,3.0,5.0,5,5


In [208]:
# Re-attach the saved class label; assigning to a column name that is not
# currently in df appends it after the existing columns.
df['Unhappy/Happy'] = class_label
df

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room,Unhappy/Happy
0,5,3,3.0,3.0,5,3,1
1,5,3,3.0,3.0,5,3,0
2,4,3,3.0,3.0,4,4,0
3,4,1,3.0,4.0,4,5,0
4,3,2,4.0,4.0,4,5,1
...,...,...,...,...,...,...,...
134,5,1,2.0,5.0,2,4,1
135,4,3,3.0,3.0,3,4,1
136,3,4,3.0,3.0,2,3,0
137,3,3,3.0,5.0,5,5,1


In [209]:
# Split the dataframe into a training set and a test set (80% / 20%).
# random_state pins the random draw so that re-running the notebook yields the
# same split, and therefore the same downstream results, every time.

df_train = df.sample(frac=0.8, random_state=42)
# Every row not drawn into the training sample goes to the test set
df_test = df.drop(df_train.index)

In [210]:
# Renumber the training rows 0..n-1 in place (drop=True discards the old
# labels), then display the result.
# Sorting by the old index is left disabled, so rows keep their sampled order:
# df_train.sort_index(ascending=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_train

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room,Unhappy/Happy
0,5,2,4.0,5.0,4,5,1
1,5,2,3.0,3.0,3,5,0
2,3,2,4.0,3.0,4,4,1
3,5,3,4.0,4.0,4,4,1
4,4,4,4.0,4.0,3,4,0
...,...,...,...,...,...,...,...
94,3,4,3.0,4.0,1,4,0
95,5,1,2.0,5.0,2,4,1
96,5,3,4.0,5.0,4,5,0
97,4,2,4.0,4.0,4,4,1


In [211]:
# Renumber the test rows 0..n-1 in place (drop=True discards the old labels),
# then display the result.
df_test.reset_index(drop=True, inplace=True)
df_test

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room,Unhappy/Happy
0,5,3,3.0,3.0,5,3,1
1,5,3,4.0,3.0,4,5,1
2,4,4,3.0,4.0,2,4,1
3,5,2,3.0,3.0,3,3,0
4,4,3,3.0,4.0,2,4,0
5,5,3,4.0,4.0,4,5,1
6,5,2,4.0,3.0,4,5,1
7,5,1,1.0,5.0,3,5,0
8,4,4,3.0,3.0,2,5,1
9,4,3,3.0,3.0,3,5,0


In [212]:
# Feature relevance: the goal is to keep relevant features and drop irrelevant ones.
# Compute the pairwise correlation of every column in the training set
# (Pearson, the default method, stated explicitly).
correlation_matrix = df_train.corr(method='pearson')
correlation_matrix

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room,Unhappy/Happy
City Services Availability,1.0,0.090621,0.263066,0.128817,0.395234,0.525909,0.351324
Housing Cost,0.090621,1.0,0.207764,0.141135,-0.019336,0.046668,-0.050241
Quality of schools,0.263066,0.207764,1.0,0.213925,0.270573,0.068438,0.193076
Community trust in local police,0.128817,0.141135,0.213925,1.0,0.311028,0.106929,0.175564
Community Maintenance,0.395234,-0.019336,0.270573,0.311028,1.0,0.373679,0.147842
Availability of Community Room,0.525909,0.046668,0.068438,0.106929,0.373679,1.0,0.231609
Unhappy/Happy,0.351324,-0.050241,0.193076,0.175564,0.147842,0.231609,1.0


In [411]:
class KNN:
    """k-nearest-neighbours helper built on Euclidean distance.

    Work in progress: `predict` currently returns the raw test-to-train
    distances. Selecting the k nearest neighbours and voting on a class
    label is not implemented yet.
    """

    def __init__(self, k):
        # Number of neighbours to consult (stored; not yet used by predict)
        self.k = k

    def fit(self, X_train, Y_train):
        """Store the training features and labels; no computation happens here."""
        self.x_train = X_train
        self.y_train = Y_train

    def euclidian_distance(self, x_test_point, x_training_point):
        """Return the Euclidean distance between two equal-length points.

        Each point may be any 1-D numeric sequence (pandas Series, list or
        NumPy array). Both are converted to arrays first so that elements are
        always paired by position, never by index label.
        """
        test_point = np.asarray(x_test_point, dtype=float)
        training_point = np.asarray(x_training_point, dtype=float)
        return np.sqrt(np.sum((test_point - training_point) ** 2))

    def predict(self, x_test):
        """Compute the distance from every test row to every training row.

        Parameters
        ----------
        x_test : 2-D table of test features (e.g. a DataFrame) with the same
            columns, in the same order, as the data passed to `fit`.

        Returns
        -------
        pandas.DataFrame
            One column (named 0) with len(x_test) * len(self.x_train) rows,
            ordered test-row-major: all training rows for test row 0, then
            all training rows for test row 1, and so on.
        """
        # Rows are taken by position, so the result does not depend on the
        # index labels being exactly 0..n-1.
        test_rows = np.asarray(x_test, dtype=float)
        # Use the data handed to fit(), not a notebook-level global variable
        train_rows = np.asarray(self.x_train, dtype=float)

        # Diagnostic output: number of test rows, then number of training rows
        print(len(x_test))
        print(len(self.x_train))

        distance = [
            self.euclidian_distance(test_row, train_row)
            for test_row in test_rows
            for train_row in train_rows
        ]

        # TODO: find the k nearest neighbours per test row and vote on a label
        return pd.DataFrame(data=distance)
        

In [412]:
# Training set: features are every column except the label; the label
# column on its own is the target.

y_train = df_train["Unhappy/Happy"]
x_train = df_train.drop(columns='Unhappy/Happy')

In [413]:
# Test set: same feature/target separation as for the training set

y_test = df_test["Unhappy/Happy"]
x_test = df_test.drop(columns='Unhappy/Happy')

In [414]:
# Build the classifier with k=5 and hand it the training data
nearest_neighbors = KNN(k=5)
nearest_neighbors.fit(x_train, y_train)

# Smoke test: distance between the first two training rows
first_row, second_row = x_train.iloc[0], x_train.iloc[1]
a = nearest_neighbors.euclidian_distance(first_row, second_row)

print(a)

2.449489742783178


In [415]:
# Run predict on the test features and display what it returns
b = nearest_neighbors.predict(x_test=x_test)
b

25
99


Unnamed: 0,0
0,3.316625
1,3.000000
2,2.828427
3,2.000000
4,3.000000
...,...
2470,3.162278
2471,3.741657
2472,4.358899
2473,3.605551
