In [47]:
# Python Version 3.10.6
import numpy as np # Numpy 1.24.2
import pandas as pd # Pandas 1.5.3
from collections import Counter # 
import matplotlib.pyplot as plot # Matplotlib 3.7.0

In [48]:
# Import and reorder CSV file
# Row 0 was skipped because row 0 was the row label

df = pd.read_csv('HappinessData-1.csv', skiprows=[0], names=['Unhappy/Happy', 'City Services Availability', 'Housing Cost', 
                                                            'Quality of schools', 'Community trust in local police', 
                                                            'Community Maintenance', 'Availability of Community Room '])
df


Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
0,1,5,3,3.0,3.0,5,3
1,0,5,3,3.0,3.0,5,3
2,0,4,3,3.0,3.0,4,4
3,0,4,1,3.0,4.0,4,5
4,1,3,2,4.0,4.0,4,5
...,...,...,...,...,...,...,...
135,1,4,3,3.0,3.0,3,4
136,0,3,4,3.0,3.0,2,3
137,1,3,3,3.0,5.0,5,5
138,1,3,3,1.0,3.0,3,4


In [49]:
# Process data for any NaN
# First locate all NaN in dataset
nan_row = df[df.isnull().any(axis=1)]
nan_row

Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
5,1,3,2,4.0,,4,5
18,1,5,3,4.0,,4,5
27,1,4,3,,4.0,3,4
31,1,5,3,,5.0,4,5


In [50]:
# Calculate Mean and Median of each column
# Analyze data and figure out best value to fill NaN
print(df.mean(numeric_only=True))
print("\n")
print(df.median(numeric_only=True))

Unhappy/Happy                      0.535714
City Services Availability         4.321429
Housing Cost                       2.542857
Quality of schools                 3.253623
Community trust in local police    3.695652
Community Maintenance              3.600000
Availability of Community Room     4.221429
dtype: float64


Unhappy/Happy                      1.0
City Services Availability         5.0
Housing Cost                       3.0
Quality of schools                 3.0
Community trust in local police    4.0
Community Maintenance              4.0
Availability of Community Room     4.0
dtype: float64


In [51]:
# Looking at the data, Mean, Median and Mode are all close enough to each other
# Replace NaN with Median, because unlike Mean (Median is a whole number and is based on frequency)

df = df.fillna(df.median())

# Check for nan rows 1 more time
nan_row2 = df[df.isnull().any(axis=1)]
nan_row2

Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room


In [52]:
# Remove Duplicate rows in the data
# First find all of the duplicated rows

duplicateRows = df[df.duplicated()]
duplicateRows

Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
5,1,3,2,4.0,4.0,4,5
9,1,5,3,4.0,3.0,4,5
23,1,5,1,3.0,3.0,4,4
24,1,5,1,3.0,3.0,4,4
26,1,5,2,4.0,3.0,4,5
36,1,5,2,4.0,4.0,5,5
39,1,4,1,3.0,4.0,4,4
45,1,4,3,3.0,4.0,3,4
64,0,4,1,3.0,4.0,4,3
75,1,5,5,5.0,5.0,5,5


In [55]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,Unhappy/Happy,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
0,1,5,3,3.0,3.0,5,3
1,0,5,3,3.0,3.0,5,3
2,0,4,3,3.0,3.0,4,4
3,0,4,1,3.0,4.0,4,5
4,1,3,2,4.0,4.0,4,5
...,...,...,...,...,...,...,...
134,1,5,1,2.0,5.0,2,4
135,1,4,3,3.0,3.0,3,4
136,0,3,4,3.0,3.0,2,3
137,1,3,3,3.0,5.0,5,5


In [56]:
# Unhappy/Happy is the class label: as per specifications, move the class label to the last column
# Save reference to column
class_label = df['Unhappy/Happy']
class_label

0      1
1      0
2      0
3      0
4      1
      ..
134    1
135    1
136    0
137    1
138    1
Name: Unhappy/Happy, Length: 124, dtype: int64

In [57]:
# drop class label column
df.drop('Unhappy/Happy', axis=1, inplace=True)
df

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room
0,5,3,3.0,3.0,5,3
1,5,3,3.0,3.0,5,3
2,4,3,3.0,3.0,4,4
3,4,1,3.0,4.0,4,5
4,3,2,4.0,4.0,4,5
...,...,...,...,...,...,...
134,5,1,2.0,5.0,2,4
135,4,3,3.0,3.0,3,4
136,3,4,3.0,3.0,2,3
137,3,3,3.0,5.0,5,5


In [58]:
# add new column to end of table
df['Unhappy/Happy'] = class_label
df

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room,Unhappy/Happy
0,5,3,3.0,3.0,5,3,1
1,5,3,3.0,3.0,5,3,0
2,4,3,3.0,3.0,4,4,0
3,4,1,3.0,4.0,4,5,0
4,3,2,4.0,4.0,4,5,1
...,...,...,...,...,...,...,...
134,5,1,2.0,5.0,2,4,1
135,4,3,3.0,3.0,3,4,1
136,3,4,3.0,3.0,2,3,0
137,3,3,3.0,5.0,5,5,1


In [59]:
# Split the dataframe to training set and dataset.
# Using an 80% - 20% split

df_train = df.sample(frac=0.8)
df_test = df.drop(df_train.index)

In [60]:
# reindex set, then print
# df_train.sort_index(ascending=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_train

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room,Unhappy/Happy
0,5,3,3.0,5.0,4,5,1
1,5,3,3.0,3.0,5,3,0
2,4,3,3.0,4.0,3,4,1
3,5,3,2.0,5.0,5,5,0
4,5,3,3.0,3.0,3,5,1
...,...,...,...,...,...,...,...
94,5,1,1.0,5.0,3,5,0
95,5,1,5.0,5.0,5,5,1
96,4,1,3.0,4.0,4,3,0
97,4,2,3.0,3.0,4,4,0


In [61]:
# reindex the testset
df_test.reset_index(drop=True, inplace=True)
df_test

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room,Unhappy/Happy
0,4,3,3.0,3.0,4,4,0
1,4,1,3.0,4.0,4,5,0
2,3,2,4.0,4.0,4,5,1
3,4,4,3.0,4.0,2,4,1
4,5,2,4.0,3.0,4,5,1
5,5,2,4.0,4.0,5,5,1
6,5,2,3.0,3.0,2,5,0
7,3,2,3.0,3.0,4,4,0
8,4,2,4.0,4.0,4,4,1
9,5,3,2.0,4.0,4,4,1


In [62]:
# Proccess the data such that relevent features are kept and irrelevent features are dropped
# Create a correlation matrix for the Training set
correlation_matrix = df_train.corr()
correlation_matrix

Unnamed: 0,City Services Availability,Housing Cost,Quality of schools,Community trust in local police,Community Maintenance,Availability of Community Room,Unhappy/Happy
City Services Availability,1.0,0.079741,0.281873,0.126753,0.516276,0.422578,0.308292
Housing Cost,0.079741,1.0,0.223599,0.103405,0.040754,-0.005283,0.001606
Quality of schools,0.281873,0.223599,1.0,0.237201,0.307723,0.152546,0.165746
Community trust in local police,0.126753,0.103405,0.237201,1.0,0.27623,0.194743,0.113974
Community Maintenance,0.516276,0.040754,0.307723,0.27623,1.0,0.298581,0.181962
Availability of Community Room,0.422578,-0.005283,0.152546,0.194743,0.298581,1.0,0.169342
Unhappy/Happy,0.308292,0.001606,0.165746,0.113974,0.181962,0.169342,1.0


In [63]:
# Calculates the distance between two points
def euclidian_distance(x1, x2):
    distance = np.sqrt(np.sum((x1-x2)**2))
    return distance

In [64]:
class KNN:
    def __init__(self, k):
        self.k = k
        
    def fit(self, X_train, Y_train):
        self.x = X_train
        self.y = Y_train
        
    def predict(self, X_Test):
        predictions = [self._predict(x) for x in X]
        return predictions
    
    def _predict(self, x):
        # Compute Distance
        distances = [euclidian_distance(x, x_train) for x_train in self.X_train]
        
        # Closest K
        np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        
        # majority vote
        most_common = Counter(k_nearest_labels).most_common()
        return most_common

In [65]:
nearest_neighbor = KNN(k=5)
nearest_neighbor.fit()

TypeError: KNN.fit() missing 2 required positional arguments: 'X_train' and 'Y_train'