In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error

bj_df = pd.read_csv('Beijing_labeled.csv')
sy_df = pd.read_csv('Shenyang_labeled.csv')
merge = bj_df.append(sy_df)

#shuffle the data
merge = merge.sample(frac=1).reset_index(drop=True)
#drop the null value if existing
merge = merge.dropna()
#check if the data set is balanced
print(merge['PM_HIGH'].sum()/merge.shape[0])
merge.head()

0.27495682210708117


Unnamed: 0,season,DEWP,HUMI,PRES,TEMP,Iws,precipitation,cbwd_NE,cbwd_NW,cbwd_SE,PM_HIGH
0,4,-9.0,43.0,1031.0,2.0,3.56,0.0,0,0,0,0.0
1,1,-17.0,6.0,1020.0,20.0,6.26,0.0,0,0,1,0.0
2,4,-17.0,22.0,1033.0,2.0,8.5,0.0,0,1,0,1.0
3,3,5.0,37.36,1016.0,20.0,22.0,0.0,0,0,0,0.0
4,3,8.0,43.0,1011.0,21.0,3.58,0.0,0,0,1,1.0


In [2]:
#Standardiza and normalize raw data
from sklearn.preprocessing import StandardScaler
features_normalized = ['season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation']
merge[features_normalized] = StandardScaler().fit_transform(merge[features_normalized])
merge.head()

Unnamed: 0,season,DEWP,HUMI,PRES,TEMP,Iws,precipitation,cbwd_NE,cbwd_NW,cbwd_SE,PM_HIGH
0,1.345351,-0.715885,0.01788,1.434259,-1.112477,-0.407308,-0.099523,0,0,0,0.0
1,-1.330255,-1.281945,-1.65041,0.343303,0.37744,-0.347334,-0.099523,0,0,1,0.0
2,1.345351,-1.281945,-0.928987,1.632615,-1.112477,-0.297578,-0.099523,0,1,0,1.0
3,0.453482,0.27472,-0.236422,-0.053409,0.37744,0.00229,-0.099523,0,0,0,0.0
4,0.453482,0.486992,0.01788,-0.549298,0.460213,-0.406864,-0.099523,0,0,1,1.0


In [3]:
#split the data set into training and validation data
from sklearn.model_selection import train_test_split
features_all = ['season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation', 'cbwd_NE','cbwd_NW','cbwd_SE']
x_train, x_test, y_train, y_test = train_test_split(merge[features_all],merge['PM_HIGH'],test_size=0.25, random_state=0)

In [4]:
from math import sqrt
class knn_classifier(object):
    def __init__(self):
        pass

        
    def get_distance(self, row1, row2):
        distance = 0
        for i in range(len(row1)):
            distance += (row1[i]-row2[i])**2
        return sqrt(distance) 
        
    def get_neighbors(self,x_train,y_train,x_test_row,k_max):
        distance_all = list()
        neighbors = list()
        for i in range(x_train.shape[0]):
            dist = self.get_distance(x_test_row, x_train.iloc[i])
            distance_all.append((dist, y_train.iloc[i]))
        distance_all.sort(key=lambda x:x[0])
        for i in range(k_max):
            neighbors.append(distance_all[i][1])
        return neighbors
    
    def prediction_single(self,x_train,y_train,x_test_row,k_num):
        neighbors = self.get_neighbors(x_train,y_train,x_test_row,k_num)
        prediction = max(set(neighbors),key=neighbors.count)
        return prediction
    
    def prediction_all(self,x_train,y_train,x_test,k_num):
        predictions = list()
        for i in range(x_test.shape[0]):
            pred = self.prediction_single(x_train, y_train, x_test.iloc[i],k_num)
            predictions.append((x_test.iloc[i].name,pred))
        return predictions
    
    def model_accracy(self,y_test,predictions):
        correct = 0
        for i in range(len(y_test)):
            if y_test.iloc[i] == predictions[i][1]:
                correct +=1
                score = correct/len(y_test)
        return score

In [5]:
#find k value with the highest accuracy 
score = list()
for i in range(1,17,2):
    model = knn_classifier()
    predictions = model.prediction_all(x_train, y_train, x_test, i)
    sc = model.model_accracy(y_test,predictions)
    score.append((i,sc))

In [7]:
#when k is 9, we get highest accuracy, thus we will set k value as 9
score.sort(key=lambda x:x[1])
score

[(1, 0.6947513812154696),
 (3, 0.7375690607734806),
 (5, 0.744475138121547),
 (13, 0.7596685082872928),
 (15, 0.761049723756906),
 (7, 0.7638121546961326),
 (11, 0.7665745856353591),
 (9, 0.7734806629834254)]

In [10]:
#evaluate the model using 'Shanghai_labeled.csv'
test_df = pd.read_csv('Shanghai_labeled.csv')
test_df = test_df.dropna()

test_df[features_normalized] = StandardScaler().fit_transform(test_df[features_normalized])
x_train, x_test, y_train, y_test = train_test_split(test_df[features_all],test_df['PM_HIGH'],test_size=0.25, random_state=0)
model = knn_classifier()
predictions = model.prediction_all(x_train, y_train, x_test, 9)
model.model_accracy(y_test,predictions)

0.9023668639053254

In [11]:
#evaluate the model using 'Guangzhou_labeled.csv'
test_df = pd.read_csv('Guangzhou_labeled.csv')
test_df = test_df.dropna()

test_df[features_normalized] = StandardScaler().fit_transform(test_df[features_normalized])
x_train, x_test, y_train, y_test = train_test_split(test_df[features_all],test_df['PM_HIGH'],test_size=0.25, random_state=0)
model = knn_classifier()
predictions = model.prediction_all(x_train, y_train, x_test, 9)
model.model_accracy(y_test,predictions)

0.9319526627218935

In [9]:
#Appendix
#Use KNeighborsClassifier from sklearn as ground truth for our model
from sklearn.neighbors import KNeighborsClassifier
knn_score = []
for i in range(1,17,2):
    knn = KNeighborsClassifier(n_neighbors=i) #class
    #fit the model
    knn.fit(x_train, y_train) #def fit
    #make prediction
    pred = knn.predict(x_test) #def predict
    #model score
    train_score = knn.score(x_train, y_train) #def score
    test_score = knn.score(x_test, y_test)
    #store the score in a list
    knn_score.append((i,train_score,test_score)) 

score_df = pd.DataFrame(knn_score,columns=['k','train score','test score'])
score_df

Unnamed: 0,k,train score,test score
0,1,1.0,0.694751
1,3,0.870567,0.737569
2,5,0.845233,0.744475
3,7,0.833717,0.763812
4,9,0.831875,0.773481
5,11,0.823584,0.766575
6,13,0.815292,0.759669
7,15,0.813911,0.76105
