### Import modules

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import tensorflow as tf

### Import data

In [36]:
lst = []
for file in os.listdir('data'):
    temp = pd.read_csv(os.path.join('data',file))
    lst.append(temp)
df = pd.concat(lst)
df.head()

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,ha noi,2021-03-01,23.0,18.0,20.6,23.0,18.0,20.6,19.7,95.1,...,20.3,8,,2021-03-01T06:17:05,2021-03-01T18:01:04,0.57,"Rain, Overcast",Cloudy skies throughout the day with rain.,rain,"48820099999,48823099999,48825099999,4883109999..."
1,ha noi,2021-03-02,21.0,18.7,19.9,21.0,18.7,19.9,12.3,62.6,...,19.0,9,,2021-03-02T06:16:17,2021-03-02T18:01:28,0.6,"Rain, Partially cloudy",Partly cloudy throughout the day with a chance...,rain,"48820099999,48823099999,48825099999,4883109999..."
2,ha noi,2021-03-03,19.0,16.8,18.1,19.0,16.8,18.1,14.3,79.1,...,12.1,5,,2021-03-03T06:15:29,2021-03-03T18:01:51,0.64,"Rain, Overcast",Cloudy skies throughout the day with afternoon...,rain,"48820099999,48823099999,48825099999,4883109999..."
3,ha noi,2021-03-04,18.8,16.0,17.1,18.8,16.0,17.1,15.4,89.9,...,11.7,5,,2021-03-04T06:14:40,2021-03-04T18:02:14,0.67,"Rain, Overcast",Cloudy skies throughout the day with rain clea...,rain,"48820099999,48823099999,48825099999,4883109999..."
4,ha noi,2021-03-05,20.7,17.0,19.2,20.7,17.0,19.2,18.2,93.8,...,20.1,8,,2021-03-05T06:13:51,2021-03-05T18:02:36,0.71,"Rain, Overcast",Cloudy skies throughout the day with early mor...,rain,"48820099999,48823099999,48825099999,4883109999..."


### Preprocess data

In [32]:
features = ["temp", "dew", "humidity", "precip", "precipcover", "windspeed", "cloudcover", "visibility", "uvindex"]
df = df[features]
df.head()

Unnamed: 0,temp,dew,humidity,precip,precipcover,windspeed,cloudcover,visibility,uvindex
0,20.6,19.7,95.1,0.707,8.33,13.0,94.5,3.3,8
1,19.9,12.3,62.6,0.987,12.5,24.1,86.6,9.8,9
2,18.1,14.3,79.1,0.002,4.17,14.8,92.0,7.0,5
3,17.1,15.4,89.9,4.0,8.33,11.2,91.3,5.8,5
4,19.2,18.2,93.8,0.686,4.17,11.2,95.7,2.5,8


In [33]:
def preprocess(df): 
    # MinMax normalize
    for feature in features:
        df[feature] = (df[feature] - df[feature].min()) / (df[feature].max() - df[feature].min())
    return df

In [34]:
normalized_df = preprocess(df)
normalized_df.head()

Unnamed: 0,temp,dew,humidity,precip,precipcover,windspeed,cloudcover,visibility,uvindex
0,0.441065,0.712329,0.943953,0.004655,0.090869,0.188552,0.945,0.058394,0.8
1,0.414449,0.458904,0.464602,0.006498,0.136359,0.56229,0.866,0.532847,0.9
2,0.346008,0.527397,0.707965,1.3e-05,0.045489,0.249158,0.92,0.328467,0.5
3,0.307985,0.565068,0.867257,0.026336,0.090869,0.127946,0.913,0.240876,0.5
4,0.387833,0.660959,0.924779,0.004517,0.045489,0.127946,0.957,0.0,0.8


If conditions are the desired outputs, run the following cell

In [40]:
conditions_set = set()
for s in df["conditions"]:
    conditions_set.add(s)
conditions_set = sorted(conditions_set)

labels_list = np.arange(len(conditions_set))

def set_label(x):
    if x == 'Clear': return 1
    elif x == 'Overcast': return 2
    elif x == 'Rain': return 3
    elif x == 'Rain, Overcast': return 4
    else: return 5

func = np.vectorize(set_label)
y = func(df["conditions"])

print(df['conditions'].head())
print(y[:5])

0            Rain, Overcast
1    Rain, Partially cloudy
2            Rain, Overcast
3            Rain, Overcast
4            Rain, Overcast
Name: conditions, dtype: object
[4 5 4 4 4]


In [42]:
X = normalized_df.to_numpy()

Spit train, test data:

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Train data on kNN model

In [51]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [60]:
for i in range (2, 10):
    knn = KNeighborsClassifier(n_neighbors=i, metric='euclidean')
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    print("With k = {}, accuracy score: {}".format(i, accuracy_score(y_test, y_pred)))

With k = 2, accuracy score: 0.8590604026845637
With k = 3, accuracy score: 0.8791946308724832
With k = 4, accuracy score: 0.8926174496644296
With k = 5, accuracy score: 0.8993288590604027
With k = 6, accuracy score: 0.8926174496644296
With k = 7, accuracy score: 0.8926174496644296
With k = 8, accuracy score: 0.8859060402684564
With k = 9, accuracy score: 0.8657718120805369
