# Project - $k$-Nearest-Neighbors Classifier
- Create a $k$-Nearest-Neighbors Classifier supporting 3 dimensions
- Investigate whether it performs better

### Step 1: Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

### Step 2: Read data
- Use pandas [read_csv](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) method to read **files/weather.csv**
- HINT: Use **parse_dates=True** and **index_col=0**

In [2]:
data = pd.read_csv('files/weather.csv', parse_dates=True, index_col=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3337 entries, 2008-02-01 to 2017-06-25
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        3334 non-null   float64
 1   MaxTemp        3335 non-null   float64
 2   Rainfall       3331 non-null   float64
 3   Evaporation    3286 non-null   float64
 4   Sunshine       3321 non-null   float64
 5   WindGustDir    2301 non-null   object 
 6   WindGustSpeed  2301 non-null   float64
 7   WindDir9am     3281 non-null   object 
 8   WindDir3pm     3304 non-null   object 
 9   WindSpeed9am   3311 non-null   float64
 10  WindSpeed3pm   3312 non-null   float64
 11  Humidity9am    3323 non-null   float64
 12  Humidity3pm    3324 non-null   float64
 13  Pressure9am    3317 non-null   float64
 14  Pressure3pm    3318 non-null   float64
 15  Cloud9am       2771 non-null   float64
 16  Cloud3pm       2776 non-null   float64
 17  Temp9am        3333 non-null   flo

### Step 3: Investigate data types
- Use [dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dtypes.html)
- The goal is to identify all columns with datatype **float64** for next step

In [3]:
# info에서 확인했으므로 pass

### Step 4: Choose 3 columns to create datasets
- Use **Humidity3pm** and **Pressure3pm** together with another column to predict **RainTomorrow**
- Make a list of three column names **'Humidity3pm', 'Pressure3pm', INSERT YOUR CHOICE** (should be one with dtype *float64*, e.g., **Cloud3pm**), and **'RainTomorrow'**
- Create the dataset consisting of these 4 columns

In [4]:
columns = ['Humidity3pm', 'Pressure3pm', 'Evaporation', 'RainTomorrow']
dataset = data[columns]

In [5]:
dataset

Unnamed: 0_level_0,Humidity3pm,Pressure3pm,Evaporation,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,84.0,1017.4,6.2,Yes
2008-02-02,73.0,1016.4,3.4,Yes
2008-02-03,86.0,1015.6,2.4,Yes
2008-02-04,90.0,1011.8,2.2,Yes
2008-02-05,74.0,1004.8,,Yes
...,...,...,...,...
2017-06-21,52.0,1025.3,2.0,No
2017-06-22,53.0,1024.6,2.0,No
2017-06-23,56.0,1015.0,2.4,No
2017-06-24,35.0,1015.1,1.4,No


### Step 5: Deal with remaining missing data
- A simple choice is to simply remove rows with missing data
- Use [dropna()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html)

In [6]:
Dataset = dataset.copy()
Dataset.dropna(axis=0, inplace=True)

In [7]:
Dataset

Unnamed: 0_level_0,Humidity3pm,Pressure3pm,Evaporation,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,84.0,1017.4,6.2,Yes
2008-02-02,73.0,1016.4,3.4,Yes
2008-02-03,86.0,1015.6,2.4,Yes
2008-02-04,90.0,1011.8,2.2,Yes
2008-02-06,62.0,998.6,2.6,Yes
...,...,...,...,...
2017-06-21,52.0,1025.3,2.0,No
2017-06-22,53.0,1024.6,2.0,No
2017-06-23,56.0,1015.0,2.4,No
2017-06-24,35.0,1015.1,1.4,No


### Step 6: Create training and test datasets
- Define dataset **X** to be the data consisting of the three columns.
- Define dataset **y** to be datset cosisting of **'RainTomorrow'**.
    - HINT: Use list comprehension to transform **'No'** and **'Yes'** to 0 and 1, repectively (like in the Lesson)
- Divide into **X_train, X_test, y_train, y_test** with **train_test_split**
    - HINT: See how it is done in Lesson
    - You can use **random_state=42** (or any other number) if you want to reproduce results.

In [8]:
X = Dataset[columns[:3]]
y = np.array([1 if value in Dataset[columns[3]] == 'Yes' else 0 for value in Dataset[columns[3]]])

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)

### Step 7: Train and test the model
- Create classifier with **KNeighborsClassifier**
    - You can play around with n_neighbors (default =5)
- Fit the model with training data **(X_train, y_train**)
- Predict data from **X_test** (use predict) and assign to **y_pred**.
- Evalute score by using **metrics.accuracy_score(y_test, y_pred)**.

In [10]:
clf = KNeighborsClassifier(n_neighbors=6)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_valid)
metrics.accuracy_score(y_true=y_valid, y_pred=y_pred, normalize=True)

1.0

In [11]:
len(y_valid)

815

### Step 8 (Optional): Try with different columns
- You can redo with diffrent choise of columns (starting from step 4)

In [12]:
data

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-02-01,19.5,22.4,15.6,6.2,0.0,,,S,SSW,17.0,...,84.0,1017.6,1017.4,8.0,8.0,20.7,20.9,Yes,6.0,Yes
2008-02-02,19.5,25.6,6.0,3.4,2.7,,,W,E,9.0,...,73.0,1017.9,1016.4,7.0,7.0,22.4,24.8,Yes,6.6,Yes
2008-02-03,21.6,24.5,6.6,2.4,0.1,,,ESE,ESE,17.0,...,86.0,1016.7,1015.6,7.0,8.0,23.5,23.0,Yes,18.8,Yes
2008-02-04,20.2,22.8,18.8,2.2,0.0,,,NNE,E,22.0,...,90.0,1014.2,1011.8,8.0,8.0,21.4,20.9,Yes,77.4,Yes
2008-02-05,19.7,25.7,77.4,,0.0,,,NNE,W,11.0,...,74.0,1008.3,1004.8,8.0,8.0,22.5,25.5,Yes,1.6,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-06-21,8.6,19.6,0.0,2.0,7.8,SSE,37.0,W,SSE,22.0,...,52.0,1025.9,1025.3,2.0,2.0,10.5,17.9,No,0.0,No
2017-06-22,9.3,19.2,0.0,2.0,9.2,W,30.0,W,ESE,20.0,...,53.0,1028.5,1024.6,2.0,2.0,11.0,18.7,No,0.0,No
2017-06-23,9.4,17.7,0.0,2.4,2.7,W,24.0,WNW,N,15.0,...,56.0,1020.8,1015.0,6.0,6.0,10.2,17.3,No,0.0,No
2017-06-24,10.1,19.3,0.0,1.4,9.3,W,43.0,W,W,17.0,...,35.0,1017.3,1015.1,5.0,2.0,12.4,19.0,No,0.0,No


In [13]:
# step4
columns = ['Evaporation', 'Sunshine', 'Humidity3pm', 'RainTomorrow']
dataset = data[columns]
dataset

Unnamed: 0_level_0,Evaporation,Sunshine,Humidity3pm,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,6.2,0.0,84.0,Yes
2008-02-02,3.4,2.7,73.0,Yes
2008-02-03,2.4,0.1,86.0,Yes
2008-02-04,2.2,0.0,90.0,Yes
2008-02-05,,0.0,74.0,Yes
...,...,...,...,...
2017-06-21,2.0,7.8,52.0,No
2017-06-22,2.0,9.2,53.0,No
2017-06-23,2.4,2.7,56.0,No
2017-06-24,1.4,9.3,35.0,No


In [14]:
# step5
Dataset = dataset.copy()
Dataset.dropna(axis=0, inplace=True)
Dataset

Unnamed: 0_level_0,Evaporation,Sunshine,Humidity3pm,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,6.2,0.0,84.0,Yes
2008-02-02,3.4,2.7,73.0,Yes
2008-02-03,2.4,0.1,86.0,Yes
2008-02-04,2.2,0.0,90.0,Yes
2008-02-06,2.6,8.6,62.0,Yes
...,...,...,...,...
2017-06-21,2.0,7.8,52.0,No
2017-06-22,2.0,9.2,53.0,No
2017-06-23,2.4,2.7,56.0,No
2017-06-24,1.4,9.3,35.0,No


In [15]:
X = Dataset[columns[:3]]
X

Unnamed: 0_level_0,Evaporation,Sunshine,Humidity3pm
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-02-01,6.2,0.0,84.0
2008-02-02,3.4,2.7,73.0
2008-02-03,2.4,0.1,86.0
2008-02-04,2.2,0.0,90.0
2008-02-06,2.6,8.6,62.0
...,...,...,...
2017-06-21,2.0,7.8,52.0
2017-06-22,2.0,9.2,53.0
2017-06-23,2.4,2.7,56.0
2017-06-24,1.4,9.3,35.0


In [16]:
y = Dataset[columns[3]]
y = np.array([1 if value == 'Yes' else 0 for value in y])
y

array([1, 1, 1, ..., 0, 0, 0])

In [17]:
# step6
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)

In [18]:
# step7
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_valid)
metrics.accuracy_score(y_true=y_valid, y_pred=y_pred, normalize=True)

0.8171779141104294