# Project - $k$-Nearest-Neighbors Classifier
- Create a $k$-Nearest-Neighbors Classifier supporting 3 dimensions
- Investigate whether it performs better

### Step 1: Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

### Step 2: Read data
- Use pandas [read_csv](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) method to read **files/weather.csv**
- HINT: Use **parse_dates=True** and **index_col=0**

In [2]:
data = pd.read_csv('files/weather.csv', parse_dates=True, index_col=0)
data.head()

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-02-01,19.5,22.4,15.6,6.2,0.0,,,S,SSW,17.0,...,84.0,1017.6,1017.4,8.0,8.0,20.7,20.9,Yes,6.0,Yes
2008-02-02,19.5,25.6,6.0,3.4,2.7,,,W,E,9.0,...,73.0,1017.9,1016.4,7.0,7.0,22.4,24.8,Yes,6.6,Yes
2008-02-03,21.6,24.5,6.6,2.4,0.1,,,ESE,ESE,17.0,...,86.0,1016.7,1015.6,7.0,8.0,23.5,23.0,Yes,18.8,Yes
2008-02-04,20.2,22.8,18.8,2.2,0.0,,,NNE,E,22.0,...,90.0,1014.2,1011.8,8.0,8.0,21.4,20.9,Yes,77.4,Yes
2008-02-05,19.7,25.7,77.4,,0.0,,,NNE,W,11.0,...,74.0,1008.3,1004.8,8.0,8.0,22.5,25.5,Yes,1.6,Yes


#### **parse_date**
- default: `False`
- if True -> try parsing the index

In [3]:
data = pd.read_csv('files/weather.csv', parse_dates=True)
data.info() # 'Date' column이 datetime type으로 바뀌지 않음 (object)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3337 entries, 0 to 3336
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           3337 non-null   object 
 1   MinTemp        3334 non-null   float64
 2   MaxTemp        3335 non-null   float64
 3   Rainfall       3331 non-null   float64
 4   Evaporation    3286 non-null   float64
 5   Sunshine       3321 non-null   float64
 6   WindGustDir    2301 non-null   object 
 7   WindGustSpeed  2301 non-null   float64
 8   WindDir9am     3281 non-null   object 
 9   WindDir3pm     3304 non-null   object 
 10  WindSpeed9am   3311 non-null   float64
 11  WindSpeed3pm   3312 non-null   float64
 12  Humidity9am    3323 non-null   float64
 13  Humidity3pm    3324 non-null   float64
 14  Pressure9am    3317 non-null   float64
 15  Pressure3pm    3318 non-null   float64
 16  Cloud9am       2771 non-null   float64
 17  Cloud3pm       2776 non-null   float64
 18  Temp9am 

#### **index_col**
- default : `None`
- Column to use as the row labels of the DataFrame
- `index_col=0` : `data.iloc[:, 0]`을 row DataFrame의 row labels로 지정

In [4]:
data.iloc[:, 0] # data['Date']

0       2008-02-01
1       2008-02-02
2       2008-02-03
3       2008-02-04
4       2008-02-05
           ...    
3332    2017-06-21
3333    2017-06-22
3334    2017-06-23
3335    2017-06-24
3336    2017-06-25
Name: Date, Length: 3337, dtype: object

In [5]:
data = pd.read_csv('files/weather.csv', parse_dates=True, index_col=0)
data.info() # DatetimeIndex, 22 columns

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3337 entries, 2008-02-01 to 2017-06-25
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        3334 non-null   float64
 1   MaxTemp        3335 non-null   float64
 2   Rainfall       3331 non-null   float64
 3   Evaporation    3286 non-null   float64
 4   Sunshine       3321 non-null   float64
 5   WindGustDir    2301 non-null   object 
 6   WindGustSpeed  2301 non-null   float64
 7   WindDir9am     3281 non-null   object 
 8   WindDir3pm     3304 non-null   object 
 9   WindSpeed9am   3311 non-null   float64
 10  WindSpeed3pm   3312 non-null   float64
 11  Humidity9am    3323 non-null   float64
 12  Humidity3pm    3324 non-null   float64
 13  Pressure9am    3317 non-null   float64
 14  Pressure3pm    3318 non-null   float64
 15  Cloud9am       2771 non-null   float64
 16  Cloud3pm       2776 non-null   float64
 17  Temp9am        3333 non-null   flo

In [6]:
data.index

DatetimeIndex(['2008-02-01', '2008-02-02', '2008-02-03', '2008-02-04',
               '2008-02-05', '2008-02-06', '2008-02-07', '2008-02-08',
               '2008-02-09', '2008-02-10',
               ...
               '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19',
               '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23',
               '2017-06-24', '2017-06-25'],
              dtype='datetime64[ns]', name='Date', length=3337, freq=None)

### Step 3: Investigate data types
- Use [dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dtypes.html)
- The goal is to identify all columns with datatype **float64** for next step

In [7]:
data.dtypes

MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RISK_MM          float64
RainTomorrow      object
dtype: object

#### float64 타입이 아닌 column 빼기

In [8]:
index = data.dtypes == 'object'
index

MinTemp          False
MaxTemp          False
Rainfall         False
Evaporation      False
Sunshine         False
WindGustDir       True
WindGustSpeed    False
WindDir9am        True
WindDir3pm        True
WindSpeed9am     False
WindSpeed3pm     False
Humidity9am      False
Humidity3pm      False
Pressure9am      False
Pressure3pm      False
Cloud9am         False
Cloud3pm         False
Temp9am          False
Temp3pm          False
RainToday         True
RISK_MM          False
RainTomorrow      True
dtype: bool

In [9]:
data.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')

In [10]:
data.columns[index] # boolean indexing

Index(['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow'], dtype='object')

In [11]:
data.drop(data.columns[index], axis=1, inplace=True)
data.dtypes # data의 type은 object? 

MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustSpeed    float64
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RISK_MM          float64
dtype: object

### Step 4: Choose 3 columns to create datasets
- Use **Humidity3pm** and **Pressure3pm** together with another column to predict **RainTomorrow**
- Make a list of three column names **'Humidity3pm', 'Pressure3pm', INSERT YOUR CHOICE** (should be one with dtype *float64*, e.g., **Cloud3pm**), and **'RainTomorrow'**
- Create the dataset consisting of these 4 columns

In [14]:
header = ['Humidity3pm', 'Pressure3pm', 'Cloud3pm', 'RainTomorrow']
dataset = data[header]

KeyError: "['RainTomorrow'] not in index"

#### KeyError: "['RainTomorrow'] not in index"
- 원인1 : `data.drop(object_columns, axis=1, inplace=True)`
- 원인2 : `data.dtypes == 'object'`
- 해결 : `object_columns` 수정

In [15]:
data = pd.read_csv('files/weather.csv', parse_dates=True, index_col=0)
data.dtypes

MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RISK_MM          float64
RainTomorrow      object
dtype: object

#### object_columns 수정
- `data['RainTomorrow']`가 drop되면 안된다.

In [16]:
object_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'] # drop시킬 column들의 names
data.drop(object_columns, axis=1, inplace=True)
data[header] # dataset consisting of these 4 columns

Unnamed: 0_level_0,Humidity3pm,Pressure3pm,Cloud3pm,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,84.0,1017.4,8.0,Yes
2008-02-02,73.0,1016.4,7.0,Yes
2008-02-03,86.0,1015.6,8.0,Yes
2008-02-04,90.0,1011.8,8.0,Yes
2008-02-05,74.0,1004.8,8.0,Yes
...,...,...,...,...
2017-06-21,52.0,1025.3,2.0,No
2017-06-22,53.0,1024.6,2.0,No
2017-06-23,56.0,1015.0,6.0,No
2017-06-24,35.0,1015.1,2.0,No


In [17]:
#dataset.head()

### Step 5: Deal with remaining missing data
- A simple choice is to simply remove rows with missing data
- Use [dropna()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html)

In [18]:
Dataset = data[header]
Dataset

Unnamed: 0_level_0,Humidity3pm,Pressure3pm,Cloud3pm,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,84.0,1017.4,8.0,Yes
2008-02-02,73.0,1016.4,7.0,Yes
2008-02-03,86.0,1015.6,8.0,Yes
2008-02-04,90.0,1011.8,8.0,Yes
2008-02-05,74.0,1004.8,8.0,Yes
...,...,...,...,...
2017-06-21,52.0,1025.3,2.0,No
2017-06-22,53.0,1024.6,2.0,No
2017-06-23,56.0,1015.0,6.0,No
2017-06-24,35.0,1015.1,2.0,No


#### remaining missing data
- [참고자료](https://www.kaggle.com/code/alexisbcook/handling-missing-values/tutorial)

In [19]:
missing_val_count_by_column = Dataset.isnull().sum()
missing_val_count_by_column[missing_val_count_by_column > 0]

Humidity3pm     13
Pressure3pm     19
Cloud3pm       561
dtype: int64

- 방법1 : from kaggle 실패 (rows with missing이 필요하기 때문)

In [20]:
# cols_with_missing을 rows_with_missing으로 바꾸고 
# dropna(rows_with_missing, axis=1)으로
# 수정하면 이 코드를 사용할 수 있을 것 같다.
cols_with_missing = [col for col in Dataset.columns
                     if Dataset[col].isnull().any()]
reduced_Dataset = Dataset.drop(cols_with_missing, axis=0)
reduced_Dataset

KeyError: "['Humidity3pm', 'Pressure3pm', 'Cloud3pm'] not found in axis"

#### 방법2 : dropna(axis=0) 성공

In [21]:
Dataset.dropna(axis=0, inplace=True)
Dataset # 3337 rows -> 2754 rows

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Dataset.dropna(axis=0, inplace=True)


Unnamed: 0_level_0,Humidity3pm,Pressure3pm,Cloud3pm,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,84.0,1017.4,8.0,Yes
2008-02-02,73.0,1016.4,7.0,Yes
2008-02-03,86.0,1015.6,8.0,Yes
2008-02-04,90.0,1011.8,8.0,Yes
2008-02-05,74.0,1004.8,8.0,Yes
...,...,...,...,...
2017-06-21,52.0,1025.3,2.0,No
2017-06-22,53.0,1024.6,2.0,No
2017-06-23,56.0,1015.0,6.0,No
2017-06-24,35.0,1015.1,2.0,No


In [22]:
#dataset_clean = data.dropna()
#len(dataset), len(dataset_clean)

### Step 6: Create training and test datasets
- Define dataset **X** to be the data consisting of the three columns.
- Define dataset **y** to be datset cosisting of **'RainTomorrow'**.
    - HINT: Use list comprehension to transform **'No'** and **'Yes'** to 0 and 1, repectively (like in the Lesson)
- Divide into **X_train, X_test, y_train, y_test** with **train_test_split**
    - HINT: See how it is done in Lesson
    - You can use **random_state=42** (or any other number) if you want to reproduce results.

In [23]:
Dataset

Unnamed: 0_level_0,Humidity3pm,Pressure3pm,Cloud3pm,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,84.0,1017.4,8.0,Yes
2008-02-02,73.0,1016.4,7.0,Yes
2008-02-03,86.0,1015.6,8.0,Yes
2008-02-04,90.0,1011.8,8.0,Yes
2008-02-05,74.0,1004.8,8.0,Yes
...,...,...,...,...
2017-06-21,52.0,1025.3,2.0,No
2017-06-22,53.0,1024.6,2.0,No
2017-06-23,56.0,1015.0,6.0,No
2017-06-24,35.0,1015.1,2.0,No


#### features X

In [24]:
X = Dataset.loc[:, :'Cloud3pm']
X # X consist of the first three columns

Unnamed: 0_level_0,Humidity3pm,Pressure3pm,Cloud3pm
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-02-01,84.0,1017.4,8.0
2008-02-02,73.0,1016.4,7.0
2008-02-03,86.0,1015.6,8.0
2008-02-04,90.0,1011.8,8.0
2008-02-05,74.0,1004.8,8.0
...,...,...,...
2017-06-21,52.0,1025.3,2.0
2017-06-22,53.0,1024.6,2.0
2017-06-23,56.0,1015.0,6.0
2017-06-24,35.0,1015.1,2.0


#### target y

In [25]:
y = Dataset.loc[:, 'RainTomorrow']
y # y cosist of 'RainTomorrow'

Date
2008-02-01    Yes
2008-02-02    Yes
2008-02-03    Yes
2008-02-04    Yes
2008-02-05    Yes
             ... 
2017-06-21     No
2017-06-22     No
2017-06-23     No
2017-06-24     No
2017-06-25     No
Name: RainTomorrow, Length: 2754, dtype: object

####  label encoding
- transform 'No' and 'Yes' to 0 and 1, repectively

- 방법1 : [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) (Encode target labels with value between 0 and n_classes-1.)

In [26]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y)
le.classes_ # non-numerical labels

array(['No', 'Yes'], dtype=object)

In [27]:
encoded_labels = le.fit_transform(y)
encoded_labels

array([1, 1, 1, ..., 0, 0, 0])

#### Holdout Cross Validation을 위한 split

In [28]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)

- 방법2: Rune's way

In [29]:
"""
X = dataset_clean[header[:3]]
y = dataset_clean[header[3]]
"""
# y = np.array([0 if value == 'No' else 1 for value in y])
#y # got exactly same result with method 

'\nX = dataset_clean[header[:3]]\ny = dataset_clean[header[3]]\n'

### Step 7: Train and test the model
- Create classifier with **KNeighborsClassifier**
    - You can play around with n_neighbors (default =5)
- Fit the model with training data **(X_train, y_train**)
- Predict data from **X_test** (use predict) and assign to **y_pred**.
- Evalute score by using **metrics.accuracy_score(y_test, y_pred)**.

In [30]:
knn = KNeighborsClassifier(n_neighbors=6) # 임의로 수정 to 6
X_train

Unnamed: 0_level_0,Humidity3pm,Pressure3pm,Cloud3pm
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-09-06,58.0,1014.0,1.0
2010-03-29,56.0,1016.4,8.0
2013-06-09,63.0,1024.3,7.0
2013-04-14,54.0,1010.9,2.0
2013-11-01,61.0,1020.2,7.0
...,...,...,...
2014-06-01,70.0,1018.0,8.0
2012-10-07,32.0,1013.7,1.0
2012-11-11,45.0,1026.6,2.0
2013-06-22,83.0,1024.2,7.0


In [31]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=6)

In [32]:
y_pred = knn.predict(X_valid)
y_pred

array(['No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No',
       'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 

#### inverse_transform 연습

In [33]:
# LabelEncoder 연습 삼아 y_pred를 inverse_transform해보자
le.inverse_transform(y_pred) # # 1d array인 y_pred를 역전환

ValueError: y contains previously unseen labels: ['No' 'Yes']

#### y contains previously unseen labels: ['No' 'Yes']
- 원인1 : train_test_split(X, `y`, random_state=42)
- 원인2 : label encoding한 label(`encoded_labels`)이 아닌 `y`를 train_test_split에 전달했기 때문
- 해결 :  train_test_split(X, `encoded_labels`, random_state=42)

In [34]:
encoded_labels

array([1, 1, 1, ..., 0, 0, 0])

In [35]:
X_train, X_valid, y_train, y_valid = train_test_split(X, encoded_labels, random_state=42)

In [36]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_valid)
y_pred[:5]

array([0, 0, 1, 0, 0])

In [37]:
le.inverse_transform(y_pred)[:5]

array(['No', 'No', 'Yes', 'No', 'No'], dtype=object)

In [38]:
"""
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
metrics.accuracy_score(y_test, y_pred)
"""

'\nneigh = KNeighborsClassifier()\nneigh.fit(X_train, y_train)\ny_pred = neigh.predict(X_test)\nmetrics.accuracy_score(y_test, y_pred)\n'

#### [accuracy_score 1](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
- [accuracy_score user guide](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
- subset accuracy?

The accuracy_score function computes the accuracy, either the fraction(default) or the mount of the count (normalize=False) of correct predictions.

In multilabel classfication, the function returns the subset accuracy. If the entire set of predicted labels for a sample strictly match with the true set of labels, then the subset accuracy is 1.0; otherwise it is 0.0.

![score_accuracy.PNG](attachment:score_accuracy.PNG)

**parameters**
- `y_true` : 1d array, truth(correct) labels
- `y_pred` : predicted labels
- `normalize` : bool, default=True


**Returns**
<br>
If `normalize == True`, return the fraction of correctly classified samples (float), else returns the number of correctly classified samples (int).

example

In [39]:
import numpy as np
# y_pred = [0, 2, 1, 3]
# y_true = [0, 1, 2, 3]

metrics.accuracy_score(y_true, y_pred) # 0.5 = 2 / 4 
# the number of correctly classified samples : 2
# number of all samples : 4
# formula : if normalize=True, then 
#   number of correctly classified samples / number of all samples

NameError: name 'y_true' is not defined

In [40]:
# metrics.accuracy_score(y_true, y_pred, normalize=False) # the number of correctly classified samples

back to our problem

#### ValueError: y contains previously unseen labels: [2 3]???????????
- 원인: y_pred = [0, 2, 1, 3], y_true = [0, 1, 2, 3]를 execute하고 accuracy_score(y_pred=, y_true=)로 해서 발생

In [41]:
metrics.accuracy_score(le.inverse_transform(y_pred),
                       le.inverse_transform(y_valid),
                       normalize=True)

0.8127721335268505

In [42]:
metrics.accuracy_score(y_pred=le.inverse_transform(y_pred),
                       y_true=le.inverse_transform(y_valid),
                       normalize=False) # the number of correctly classified samples

560

In [43]:
metrics.accuracy_score(y_pred, y_valid) # 약 82%의 정확도로 비가 온다 안온다를 예측했다. 

0.8127721335268505

Rune : "The cloud measurement at 3pm actually add some accuracy to it"
같은 결론은 위험하다. 샘플 수가 줄어 든 것이 accuracy를 향상시켰을 수도 있기 때문이다. Higly advise you to play around with this. Rune : It's not the goal to get 100% down there.

### Step 8 (Optional): Try with different columns
- You can redo with diffrent choise of columns (starting from step 4)

In [44]:
data.dtypes

MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustSpeed    float64
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RISK_MM          float64
RainTomorrow      object
dtype: object

In [45]:
header = ['WindSpeed9am', 'Pressure9am', 'Cloud9am', 'RainTomorrow']
dataset = data[header]

In [46]:
dataset.head()

Unnamed: 0_level_0,WindSpeed9am,Pressure9am,Cloud9am,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-02-01,17.0,1017.6,8.0,Yes
2008-02-02,9.0,1017.9,7.0,Yes
2008-02-03,17.0,1016.7,7.0,Yes
2008-02-04,22.0,1014.2,8.0,Yes
2008-02-05,11.0,1008.3,8.0,Yes


In [47]:
dataset_clean = data.dropna()
len(dataset), len(dataset_clean)

(3337, 1696)

In [48]:
X = dataset_clean[header[:3]]
y = dataset_clean[header[3]]
y = np.array([0 if value == 'No' else 1 for value in y])

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [50]:
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.7264150943396226