## KNN For Satellite Weather

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split  # type: ignore
from sklearn.preprocessing import StandardScaler  # type: ignore

# import dependencies
import pandas as pd
import sqlalchemy as sq
import sys, os
from imblearn.combine import SMOTEENN
from sklearn.neighbors import KNeighborsClassifier  # type: ignore

from sklearn.metrics import (  # type: ignore
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)

sys.path.append("../../")
os.chdir("../../")
from ModelBuilderMethods import getConn, extractYears

In [2]:
# Display settings: never truncate the text inside a cell, and show up to 500 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

### <u>**Step 1**</u>: Data Selection

In this step, we choose the data/tables to use and pick the attributes we need from them. Further aggregation or feature engineering can also be done here to support the goal of the research.

In particular, for this notebook we load the following data and merge them (on year and district) into a single table:
- Monthly weather satellite data
- ergot data (downgrade)

In [3]:
# Set the query text
# Features: every column of the monthly satellite weather table.
weatherSatQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_sat
"""
)

# Target: the downgrade label together with the (year, district) keys
# that are used later to join it to the satellite features.
ergotTargetQuery = sq.text(
    """
    SELECT year, district, downgrade from ergot_sample_feat_eng
"""
)

In [4]:
# Open one database connection through the project helper getConn (it is given
# the path of the .env file), load both tables, and always release the
# connection -- even when one of the queries raises.
conn = getConn("./.env")
try:
    satelliteDf = pd.read_sql(weatherSatQuery, conn)
    ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)
finally:
    # runs on success and on failure, so the connection is never leaked
    conn.close()
    del conn

In [5]:
# Left-join the satellite features onto the ergot targets by (year, district):
# every target row is kept, even when no satellite row matches it.
datasetDf = ergotTargetDf.merge(satelliteDf, how="left", on=["year", "district"])
del ergotTargetDf

In [6]:
# One-hot encode district (the first level is dropped as the reference category),
# then swap the original column for the indicator columns.
districtDummies = pd.get_dummies(
    datasetDf["district"].astype("category"), prefix="district", drop_first=True
)
datasetDf = pd.concat([datasetDf.drop(columns=["district"]), districtDummies], axis=1)

del districtDummies

### <u>**Step 2**</u>: Splitting dataset

- We split the whole dataset into train and test sets. Specifically, we split by year (1995 - 2015 for training, 2016 - 2020 for testing) because this is time-series data.

In [7]:
# train 1995 - 2015 test 2016 - 2020
# Chronological hold-out: the most recent years are reserved for testing
# instead of sampling rows at random.
# NOTE(review): extractYears is a project helper; presumably both bounds are
# inclusive -- confirm against ModelBuilderMethods.
trainDf = extractYears(datasetDf, 1995, 2015)
testDf = extractYears(datasetDf, 2016, 2020)
# the combined frame is not used again in this notebook section
del datasetDf

In [8]:
# drop year
# year was only needed to build the split; removing it keeps it out of the
# feature matrix (every column except downgrade is later used as a feature)
trainDf = trainDf.drop(columns=["year"])
testDf = testDf.drop(columns=["year"])

### <u>**Step 3**</u>: [Balancing the dataset](https://imbalanced-learn.org/stable/)

- Our dataset is imbalanced, which can lead to bias when training/testing. The balancing step helps reduce this bias and thus provides more reliable results.

In [9]:
# pre balancing check
# print value counts downgrade
# (class distribution of the target in the training split, then in the test split)
print(trainDf["downgrade"].value_counts())
print(testDf["downgrade"].value_counts())

downgrade
False    122202
True       2082
Name: count, dtype: int64
downgrade
False    26307
True      1016
Name: count, dtype: int64


In [10]:
# count nan
print(trainDf.isna().sum())
# set nan to 0 -- in BOTH splits, so train and test receive the same
# preprocessing and predict() is never handed missing values
# (KNeighborsClassifier with its default metric rejects NaN input)
trainDf = trainDf.fillna(0)
testDf = testDf.fillna(0)

downgrade                           0
1:min_dewpoint_temperature          0
1:min_temperature                   0
1:min_evaporation_from_bare_soil    0
1:min_skin_reservoir_content        0
                                   ..
district_4830                       0
district_4840                       0
district_4850                       0
district_4860                       0
district_4870                       0
Length: 687, dtype: int64


In [11]:
# Resample the TRAINING split only; the test split keeps its original class
# distribution. random_state is fixed so the resampling is repeatable.
# NOTE(review): SMOTEENN combines SMOTE over-sampling with Edited Nearest
# Neighbours cleaning, so the result need not be exactly 1:1 even with
# sampling_strategy=1 -- see the value counts printed in the next cell.
balancer = SMOTEENN(sampling_strategy=1, random_state=42)
balancedTrainDfX, balancedTrainDfY = balancer.fit_resample(
    trainDf.drop(columns="downgrade"), trainDf["downgrade"]
)

In [12]:
# post balancing check: class distribution of the resampled training target
print(balancedTrainDfY.value_counts())

downgrade
False    115239
True      25156
Name: count, dtype: int64


### <u>**Step 4**</u>: Regularization / Normalization
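KNN classifies by the distance between feature vectors, so features measured on different scales should be brought to a comparable range before fitting. Candidate scikit-learn scalers and transformers: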

1. [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)             
2. [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html)  
3. [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)  
4. [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)  
5. [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html)  
6. [PowerTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html)  
7. [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html)  

In [13]:
def printMetrics(model_name, y_true, y_pred):
    """Print a fixed set of classification scores for one model.

    Args:
        model_name: Label printed as the first line of the report.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``. The ROC AUC line is
            computed from these same values, exactly as passed in.

    Returns:
        None. Everything is written to stdout, followed by a blank line.
    """
    # (label, scorer) pairs, listed in the order they are reported.
    scorers = (
        ("Accuracy: ", accuracy_score),
        ("Precision: ", precision_score),
        ("Recall: ", recall_score),
        ("F1: ", f1_score),
        ("ROC AUC: ", roc_auc_score),
        ("Classification Report: \n", classification_report),
    )

    print(model_name)
    for label, scorer in scorers:
        print(label, scorer(y_true, y_pred))
    print()

### <u>**Step 5**</u>: KNN Model

##### <u>**Step 5.1**</u>: Initialize the model

In [14]:
# Two identically configured 1-nearest-neighbour classifiers: one is fitted on
# the original training split, the other on the SMOTEENN-resampled split, so
# the effect of balancing can be compared.
# NOTE(review): n_neighbors=1 is hard-coded, and the features are not scaled
# before fitting (StandardScaler is imported but unused in the cells shown) --
# confirm this is intended for a distance-based model.
knn_model = KNeighborsClassifier(n_neighbors=1)
balanced_knn_model = KNeighborsClassifier(n_neighbors=1)

##### <u>**Step 5.2**</u>: Fit the training data to the model

In [15]:
# Fit one model on the original (imbalanced) training split and the other on
# the resampled split; downgrade is the target, every other column a feature.
knn_model.fit(trainDf.drop(columns="downgrade"), trainDf["downgrade"])
balanced_knn_model.fit(balancedTrainDfX, balancedTrainDfY)

##### <u>**Step 5.3**</u>: Test the model on the testing dataset

In [16]:
# Both models are scored on the same, un-resampled test features.
predictions_knn = knn_model.predict(testDf.drop(columns="downgrade"))
predictions_balanced_knn = balanced_knn_model.predict(testDf.drop(columns="downgrade"))

In [17]:
# How many test rows each model assigns to each class
# (imbalanced-trained model first, balanced-trained model second).
print(pd.DataFrame(predictions_knn).value_counts())
print(pd.DataFrame(predictions_balanced_knn).value_counts())

False    26870
True       453
Name: count, dtype: int64
False    14342
True     12981
Name: count, dtype: int64


##### <u>**Step 5.4**</u>: Evaluate models based on different metrics:
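- ACCURACY: fraction of all samples that are classified correctly.
- PRECISION: fraction of predicted positives (`downgrade = True`) that are actually positive.
- RECALL: fraction of actual positives that the model finds.
- F1: harmonic mean of precision and recall.
- ROC AUC: area under the ROC curve; 0.5 corresponds to random guessing.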

In [18]:
# Report the same metrics for both models against the true test labels.
printMetrics("KNN imbalanced train set", testDf["downgrade"], predictions_knn)
printMetrics(
    "KNN balanced train set",
    testDf["downgrade"],
    predictions_balanced_knn,
)

KNN imbalanced train set
Accuracy:  0.9466749624858178
Precision:  0.013245033112582781
Recall:  0.005905511811023622
F1:  0.008168822328114363
ROC AUC:  0.49445691829575017
Classification Report: 
               precision    recall  f1-score   support

       False       0.96      0.98      0.97     26307
        True       0.01      0.01      0.01      1016

    accuracy                           0.95     27323
   macro avg       0.49      0.49      0.49     27323
weighted avg       0.93      0.95      0.94     27323


KNN balanced train set
Accuracy:  0.5178787102441167
Precision:  0.03173869501579231
Recall:  0.40551181102362205
F1:  0.058869757805243984
ROC AUC:  0.4638651159881101
Classification Report: 
               precision    recall  f1-score   support

       False       0.96      0.52      0.68     26307
        True       0.03      0.41      0.06      1016

    accuracy                           0.52     27323
   macro avg       0.49      0.46      0.37     27323
weighte