In [1]:
import numpy as np
from sklearn.model_selection import train_test_split  # type: ignore
from sklearn.preprocessing import StandardScaler  # type: ignore
# import dependencies
import pandas as pd
import sqlalchemy as sq
import sys, os
from imblearn.combine import SMOTEENN
from sklearn.neighbors import KNeighborsClassifier  # type: ignore

from sklearn.metrics import (  # type: ignore
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)

# Make the directory two levels above the kernel's starting working directory
# both importable and the current working directory.
#
# The path is resolved to an absolute path exactly once and remembered in
# `_projectRoot`:
#   * a relative "../../" entry in sys.path is re-interpreted against whatever
#     the working directory is at import time, so after the chdir below it
#     would no longer point at the intended directory;
#   * re-running this cell in the same kernel would otherwise climb two more
#     directory levels each time and append a duplicate sys.path entry.
if "_projectRoot" not in globals():
    _projectRoot = os.path.abspath("../../")
if _projectRoot not in sys.path:
    sys.path.append(_projectRoot)
os.chdir(_projectRoot)

# Project-local helpers; importable only after the path setup above.
from ModelBuilderMethods import getConn, extractYears

In [2]:
# Notebook-wide pandas display settings:
#   display.max_colwidth = None -> never truncate the text inside a cell
#   display.max_rows     = 500  -> show up to 500 rows before eliding
displayOptions = {
    "display.max_colwidth": None,
    "display.max_rows": 500,
}
for optionName, optionValue in displayOptions.items():
    pd.set_option(optionName, optionValue)

In [3]:
# SQL statements used by this notebook, wrapped with sq.text().

# All columns of table `dataset_cross_monthly_station`.
weatherStationQuery = sq.text("""
    SELECT * from dataset_cross_monthly_station
""")

# Columns year, district and downgrade of table `ergot_sample_feat_eng`.
ergotTargetQuery = sq.text("""
    SELECT year, district, downgrade from ergot_sample_feat_eng
""")

In [4]:
# Open a database connection using the settings file "./.env" (path is relative
# to the current working directory) and load both query results into DataFrames.
conn = getConn("./.env")
try:
    stationDf = pd.read_sql(weatherStationQuery, conn)
    ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)
finally:
    # Always release the connection, even when one of the reads raises;
    # otherwise a failed query would leave it open for the kernel's lifetime.
    conn.close()
    del conn

In [5]:
# Left-join the station data onto the ergot target rows, matching on
# (year, district). Every target row is kept; rows without a matching
# station record get NaN in the station columns.
datasetDf = ergotTargetDf.merge(stationDf, how="left", on=["year", "district"])

# The target frame is no longer needed once merged. `stationDf` stays defined.
del ergotTargetDf

In [6]:
# One-hot encode `district`: build indicator columns named "district_<value>"
# (the first category is dropped), then swap them in for the original column.
districtDummies = pd.get_dummies(
    datasetDf["district"].astype("category"),
    prefix="district",
    drop_first=True,
)

# Remaining original columns first, indicator columns appended on the right.
datasetDf = pd.concat(
    [datasetDf.drop(columns=["district"]), districtDummies],
    axis=1,
)

del districtDummies

In [7]:
# Time-based split: years 1995-2015 form the training set and 2016-2020 the
# test set. The exact boundary handling is defined by extractYears.
trainYears = (1995, 2015)
testYears = (2016, 2020)

trainDf = extractYears(datasetDf, *trainYears)
testDf = extractYears(datasetDf, *testYears)

# The combined frame is not used after the split.
del datasetDf

In [8]:
# `year` has served its purpose for the split; remove it from both frames.
trainDf, testDf = [splitDf.drop(columns=["year"]) for splitDf in (trainDf, testDf)]

In [9]:
# Class balance before any resampling: print the number of rows per
# `downgrade` value, first for the training split, then for the test split.
for splitDf in (trainDf, testDf):
    print(splitDf["downgrade"].value_counts())

downgrade
False    122202
True       2082
Name: count, dtype: int64
downgrade
False    26307
True      1016
Name: count, dtype: int64


In [10]:
# Show how many missing values each training column contains.
nanPerColumn = trainDf.isna().sum()
print(nanPerColumn)

# Replace every missing training value with 0.
# NOTE(review): only trainDf is filled in this cell; confirm testDf is
# handled before it is used.
trainDf = trainDf.fillna(value=0)

downgrade                    0
1:min_temp_x              1246
1:max_temp_x              1246
1:mean_temp_x             1246
1:min_dew_point_temp      1246
1:max_dew_point_temp      1246
1:mean_dew_point_temp     1246
1:min_humidex             1246
1:max_humidex             1246
1:mean_humidex            1246
1:min_precip              1246
1:max_precip              1246
1:mean_precip             1246
1:min_rel_humid           1246
1:max_rel_humid           1246
1:mean_rel_humid          1246
1:min_stn_press           1246
1:max_stn_press           1246
1:mean_stn_press          1246
1:min_visibility          1246
1:max_visibility          1246
1:mean_visibility         1246
1:max_temp_y              1246
1:min_temp_y              1246
1:mean_temp_y             1246
1:min_total_rain          1246
1:max_total_rain          1246
1:mean_total_rain         1246
1:min_total_snow          1246
1:max_total_snow          1246
1:mean_total_snow         1246
1:min_total_precip        1246
1:max_to

In [11]:
# Resample the training data with SMOTEENN (fixed seed for repeatability).
# Per the imbalanced-learn docs, sampling_strategy=1 asks the over-sampling
# step for a 1:1 minority/majority ratio. The cleaning step that follows
# removes samples, so the final class counts need not be equal.
balancer = SMOTEENN(sampling_strategy=1, random_state=42)

trainFeatures = trainDf.drop(columns="downgrade")
trainTarget = trainDf["downgrade"]
balancedTrainDfX, balancedTrainDfY = balancer.fit_resample(trainFeatures, trainTarget)

# Release the intermediates; only the balanced outputs are kept.
del trainFeatures, trainTarget

In [12]:
# Class balance after resampling: rows per `downgrade` value.
balancedClassCounts = balancedTrainDfY.value_counts()
print(balancedClassCounts)

downgrade
False    115179
True      23757
Name: count, dtype: int64


### <u>**Step 5**</u>: KNN Model

In [13]:
# k-nearest-neighbours classifier that consults a single neighbour (k = 1).
nNeighbors = 1
model = KNeighborsClassifier(n_neighbors=nNeighbors)