In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [33]:
X_train = pd.read_csv('../../data/dengue_features_train.csv')
y_train = pd.read_csv('../../data/dengue_labels_train.csv', usecols=['total_cases'])

In [34]:
def run_rf(df):
    #df = knn_impute(df)
    X = df.drop('total_cases', axis=1)
    y = df['total_cases']
    
    X = knn_impute(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42, stratify=df['city'])
    
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    print(f'MAE:{mean_absolute_error(y_test, y_pred)}')
    return rf_model

In [35]:
def knn_impute(df):
    scaled_df = scaling(df)
    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(scaled_df)
    
    return pd.DataFrame(imputed, columns=df.columns)

In [36]:
def scaling(df):
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
    return df_scaled

In [37]:
def data_preprocess(df):
    # drop or encode categorical cols
    df_processed = df.drop('week_start_date', axis=1)
    df_processed['city'] = df_processed['city'].apply(lambda x : 1 if x=='iq' else 0)
    return df_processed

In [39]:
df = pd.concat([X_train, y_train], axis=1)
df_processed = data_preprocess(df)
rf_model = run_rf(df_processed)

MAE:12.832431506849314


In [62]:
X_submit = pd.read_csv('../../data/dengue_features_test.csv')
y_submit = pd.read_csv('../../data/submission_format.csv')

In [63]:
X_submit_processed = data_preprocess(X_submit)
X_submit_imputed = knn_impute(X_submit_processed)
y_submit_pred = np.rint(rf_model.predict(X_submit_imputed))
y_submit_pred

array([ 17.,  17.,  24.,  26.,  20.,  22.,  22.,  73.,  23.,  29.,  27.,
        34.,  57.,  95.,  62., 156.,  77.,  92., 174.,  93.,  87., 101.,
       111., 102., 120.,  95., 132.,  99., 179., 125., 118., 144., 103.,
       101.,  87.,  35.,  49.,  38.,  32.,  36.,  30.,  25.,  25.,  30.,
        29.,  22.,  22.,  19.,  24.,  20.,  21.,  20.,  21.,  21.,  20.,
        20.,  24.,  24.,  22.,  27.,  34.,  48.,  46.,  45.,  53.,  94.,
        93.,  94., 116., 144., 211., 178., 215., 205., 216., 205., 193.,
       174., 251., 222., 212., 225., 204., 208., 197., 213., 214.,  89.,
        49.,  57.,  60.,  52.,  51.,  50.,  52.,  50.,  45.,  47.,  41.,
        44.,  40.,  40.,  34.,  39.,  45.,  63.,  44.,  49.,  43.,  84.,
        85.,  67.,  74.,  91.,  97., 110., 103., 112., 121., 179., 160.,
       149., 153., 141.,  87., 116.,  99.,  74., 105.,  86., 181., 109.,
        89.,  80.,  98., 108.,  93.,  84.,  87.,  26.,  18.,  17.,  19.,
        20.,  19.,  16.,  16.,  11.,  11.,   8.,   

In [65]:
y_submit['total_cases'] = y_submit_pred
y_submit['total_cases'] = y_submit['total_cases'].astype(int)

In [66]:
y_submit.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,17
1,sj,2008,19,17
2,sj,2008,20,24
3,sj,2008,21,26
4,sj,2008,22,20


In [67]:
y_submit.tail(10)

Unnamed: 0,city,year,weekofyear,total_cases
406,iq,2013,17,6
407,iq,2013,18,5
408,iq,2013,19,4
409,iq,2013,20,5
410,iq,2013,21,3
411,iq,2013,22,5
412,iq,2013,23,5
413,iq,2013,24,3
414,iq,2013,25,4
415,iq,2013,26,3


## "dengue_submission_01_rf01.csv" is the initial submission with Random Forest, Standard Scaling and KNN imputing (n_neighbors = 5)

In [70]:
y_submit.to_csv('../../data/dengue_submission_01_rf01.csv', index = False)

In [71]:
y_sub = pd.read_csv('../../data/dengue_submission_01_rf01.csv')
y_sub.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,17
1,sj,2008,19,17
2,sj,2008,20,24
3,sj,2008,21,26
4,sj,2008,22,20
