In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Show every column and every row when a DataFrame is displayed.
# The fully qualified "display." keys are used on purpose: on recent pandas
# versions the bare 'max_columns' / 'max_rows' patterns also match the
# "styler.render.*" options, and set_option then raises
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Shared estimator objects. They are fitted inside run_rf_train and reused
# later in the notebook to transform the submission features.
rf_model = RandomForestRegressor(random_state=42)
imputer = KNNImputer(n_neighbors=5)
scaler = StandardScaler()

# Training features, plus the target column only from the labels file.
X_train = pd.read_csv('../../data/dengue_features_train.csv')
y_train = pd.read_csv(
    '../../data/dengue_labels_train.csv',
    usecols=['total_cases'],
)

In [3]:
def run_rf_train(df):
    """Fit the shared pipeline on a train split and print the hold-out MAE.

    The frame is split 80/20 (stratified on the ``city`` column). The
    notebook-level ``scaler`` and ``imputer`` are fitted on the training part
    only and then applied to the hold-out part, after which ``rf_model`` is
    fitted and scored. All three objects stay fitted after the call.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column ``total_cases`` and a ``city`` column.
        Every column other than ``total_cases`` is passed to the scaler, so
        they are presumably all numeric by this point -- verify against the
        caller.

    Returns
    -------
    None
        The mean absolute error on the hold-out part is printed, not returned.
    """
    features = df.drop(columns='total_cases')
    target = df['total_cases']

    feat_fit, feat_eval, target_fit, y_test = train_test_split(
        features,
        target,
        test_size=0.20,
        random_state=42,
        stratify=df['city'],
    )
    feature_names = feat_fit.columns

    def _to_frame(values):
        # Put the column labels back on the transformer output.
        return pd.DataFrame(values, columns=feature_names)

    # Fit on the training part only ...
    fit_scaled = _to_frame(scaler.fit_transform(feat_fit))
    fit_ready = _to_frame(imputer.fit_transform(fit_scaled))

    # ... and only *apply* the fitted transformers to the hold-out part.
    eval_scaled = _to_frame(scaler.transform(feat_eval))
    eval_ready = _to_frame(imputer.transform(eval_scaled))

    rf_model.fit(fit_ready, target_fit)
    y_pred = rf_model.predict(eval_ready)
    print(f'MAE:{mean_absolute_error(y_test, y_pred)}')

In [None]:
def knn_impute(df):
    """Standardize ``df`` with ``scaling`` and impute it with a 5-neighbour KNN imputer.

    Both the scaler (inside ``scaling``) and the imputer are fitted on the
    whole frame that is passed in. The result holds the scaled values; nothing
    here transforms them back to the original units.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale and impute.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the imputer output, labelled with ``df.columns``.
    """
    filled = KNNImputer(n_neighbors=5).fit_transform(scaling(df))
    return pd.DataFrame(filled, columns=df.columns)

In [None]:
def scaling(df):
    """Return a standardized copy of ``df``.

    A fresh ``StandardScaler`` is fitted on ``df`` itself and applied to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the scaler output, labelled with ``df.columns``.
    """
    standardized = StandardScaler().fit_transform(df)
    return pd.DataFrame(standardized, columns=df.columns)

In [4]:
def data_preprocess(df):
    """Drop the date column and encode ``city`` as a 0/1 indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``week_start_date`` column and a ``city`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``week_start_date`` in which ``city`` is 1 where
        the original value equals ``'iq'`` and 0 everywhere else. The input
        frame is left unchanged.

    Raises
    ------
    KeyError
        If ``week_start_date`` or ``city`` is missing from ``df``.
    """
    # drop or encode categorical cols
    df_processed = df.drop(columns='week_start_date')
    # Vectorized comparison instead of a per-row Python lambda; any value other
    # than 'iq' (including a missing one) becomes 0, exactly as before.
    df_processed['city'] = df_processed['city'].eq('iq').astype('int64')
    return df_processed

In [6]:
# Put features and target side by side, encode the non-numeric columns,
# then fit and evaluate. run_rf_train prints the hold-out MAE.
df = pd.concat([X_train, y_train], axis='columns')
df_processed = data_preprocess(df)
run_rf_train(df_processed)

MAE:12.774931506849315


In [7]:
# Load the features to predict on and the submission template whose
# `total_cases` column is filled in further down.
X_submit = pd.read_csv('../../data/dengue_features_test.csv')
y_submit = pd.read_csv('../../data/submission_format.csv')

In [8]:
# Send the submission features through the same steps as the training data.
# The scaler and imputer are only applied here (transform, not fit).
X_submit_processed = data_preprocess(X_submit)

X_submit_scaled = pd.DataFrame(
    scaler.transform(X_submit_processed),
    columns=X_submit_processed.columns,
)
X_submit_imputed = pd.DataFrame(
    imputer.transform(X_submit_scaled),
    columns=X_submit_scaled.columns,
)

# Round the model output to the nearest whole number.
y_submit_pred = np.rint(rf_model.predict(X_submit_imputed))
y_submit_pred

array([ 5.,  5.,  6.,  8., 10., 10., 14., 16., 18., 21., 23., 25., 28.,
       34., 53., 57., 46., 61., 80., 73., 68., 46., 39., 54., 47., 30.,
       30., 41., 36., 32., 26., 26., 13., 17., 18., 16., 14., 16., 14.,
       14., 12., 11.,  8., 11.,  7.,  7.,  7.,  5.,  5.,  4.,  6.,  5.,
        6.,  6.,  8.,  6., 10.,  8.,  8., 11., 11., 27., 33., 33., 39.,
       50., 56., 59., 70., 62., 68., 38., 63., 63., 72., 71., 78., 72.,
       53., 32., 32., 39., 39., 31., 22., 24., 25., 23., 16., 13., 16.,
       17., 12., 18., 14., 12., 14., 10.,  7., 13., 15.,  7.,  5.,  5.,
       12., 13., 12., 18., 12., 31., 38., 42., 23., 30., 45., 50., 34.,
       25., 52., 69., 68., 73., 95., 89., 75., 76., 73., 73., 98., 64.,
       53., 40., 39., 26., 21., 20., 30., 18., 20., 22., 15., 12., 17.,
       17., 14., 12., 14., 10.,  9.,  6.,  8.,  6.,  7.,  6.,  6.,  5.,
        6., 11.,  7.,  5.,  9., 13., 16., 31., 55., 44., 27., 34., 34.,
       40., 35., 46., 59., 52., 60., 59., 66., 34., 50., 68., 66

In [9]:
# Store the rounded predictions in the template as integers, in one step.
y_submit['total_cases'] = pd.Series(y_submit_pred, index=y_submit.index).astype(int)

In [10]:
# Preview the first rows of the filled-in submission frame.
y_submit.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,5
1,sj,2008,19,5
2,sj,2008,20,6
3,sj,2008,21,8
4,sj,2008,22,10


In [11]:
# Check the last ten rows of the filled-in submission frame.
y_submit.tail(10)

Unnamed: 0,city,year,weekofyear,total_cases
406,iq,2013,17,6
407,iq,2013,18,5
408,iq,2013,19,6
409,iq,2013,20,6
410,iq,2013,21,3
411,iq,2013,22,5
412,iq,2013,23,4
413,iq,2013,24,2
414,iq,2013,25,4
415,iq,2013,26,3


## "dengue_submission_01_rf01.csv" is the initial submission: Random Forest with standard scaling and KNN imputation (n_neighbors = 5)

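## "dengue_submission_02_rf01.csv" fixes the data leakage: the scaler and the KNN imputer are now fit on the training split only and then applied to the validation and submission data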

In [13]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
y_submit.to_csv('../../data/dengue_submission_02_rf01.csv', index = False)

In [14]:
# Read the written file back and preview it to confirm what was saved.
y_sub = pd.read_csv('../../data/dengue_submission_02_rf01.csv')
y_sub.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,5
1,sj,2008,19,5
2,sj,2008,20,6
3,sj,2008,21,8
4,sj,2008,22,10
