In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from utils import *

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

from prophet.diagnostics import cross_validation, performance_metrics
from prophet.serialize import model_to_json, model_from_json

from datetime import datetime, timedelta

Importing plotly failed. Interactive plots will not work.


In [2]:
LEN_DAY_WINDOW = 10
LEN_DAY_TARGET = 3

In [3]:
def are_adjacent(dates):
    for i in range(len(dates) - 1):
        date1 = dates[i]
        date2 = dates[i+1]

        date1_obj = datetime.strptime(date1, '%Y-%m-%d')
        date2_obj = datetime.strptime(date2, '%Y-%m-%d')

        date_diff = abs((date1_obj - date2_obj).days)

        if date_diff > 1:
            return False
    return True

In [4]:
def get_weather_features(weather_data_path = "data/weather_data.csv"):
    weather_data = pd.read_csv(weather_data_path)
    weather_data.drop(['Unnamed: 0', 'snow', 'wpgt', 'tsun', 'prcp', 'tmin', 'tmax'], axis=1, inplace=True)
    weather_data['date'] = pd.to_datetime(weather_data['date'])
    weather_data['month_day'] = weather_data['date'].dt.strftime('%m-%d')
    weather_data = weather_data.groupby('month_day').mean().reset_index()

    return weather_data

In [5]:
def reshape_data(df, target = 'AMBROSIA', stride = 1, train = True):
    df = df.drop(['location', 'Unnamed: 0'], axis = 1)
    upper = LEN_DAY_WINDOW + LEN_DAY_TARGET + 1
    if not train:
       upper = LEN_DAY_WINDOW
    X = []
    y = []
    for i in range(0, df.shape[0] - (upper - 1), stride):
        ran = range(i, i + LEN_DAY_WINDOW)

        dates = list(df.iloc[ran]['date'].values)
        X.append(np.array(df.iloc[ran]).flatten())
        
        if train:
            y.append((
                float(df.iloc[[i+LEN_DAY_WINDOW + 1]][target]),
                float(df.iloc[[i+LEN_DAY_WINDOW + 2]][target]),
                float(df.iloc[[i+LEN_DAY_WINDOW + 3]][target]),
            ))
    return X, y

In [6]:
def load_train_test(train_path, test_path, stride = 1, target = 'AMBROSIA'):
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    weather_df = get_weather_features()
    df_train['month_day'] = pd.to_datetime(df_train['date']).dt.strftime('%m-%d')
    df_train = df_train.merge(weather_df, on = 'month_day', how = 'left')
    df_train.drop(['month_day'], axis = 1, inplace = True)

    df_test['month_day'] = pd.to_datetime(df_test['date']).dt.strftime('%m-%d')
    df_test = df_test.merge(weather_df, on = 'month_day', how = 'left')
    df_test.drop(['month_day'], axis = 1, inplace = True)

    locations = df_train['location'].unique()
    batch_id_loc = {}
    for loc in locations:
        batch_id_loc[loc] = df_test[df_test['location'] == loc]['batch_id'].unique()

    df_test = df_test.drop('batch_id', axis = 1)

    train_data = {}
    train_targets = {}
    test_data = {}

    for location in locations:
        train_data[location] = df_train[df_train['location'] == location]
        train_data[location], train_targets[location] = reshape_data(train_data[location], target, stride)
        test_data[location] = df_test[df_test['location'] == location]
        test_data[location], _ = reshape_data(test_data[location], target = target, stride = LEN_DAY_WINDOW, train = False)

    
    df_train.drop(['location', 'Unnamed: 0'], axis = 1, inplace = True)
    column_names = df_train.columns
    
    return train_data, train_targets, test_data, batch_id_loc, column_names

In [7]:
def get_colname_by_index(column_names, index):
    return column_names[index % len(column_names)]

In [8]:
def return_data_location(X_train, X_test, y, location, column_names):
    X_train_loc = pd.DataFrame(X_train[location])
    y_loc = y[location]
    X_test_loc = pd.DataFrame(X_test[location])

    X_train_loc.columns = [get_colname_by_index(column_names, i) + "_" + str(i // len(column_names)) for i in range(X_train_loc.shape[1])]
    X_test_loc.columns = [get_colname_by_index(column_names, i) + "_" + str(i // len(column_names)) for i in range(X_test_loc.shape[1])]

    return X_train_loc, y_loc, X_test_loc

In [9]:
def get_pred_from_prophet(dates):
    dataset_path = "data/"

    with open('models/prophet.json', 'r') as fin:
        m = model_from_json(fin.read())  # Load model

    future = pd.DataFrame({'ds': dates})
    future['floor'] = 0
    future['cap'] = 2500

    forecast = m.predict(future)

    ans = []
    for forecast_row in forecast.itertuples():
        val = int(forecast_row.yhat)
        val = max(0, val)
        ans.append(val)

    return ans

In [10]:
def add_prophet_features(df):
    date_columns = []
    for i, col in enumerate(df.columns):
        if col.startswith('date'): date_columns.append(col)

    last_dates = list(df[date_columns[-1]])
    last_dates = [datetime.strptime(last_date, '%Y-%m-%d') for last_date in last_dates]

    day11 = [last_date + timedelta(days=1) for last_date in last_dates]
    day11 = [d.strftime('%Y-%m-%d') for d in day11]

    day12 = [last_date + timedelta(days=2) for last_date in last_dates]
    day12 = [d.strftime('%Y-%m-%d') for d in day12]

    day13 = [last_date + timedelta(days=3) for last_date in last_dates]
    day13 = [d.strftime('%Y-%m-%d') for d in day13]

    df['prophet_1'] = get_pred_from_prophet(day11)
    df['prophet_2'] = get_pred_from_prophet(day12)
    df['prophet_3'] = get_pred_from_prophet(day13)

    return df

In [11]:
def return_targets_day(y_loc, day):
    y_loc_day = [x[day] for x in y_loc]
    
    return y_loc_day

In [12]:
train_data, train_targets, test_data, batch_id, column_names = load_train_test('data/pollen_train.csv', 'data/pollen_test.csv', stride = 1, target = 'AMBROSIA')

  weather_data = weather_data.groupby('month_day').mean().reset_index()


   Unnamed: 0                location        date  ACER  ALNUS  AMBROSIA  \
0         265  БЕОГРАД - НОВИ БЕОГРАД  2016-02-02     0      0         0   
1         266  БЕОГРАД - НОВИ БЕОГРАД  2016-02-03     0      1         0   
2         267  БЕОГРАД - НОВИ БЕОГРАД  2016-02-04     0      0         0   
3         268  БЕОГРАД - НОВИ БЕОГРАД  2016-02-05     0      2         0   
4         269  БЕОГРАД - НОВИ БЕОГРАД  2016-02-06     0      0         0   

   ARTEMISIA  BETULA  CANNABACEAE  CARPINUS  ...  QUERCUS  RUMEX  SALIX  \
0          0       0            0         0  ...        0      0      0   
1          0       0            0         0  ...        0      0      0   
2          0       0            0         0  ...        0      0      0   
3          0       0            0         0  ...        0      0      0   
4          0       0            0         0  ...        0      0      0   

   TILIA  ULMACEAE  URTICACEAE      tavg        wdir       wspd         pres  
0      0     

In [13]:
N_FEATURES_DAY = len(column_names)
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

def predict_three_days(X, X_test, y):
    model11 = xgb.XGBRegressor()
    model11.fit(X, return_targets_day(y, 0))
    day11 = model11.predict(X)
    day11[day11 < 0] = 0
    day11 = day11.astype(int)
    yhat1 = model11.predict(X_test)

    error = np.mean(np.abs(cross_val_score(model11, X, y, cv=5, scoring='neg_mean_absolute_error')))
    print("MAE: ", error)

    X = X.iloc[:, N_FEATURES_DAY:]
    X['day11'] = day11

    X_test = X_test.iloc[:, N_FEATURES_DAY:]
    X_test['day11'] = yhat1

    model12 = xgb.XGBRegressor()
    model12.fit(X, return_targets_day(y, 1))
    day12 = model12.predict(X)
    day12[day12 < 0] = 0
    day12 = day12.astype(int)
    yhat2 = model12.predict(X_test)

    X = X.iloc[:, N_FEATURES_DAY:]
    X['day12'] = day12

    X_test = X_test.iloc[:, N_FEATURES_DAY:]
    X_test['day12'] = yhat2

    model13 = xgb.XGBRegressor()
    model13.fit(X, return_targets_day(y, 2))
    day13 = model13.predict(X)
    day13[day13 < 0] = 0
    day13 = day13.astype(int)
    yhat3 = model13.predict(X_test)


    yhat1[yhat1 < 0] = 0
    yhat1 = yhat1.astype(int)
    yhat2[yhat2 < 0] = 0
    yhat2 = yhat2.astype(int)
    yhat3[yhat3 < 0] = 0
    yhat3 = yhat3.astype(int)

    return yhat1, yhat2, yhat3, error

In [14]:
def handle_dates(df, keep_date=False):
    date_columns = []
    for i, col in enumerate(df.columns):
        if col.startswith('date'): date_columns.append(col)
    
    if keep_date:
        for col in date_columns:
            df[col] = pd.to_datetime(df[col])
            df['day'] = df[col].dt.day
            df['month'] = df[col].dt.month
            df['year'] = df[col].dt.year - 2017
            df.drop([col], axis=1, inplace=True)
    else:
        for col in date_columns:
            df.drop([col], axis=1, inplace=True)

    return df

In [15]:
def get_locations(train_path):
    df_train = pd.read_csv(train_path)
    locations = df_train['location'].unique()
    return locations

In [16]:
locations = get_locations('data/pollen_train.csv')

res = {}
batch_for_df = {}
errors = []

for loc in locations:
    print("Location: ", loc)
    X_loc, y_loc, X_test_loc = return_data_location(train_data, test_data, train_targets, loc, column_names)
    X_loc = add_prophet_features(X_loc)

    X_test_loc = add_prophet_features(X_test_loc)

    X_loc = handle_dates(X_loc, keep_date=True)
    X_test_loc = handle_dates(X_test_loc, keep_date=True)
    
    yhat1_loc, yhat2_loc, yhat3_loc, err = predict_three_days(X_loc, X_test_loc, y_loc)

    errors.append(err)

    bid = batch_id[loc]
    for i, b in enumerate(bid):
        batch_for_df[b] = [yhat1_loc[i], yhat2_loc[i], yhat3_loc[i]]
print("Mean error: ", np.mean(errors))

Location:  БЕОГРАД - НОВИ БЕОГРАД
MAE:  17.541888408330298
Location:  ВРШАЦ
MAE:  18.523563378927655
Location:  КРАГУЈЕВАЦ
MAE:  9.976562112993864
Location:  КРАЉЕВО
MAE:  9.417293545131836
Location:  НИШ
MAE:  4.958351522078163
Location:  ПОЖАРЕВАЦ
MAE:  15.803620694258328
Location:  СУБОТИЦА
MAE:  26.470173451223427
Mean error:  14.670207587563366


In [17]:
res_df = pd.DataFrame(batch_for_df).T
res_df.head()
res_df.sort_index(inplace=True)
res_df.columns = ['1 day prediction','2 days prediction','3 days prediction']
res_df.insert(0, 'batch_id', range(1, res_df.shape[0] + 1))

In [18]:
res_df.to_csv('results/submission.csv', index=False)