In [76]:
import xgboost as xgb
import pandas as pd
import numpy as np
from utils import *

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

from prophet.diagnostics import cross_validation, performance_metrics
from prophet.serialize import model_to_json, model_from_json

In [77]:
def get_pred_from_prophet(dates):
    dataset_path = "data/"

    with open('models/prophet.json', 'r') as fin:
        m = model_from_json(fin.read())  # Load model

    future = pd.DataFrame({'ds': dates})
    future['floor'] = 0
    future['cap'] = 2500

    forecast = m.predict(future)

    ans = []
    for forecast_row in forecast.itertuples():
        val = int(forecast_row.yhat)
        val = max(0, val)
        ans.append(val)

    return ans

In [78]:
def get_model(model_name):

    if model_name == 'logistic_regression':
        return LogisticRegression()
    
    elif model_name == 'svm':
        return SVC()
    
    elif model_name == 'random_forest':
        return RandomForestClassifier()
    
    elif model_name == 'xgboost':
        return xgb.XGBRegressor()
    
    elif model_name == 'naive_bayes':
        return GaussianNB()
    
    else:
        raise ValueError(f'Unknown model name: {model_name}')

In [98]:
def handle_dates(df, keep_date=False):
    date_columns = ['2', '31', '60', '89', '118', '147', '176', '205', '234', '263']

    # for col in date_columns:
    #     dates = list(df[col])
    #     df['prophet_' + col] = get_pred_from_prophet(dates)

    if keep_date:
        for col in date_columns:
            col = int(col)
            df[col] = pd.to_datetime(df[col])
            # df['day'] = df[col].dt.day
            df['month'] = df[col].dt.month
            # df['year'] = df['date'].dt.year
            df.drop([col], axis=1, inplace=True)
    else:
        for col in date_columns:
            col = int(col)
            df.drop([col], axis=1, inplace=True)

    return df

In [99]:
name = "xgboost"
dataset_path = "data/"

X_train, y, X_test = load_train_test(dataset_path + "pollen_train.csv", dataset_path + "pollen_test.csv")

In [111]:
y['БЕОГРАД - НОВИ БЕОГРАД'][0]

(0.0, 0.0, 0.0)

In [106]:
X_test_bg = pd.DataFrame(X_test['БЕОГРАД - НОВИ БЕОГРАД'])
X_test_bg.drop(columns=[0, 1], inplace=True, axis=1)
X_test_bg.head()

X_train_bg = pd.DataFrame(X_train['БЕОГРАД - НОВИ БЕОГРАД'])
X_train_bg.drop(columns=[0, 1], inplace=True, axis=1)
X_train_bg.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,280,281,282,283,284,285,286,287,288,289
0,2016-02-02,0,0,0,0,0,0,0,0,0,...,0,0,0,6,0,0,1,0,15,0
1,2016-02-03,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19,0
2,2016-02-04,0,0,0,0,0,0,0,0,0,...,0,0,0,18,0,0,0,0,71,0
3,2016-02-05,0,2,0,0,0,0,0,0,0,...,0,0,0,56,0,0,0,0,97,0
4,2016-02-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,76,0


In [107]:
X_test_bg = handle_dates(X_test_bg, keep_date=True)
X_train_bg = handle_dates(X_train_bg, keep_date=True)

In [108]:
model = get_model(name)
model.fit(X_train_bg, y['0'])
yhat1 = model.predict(X_test)
yhat1[yhat1 < 0] = 0

KeyError: '0'

In [44]:
def predict_three_days(X, X_test, y):
    model = get_model(name)
    model.fit(X, y['0'])
    yhat1 = model.predict(X_test)
    yhat1[yhat1 < 0] = 0


    X = X.iloc[:, 29:]
    X['day11'] = yhat1

    model = get_model(name)
    model.fit(X, y['1'])
    yhat2 = model.predict(X_test)
    yhat2[yhat2 < 0] = 0


    X = X.iloc[:, 29:]
    X['day12'] = yhat2

    model = get_model(name)
    model.fit(X, y['2'])
    yhat3 = model.predict(X_test)
    yhat3[yhat3 < 0] = 0

    return yhat1, yhat2, yhat3