In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [78]:
dataset_path = '../data/'

pollen_test = pd.read_csv(dataset_path + 'pollen_test.csv')
pollen_train = pd.read_csv(dataset_path + 'pollen_train.csv')
pollen_type = pd.read_csv(dataset_path + 'pollen-type.csv')
submission_example = pd.read_csv(dataset_path + 'submission_example.csv')
weather_data = pd.read_csv(dataset_path + 'weather_data.csv')

In [79]:
pollen_train.drop(['Unnamed: 0', 'location'], axis=1, inplace=True)
pollen_test.drop(['Unnamed: 0', 'location'], axis=1, inplace=True)

In [80]:
pollen_train.shape

(11254, 27)

In [81]:
columns = pollen_train.columns.tolist()

In [82]:
pollen_test.head()

Unnamed: 0,date,ACER,ALNUS,AMBROSIA,ARTEMISIA,BETULA,CANNABACEAE,CARPINUS,CELTIS,CHENOP/AMAR.,...,PLATANUS,POACEAE,POPULUS,QUERCUS,RUMEX,SALIX,TILIA,ULMACEAE,URTICACEAE,batch_id
0,2022-02-14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2022-02-15,0,5,0,0,6,0,0,0,0,...,0,0,6,0,0,0,0,13,0,1
2,2022-02-16,0,0,0,0,1,0,0,0,0,...,0,0,6,0,0,0,0,13,0,1
3,2022-02-17,0,6,0,0,11,0,0,0,0,...,0,0,22,0,0,0,0,30,0,1
4,2022-02-18,0,12,0,0,2,0,0,0,0,...,0,0,47,0,0,0,0,58,0,1


In [83]:
pollen_train.shape

(11254, 27)

In [84]:
# fill pollen_train with 0 if date is missing

date = pd.date_range(start='2016-01-01', end='2016-12-31')
date = date.strftime('%m-%d')
date = pd.DataFrame(date, columns=['date'])

# group pollen_train by date

pollen_train['date'] = pollen_train['date'].str[5:]
pollen_train = pollen_train.groupby('date').mean().reset_index()

# merge date and pollen_train
pollen_train = pollen_train.merge(date, on='date', how='left')
pollen_train = pollen_train.fillna(0)

# group pollen_test by batch_id

test_dates = [pollen_test['date'][0][5:] for i in range(0, len(pollen_test), 10)]
pollen_test = pollen_test.groupby('batch_id').mean().reset_index()
pollen_test['date'] = test_dates

In [85]:
# if date doesn't exist in pollen_train, fill with 0

pollen_train = pollen_train.merge(date, on='date', how='left')
pollen_train = pollen_train.fillna(0)

In [86]:
import warnings
import copy
warnings.filterwarnings('ignore')

In [87]:
# Make new dataset row consisting of 10 consecutive dates in pollen_train

new_pollen_train = pd.DataFrame()
new_pollen_val = pd.DataFrame()
new_pollen_test = pd.DataFrame()

# iterate trough pollen_train
for i in range(0, len(pollen_train)-11, 1):
    if i % 100 == 0:
        print("Batch: ", i)

    # get 10 consecutive rows
    train_rows = copy.deepcopy(pollen_train.iloc[i:i+10])
    start_date = train_rows.iloc[0]['date']
    train_rows.drop(['date'], axis=1, inplace=True)
    train_rows_mean = train_rows.mean()
    train_rows_mean['date'] = start_date

    new_pollen_train = new_pollen_train.append(train_rows_mean, ignore_index=True)

    val_rows = copy.deepcopy(pollen_train.iloc[i+10:i+11])
    new_pollen_val = new_pollen_val.append(val_rows, ignore_index=True)


Batch:  0


Batch:  100
Batch:  200


In [88]:
new_pollen_train[230:240]

Unnamed: 0,ACER,ALNUS,AMBROSIA,ARTEMISIA,BETULA,CANNABACEAE,CARPINUS,CELTIS,CHENOP/AMAR.,CORYLUS,...,PLATANUS,POACEAE,POPULUS,QUERCUS,RUMEX,SALIX,TILIA,ULMACEAE,URTICACEAE,date
230,0.0,0.0,69.942799,3.425261,0.0,0.963182,0.0,0.0,3.874623,0.0,...,0.021835,5.039605,0.0,0.0,0.064576,0.0,0.033624,0.0,5.371951,09-12
231,0.0,0.0,60.485714,3.441521,0.0,0.852671,0.0,0.0,3.524042,0.0,...,0.021835,4.547967,0.0,0.0,0.031475,0.0,0.02892,0.0,4.712485,09-13
232,0.0,0.002381,52.121429,3.48676,0.0,0.747909,0.0,0.0,3.288328,0.0,...,0.021835,4.243206,0.0,0.0,0.036237,0.0,0.02892,0.0,3.979152,09-14
233,0.0,0.002381,43.392857,3.451045,0.0,0.705052,0.0,0.0,2.978804,0.0,...,0.021835,3.87892,0.0,0.0,0.045761,0.0,0.031301,0.0,3.383914,09-15
234,0.0,0.002381,35.097619,3.420093,0.0,0.616957,0.0,0.0,2.497851,0.0,...,0.017073,3.359872,0.0,0.0,0.04338,0.0,0.021777,0.0,2.769628,09-16
235,0.0,0.002381,28.001278,3.313763,0.0,0.489837,0.0,0.0,2.199768,0.0,...,0.017073,2.902323,0.0,0.0,0.045761,0.0,0.024158,0.0,2.21597,09-17
236,0.0,0.002381,23.51626,3.3277,0.0,0.44489,0.0,0.0,1.948664,0.0,...,0.012195,2.554181,0.0,0.0,0.045703,0.0,0.01928,0.0,1.840244,09-18
237,0.0,0.002381,19.2223,3.216144,0.0,0.38101,0.0,0.0,1.730836,0.0,...,0.004878,2.199129,0.0,0.0,0.047967,0.0,0.024042,0.0,1.518525,09-19
238,0.0,0.002381,16.795528,3.169919,0.0,0.368002,0.0,0.0,1.622938,0.0,...,0.002381,2.068467,0.0,0.0,0.05511,0.0,0.021545,0.0,1.308537,09-20
239,0.0,0.002381,15.25842,3.060569,0.0,0.331069,0.0,0.0,1.544599,0.0,...,0.002381,1.91144,0.0,0.0,0.055052,0.0,0.019106,0.0,1.227758,09-21


In [89]:
new_pollen_val.head()

Unnamed: 0,date,ACER,ALNUS,AMBROSIA,ARTEMISIA,BETULA,CANNABACEAE,CARPINUS,CELTIS,CHENOP/AMAR.,...,PLANTAGO,PLATANUS,POACEAE,POPULUS,QUERCUS,RUMEX,SALIX,TILIA,ULMACEAE,URTICACEAE
0,02-05,0.066667,2.6,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.066667,0.0,0.0,0.033333,0.0,2.6,0.0
1,02-06,0.0,1.444444,0.055556,0.0,1.5,0.0,0.055556,0.0,0.0,...,0.0,0.0,0.0,0.055556,0.0,0.0,0.166667,0.027778,3.027778,0.0
2,02-07,0.108108,3.72973,0.054054,0.0,5.378378,0.0,0.027027,0.0,0.0,...,0.0,0.0,0.0,0.054054,0.0,0.0,0.135135,0.0,7.891892,0.0
3,02-08,0.194444,2.861111,0.0,0.0,0.777778,0.0,0.027778,0.0,0.0,...,0.0,0.0,0.0,0.444444,0.0,0.0,0.027778,0.0,4.805556,0.0
4,02-09,0.083333,6.166667,0.055556,0.0,0.472222,0.0,0.083333,0.0,0.027778,...,0.0,0.0,0.0,0.833333,0.027778,0.0,0.0,0.0,5.194444,0.0


In [90]:
pollen_test.head()

Unnamed: 0,batch_id,ACER,ALNUS,AMBROSIA,ARTEMISIA,BETULA,CANNABACEAE,CARPINUS,CELTIS,CHENOP/AMAR.,...,PLATANUS,POACEAE,POPULUS,QUERCUS,RUMEX,SALIX,TILIA,ULMACEAE,URTICACEAE,date
0,1,1.2,4.6,0.0,0.0,6.2,0.0,0.3,0.0,0.0,...,0.0,0.0,43.9,0.0,0.0,0.9,0.0,35.3,0.0,02-14
1,2,0.1,62.9,0.0,0.0,24.0,0.0,6.4,0.0,0.0,...,0.0,0.0,11.4,0.0,0.0,0.0,0.0,14.9,0.0,02-14
2,3,0.0,5.4,0.0,0.0,3.8,0.0,0.0,0.0,0.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,23.5,0.0,02-14
3,4,0.0,40.8,0.0,0.0,22.4,0.0,1.7,0.0,0.0,...,0.0,0.0,4.8,0.0,0.0,5.3,0.0,17.5,0.0,02-14
4,5,0.0,20.2,0.0,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.4,0.0,02-14


In [91]:
# remove year from date

pollen_test['date'] = pollen_test['date'].apply(lambda x: x[5:])

pollen_test.head()

Unnamed: 0,batch_id,ACER,ALNUS,AMBROSIA,ARTEMISIA,BETULA,CANNABACEAE,CARPINUS,CELTIS,CHENOP/AMAR.,...,PLATANUS,POACEAE,POPULUS,QUERCUS,RUMEX,SALIX,TILIA,ULMACEAE,URTICACEAE,date
0,1,1.2,4.6,0.0,0.0,6.2,0.0,0.3,0.0,0.0,...,0.0,0.0,43.9,0.0,0.0,0.9,0.0,35.3,0.0,
1,2,0.1,62.9,0.0,0.0,24.0,0.0,6.4,0.0,0.0,...,0.0,0.0,11.4,0.0,0.0,0.0,0.0,14.9,0.0,
2,3,0.0,5.4,0.0,0.0,3.8,0.0,0.0,0.0,0.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,23.5,0.0,
3,4,0.0,40.8,0.0,0.0,22.4,0.0,1.7,0.0,0.0,...,0.0,0.0,4.8,0.0,0.0,5.3,0.0,17.5,0.0,
4,5,0.0,20.2,0.0,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.4,0.0,


## Save CSV

In [92]:
# save new_pollen_train and new_pollen_test

new_pollen_train.to_csv(dataset_path + 'new_pollen_train.csv', index=False)
new_pollen_val.to_csv(dataset_path + 'new_pollen_val.csv', index=False)
pollen_test.to_csv(dataset_path + 'new_pollen_test.csv', index=False)