In [38]:
import numpy as np
import pandas as pd
import csv 
import os
import pickle
from collections import defaultdict
from sklearn import preprocessing

from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [2]:
def load_labels(filename):
    ret = {}
    val = []
    with open(filename, newline='', encoding = 'utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            region, start_date, end_date = row['Region'], row['Start Date'], row['End Date']
            if row['Deliveries'] != '':
                d = float(row['Deliveries'])
            else:
                d = 0
            if row['Prices'] != '':
                p = float(row['Prices'])
            else:
                p = 0
            ret[(region, start_date, end_date)] = (d, p)
            val.append([d,p])
    return ret, val

_, pred = load_labels("baseline.csv")

In [19]:
def write2submission(pred):
    sol = open("solution/solution.csv", newline='', encoding='utf-8',mode='w')
    sam = open("sample.csv", newline='', encoding='utf-8',mode='r')
    bl = open("baseline.csv", newline='', encoding='utf-8',mode='r')
    bl.readline()
    sol.write(sam.readline())
    con = sam.readlines()
    con_bl = bl.readlines()
    for ind, row in enumerate(con):
        row = row.strip().split(',')
        row_bl = con_bl[ind].strip().split(',')
        row[3] = str(max(pred[ind][0],0))
        row[4] = row_bl[4]
        #row[4] = str(pred[ind][1])
        sol.write(",".join(row) + "\n")
    sol.close()
    sam.close()
    bl.close()



In [40]:
class CanolaData(object):
    def __init__(self, loadfile):
        self.region_startdate = {}
        self.region_startdate["Alberta"] = 2001
        self.region_startdate["British Columbia"] = 2001
        self.region_startdate["Manitoba"] = 2001
        self.region_startdate["Ontario"] = 2001
        self.region_startdate["Saskatchewan"] = 2001
        self.region_startdate["Québec"] = 2001
        self.region_list = ["Alberta", "British Columbia", "Manitoba", "Ontario", "Québec", "Saskatchewan"]
        self.region_onthot = {}
        self.region_onthot["Alberta"] = [1,0,0,0,0,0]
        self.region_onthot["British Columbia"] = [0,1,0,0,0,0]
        self.region_onthot["Manitoba"] = [0,0,1,0,0,0]
        self.region_onthot["Ontario"] = [0,0,0,1,0,0]
        self.region_onthot["Saskatchewan"] = [0,0,0,0,1,0]
        self.region_onthot["Québec"] = [1,0,0,0,0,1]
        self.season_onthot = {}
        self.season_onthot[1] = [0,0,0,1]
        self.season_onthot[2] = [0,0,0,1]
        self.season_onthot[3] = [0,0,0,1]
        self.season_onthot[4] = [0,0,0,1]
        self.season_onthot[5] = [1,0,0,0]
        self.season_onthot[6] = [0,1,0,0]
        self.season_onthot[7] = [0,1,0,0]
        self.season_onthot[8] = [0,0,1,0]
        self.season_onthot[9] = [0,0,1,0]
        self.season_onthot[10] = [0,0,1,0]
        self.season_onthot[11] = [0,0,0,1]
        self.season_onthot[12] = [0,0,0,1]

        if loadfile is False:
            self.delivery = self.targetDataLoader("Challenge_Data/1. Target Variables/Canada_Canola_Producer_Deliveries.csv")
            self.price = self.targetDataLoader("Challenge_Data/1. Target Variables/Canada_Canola_Producer_Prices.csv")
            self.harvested_area = self.yearlyDataLoader("Challenge_Data/2. Other Canola Production Data/Canada_Canola_Harvested_Area.csv")
            self.yield_value = self.yearlyDataLoader("Challenge_Data/2. Other Canola Production Data/Canada_Canola_Yield.csv")
            self.rainfall_data = self.rainfallDataLoader()
            self.temp_data = self.temperatureDataLoader()
            save_dict = {"delivery": self.delivery,
                        "price": self.price,
                        "harvested_area": self.harvested_area,
                        "yield_value": self.yield_value,
                        "rainfall_data": self.rainfall_data,
                        "temp_data": self.temp_data}
            with open("dataprepared.pkl","wb") as f:
                pickle.dump(save_dict, f)
        
        else:
            with open("dataprepared.pkl","rb") as f:
                savedict = pickle.load(f)
            self.delivery = savedict["delivery"]
            self.price = savedict["price"]
            self.harvested_area = savedict["harvested_area"]
            self.yield_value = savedict["yield_value"]
            self.rainfall_data = savedict["rainfall_data"]
            self.temp_data = savedict["temp_data"]


    def deliveryDataCombine(self, history_length=2, for_train=True, harvested=False, yields=False, rainfall=False, temp=False):
        start_year = np.max(list(self.region_startdate.values())) + history_length
        data = []
        for year in range(start_year, 2019):
            for region in self.region_list:
                region_feature = self.region_onthot[region]
                for month in range(12):
                    season_feature = self.season_onthot[month+1]
                    x_delivery = []
                    for k in range(1, history_length+1):
                        x_delivery.append(self.delivery[region][year-k][month])
                    x_delivery.extend(self.rainfall_data[region][year][month][:2])
                    x_delivery.extend(self.temp_data[region][year][month][:2])
                    x_delivery.extend(region_feature)
                    x_delivery.extend(season_feature)
                    y_delivery = self.delivery[region][year][month]
                    data.append([x_delivery, y_delivery])
        
        if for_train:
            return data[:-72], data[-72:]
        else:
            return data
        
    def to_predict(self, history_length):
        data = []
        year = 2019
        for region in self.region_list:
            region_feature = self.region_onthot[region]
            for month in range(12):
                season_feature = self.season_onthot[month+1]
                x_delivery = []
                for k in range(1, history_length+1):
                    x_delivery.append(self.delivery[region][year-k][month])
                x_delivery.extend(self.rainfall_data[region][year][month][:2])
                x_delivery.extend(self.temp_data[region][year][month][:2])
                x_delivery.extend(region_feature)
                x_delivery.extend(season_feature)
                data.append(x_delivery)
        return data
    
    def targetDataLoader(self, filename):
        data = pd.read_csv(filename)
        ret = defaultdict(dict)
        for region in tqdm(self.region_startdate):
            y = self.region_startdate[region]
            for year in range(y, 2019):
                data4year = []
                for month in range(1,13):
                    date = str(year) + "-" + "{:0>2d}".format(month)
                    v = data[(data["Region"] == region) & 
                            (data["Start Date"].str.contains(date))]["Value"].values
                    try:
                        data4year.append(float(v))
                    except:
                        data4year.append(0.0)
                ret[region][year] = data4year
        return ret

    def yearlyDataLoader(self, filename):
        data = pd.read_csv(filename)
        ret = defaultdict(dict)
        for region in tqdm(self.region_startdate):
            y = self.region_startdate[region]
            for year in range(y, 2019):
                v = data[(data["Region"] == region) & 
                        (data["Start Date"].str.contains(str(year)))]["Value"].values
                try:
                    ret[region][year] = float(v)
                except:
                    ret[region][year] = 0
        return ret

    def rainfallDataLoader(self):
        '''if a month of data is missing, replace it with the data from \n 
        the same region, same month of the previous year'''
            
        dirname = "Challenge_Data/4. Spatio-Temporal & Weather Data/Rainfall/"
        ret = defaultdict(dict)
        for name in tqdm(os.listdir(dirname)):
            filename = dirname + name
            year = int((name.strip().split('.')[0]).split('_')[-1])
            if year == 2000 or year == 2020:
                continue
            
            data = pd.read_csv(filename)
            for region in self.region_startdate:
                ret[region][year] = []
                for month in range(1, 13):
                    date = str(year) + "-" + "{:0>2d}".format(month)
                    monthly_detail = np.array(data[(data["start_date"].str.contains(date)) &
                                                (data["region_name"] == region)]["value"])
                    try:
                        ret[region][year].append([np.mean(monthly_detail), np.std(monthly_detail),
                                            np.min(monthly_detail), np.max(monthly_detail)])
                    except:
                        print (region, date)
                        ret[region][year].append(ret[region][year-1][month-1])
        return ret

    def temperatureDataLoader(self):
        '''if a month of data is missing, replace it with the data from \n 
        the same region, same month of the previous year'''
        
        dirname = "Challenge_Data/4. Spatio-Temporal & Weather Data/Temperature/"
        ret = defaultdict(dict)
        for name in tqdm(os.listdir(dirname)):
            filename = dirname + name
            year = int((name.strip().split('.')[0]).split('_')[-1])
            if year == 2000 or year == 2020:
                continue
            
            data = pd.read_csv(filename)
            for region in self.region_startdate:
                ret[region][year] = []
                for month in range(1, 13):
                    date = str(year) + "-" + "{:0>2d}".format(month)
                    monthly_detail = np.array(data[(data["start_date"].str.contains(date)) &
                                                (data["region_name"] == region)]["value"])
                    monthly_detail = monthly_detail[~np.isnan(monthly_detail)]

                    try:
                        ret[region][year].append([np.mean(monthly_detail), np.std(monthly_detail),
                                            np.min(monthly_detail), np.max(monthly_detail)])
                    except:
                        print (region, date)
                        ret[region][year].append(ret[region][year-1][month-1])
        return ret

In [5]:
def score(pred, gt):
    num = 0
    relative = 0.0
    for (p,y) in zip(pred, gt):
        if y == 0:
            continue
        num += 1
        relative += (abs(p-y)/y)
    relative /= num
    return max(0, 1-relative)*100

In [41]:
dataObj = CanolaData(loadfile=True)
deliTrain, deliVal = dataObj.deliveryDataCombine(history_length=2, for_train=True)

In [42]:
X_test = np.array(dataObj.to_predict(history_length=2))

In [76]:
X_train = np.array([item[0] for item in deliTrain])
y_train = np.array([item[1] for item in deliTrain]).reshape((-1,1))
X_val = np.array([item[0] for item in deliVal])
y_val = np.array([item[1] for item in deliVal]).reshape((-1,1))


In [77]:
X_train.shape

(1080, 16)

In [78]:
import torch
import torch.nn as nn
from torch.autograd import Variable

class CanolaModel(nn.Module):
    def __init__(self, history_length):
        super(CanolaModel, self).__init__()
        self.fc1 = nn.Linear(12, 1)
        nn.init.xavier_normal_(self.fc1.weight)
    def forward(self, X):
        out = self.fc1(X)
        return out

In [79]:
naive = CanolaModel(history_length=2)
naive = naive.float()
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(naive.parameters(), lr=0.05)


In [80]:
num_epoch = 100
for epoch in tqdm(range(num_epoch)):
    ids = np.random.permutation(X_train.shape[0])
    epoch_loss = 0.0
    for sample_id in ids:
        x = Variable(torch.from_numpy(X_train[sample_id,:12]))
        y = Variable(torch.from_numpy(y_train[sample_id]))
        
        optimizer.zero_grad()
        pred = naive(x.float())
        loss = criterion(pred, y.float())
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
    print (epoch_loss/X_train.shape[0])

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

48950.07626922153
47539.92213928556
48736.28044561236
46894.76654193332
45947.11791893663
49629.3022090658
47214.765831112585
46745.70945133826
46920.8626319563
48271.41243671874
47337.82687774919
47691.08692140668
46916.20053695772
46968.43156859477
48581.726978348364
47283.14139357628
47488.2237428089
46758.276990996026
47862.18926283209
46151.26597486204
47819.80302438692
48344.650848761754
47716.34505469534
46342.57045190732
50939.11621737348
47473.42014433543
46949.19575658904
46555.73072788627
48104.752873673264
49025.46673854545
47199.51150505631
46962.83974291042
49058.99042669667
48404.35348025251
46871.374667774304
48794.8685802495
49025.08580074222
46608.828720798316
48056.739418603756
49148.09001562684
46729.320519996574
47862.86631515556
45897.12979082708
46907.54721166469
47886.87618267624
47364.27599418075
47503.27375943925
47894.1557184449
47139.32233885659
48397.49051264481
46754.46360560523
47314.32038828532
48027.23397646304
47068.24374417023
48393.928902732005
48463

In [81]:
naive.fc1.weight

Parameter containing:
tensor([[ 8.2749e-01,  2.8612e-01, -1.8275e+01,  4.3046e+01, -1.1449e+00,
          1.1786e+01,  2.1354e+01, -9.2915e+01, -6.3722e+01, -1.6358e+02,
          3.7508e+02, -1.2444e+02]], requires_grad=True)

In [82]:
y_pre = naive(Variable(torch.from_numpy(X_val[:,:12])).float())
score(y_pre.detach().numpy().flatten(), y_val.flatten())

57.08742318521516

In [59]:
y_pre = naive(Variable(torch.from_numpy(X_test[:,:12])).float())
#score(y_pre.detach().numpy().flatten(), y_val.flatten())

In [60]:
write2submission(y_pre.detach().numpy())

In [280]:
with open("model/3_4_4.pkl", 'wb') as f:
    pickle.dump(naive, f)

In [281]:
y_pre = naive(Variable(torch.from_numpy(X_train[-72:,:11])).float())
score(y_pre.detach().numpy().flatten(), y_train[-72:].flatten())

31.352216419170066

### delivery baseline for 2018

In [47]:
relative_error = 0.0
num = 0
for region in region_list:
    for month in range(12):
        if dataObj.delivery[region][2018][month] == 0:
            continue
        error = abs(dataObj.delivery[region][2018][month] - dataObj.delivery[region][2017][month])/dataObj.delivery[region][2018][month]  
        relative_error += error
        num += 1
relative_error /= num
score = max(0, 1-relative_error) * 100
print (score)

57.375134469278066
