In [1]:
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
from scipy.interpolate import UnivariateSpline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import math
from fbprophet import Prophet
import matplotlib.pyplot as plt

In [2]:
def getStates(path='data/test.csv', n_states=50):
    """Return the leading ``Province_State`` values of a CSV file.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to 'data/test.csv', the path that was
        previously hard-coded.
    n_states : int, optional
        Number of leading rows to keep. Defaults to 50, the previously
        hard-coded cut-off.

    Returns
    -------
    numpy.ndarray
        The ``Province_State`` values of the first ``n_states`` rows, in
        file order.

    Raises
    ------
    KeyError
        If the file has no ``Province_State`` column.
    """
    ds = pd.read_csv(path)
    states = ds['Province_State'][:n_states].values
    return states

In [3]:
def distance(x, y, side_check=False):
    """Euclidean distance from (x, y) to (m, m), where m = (x + y) / 2.

    (m, m) is the foot of the perpendicular from (x, y) onto the line y = x,
    so the result is the point's distance to that diagonal.

    When ``side_check`` is truthy, points with ``y > x`` return 0 without
    any further computation.
    """
    if side_check and y > x:
        return 0
    mid = (y + x) / 2
    dx = x - mid
    dy = y - mid
    return math.sqrt(dx**2 + dy**2)

In [4]:
def all_distance(data, side_check=False):
    """Return ``distance(i, data[i])`` for every position ``i`` of ``data``.

    The position ``i`` is used as the x coordinate and ``data[i]`` as the y
    coordinate; ``side_check`` is forwarded unchanged to ``distance``.
    The result is a plain list with one entry per element of ``data``.
    """
    return [
        distance(pos, data[pos], side_check=side_check)
        for pos in range(len(data))
    ]

In [5]:
def standardization(data):
    """Z-score ``data`` along axis 0: ``(data - mean) / std``.

    Parameters
    ----------
    data : array-like
        Values to standardise; mean and standard deviation are taken along
        axis 0 (per column for 2-D input).

    Returns
    -------
    Array of the same shape as ``data``. Entries whose standard deviation
    is exactly zero (constant input or constant column) come out as 0
    instead of the NaN/inf a division by zero would produce.
    """
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    # A constant series has sigma == 0; dividing by 1 there leaves the
    # (all-zero) centred values untouched rather than turning them into NaN.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return (data - mu) / safe_sigma

In [6]:
# ============= DEATH Training ============
# Settings read by the 'Deaths' training cell.

degree = 3    # polynomial order handed to savgol_filter
window = 25   # window length handed to savgol_filter
start = 213   # first index of the smoothed series used for fitting
alpha = 5.2   # Ridge regularisation strength

states = getStates()
# Map each state name to its position in `states`.
states2idx = {name: pos for pos, name in enumerate(states)}

In [7]:
feature = 'Deaths'
res = []  # one forecast array per state, in the order of `states`

# Parse the training file once; it was previously re-read for every state.
train = pd.read_csv('data/train_round2.csv')
horizon = 21  # number of future steps to predict

for state in states:
    raw = train[train['Province_State'] == state][feature].values
    value = savgol_filter(raw.reshape(-1), window, degree)  # smooth data
    # Series length; the earlier code hard-coded 225 here, which only fits
    # (x and y lengths agree) when every state has exactly 225 rows.
    n_obs = len(value)

    # Baseline: Ridge regression of the smoothed tail value[start:] on its
    # position index, extrapolated `horizon` steps past the last observation.
    y = value[start:].reshape(-1, 1)
    x = [[t] for t in range(start, n_obs)]
    model = Ridge(alpha=alpha)
    model.fit(x, y)
    x_test = [[n_obs + k] for k in range(horizon)]
    y_hat = model.predict(x_test)

    # Both quantities are inverse rates (steps per unit of growth): the
    # forecast's over the whole horizon, the smoothed series' over its
    # last step.
    # NOTE(review): a perfectly flat forecast or flat last step divides by
    # zero here (numpy yields inf with a warning) — confirm whether that
    # can occur in the data.
    slope = horizon / (y_hat[-1] - y_hat[0])
    smooth_slope = 1 / (value[-1] - value[-2])

    diff = abs(smooth_slope - slope)
    if diff > 0.5:
        # The two rates disagree: refit on a tail that reaches 28 points
        # further back.
        lookback = start - 28
        y = value[lookback:].reshape(-1, 1)
        x = [[t] for t in range(lookback, n_obs)]
        model = Ridge(alpha=alpha)
        model.fit(x, y)
        y_hat = model.predict(x_test)

        # Rescale the refitted forecast around its first point using a
        # 10% / 90% blend of the recent-step rate and the baseline rate.
        sub = 0.1 * smooth_slope + 0.9 * slope
        cur_slope = horizon / (y_hat[-1] - y_hat[0])
        y_hat = (y_hat - y_hat[0]) / cur_slope * sub + y_hat[0]

    res.append(y_hat)

In [8]:
# Flatten the per-state forecasts step-major: every state's value for
# step 0, then every state's value for step 1, and so on.
n_steps = len(res[0])
n_series = len(res)
rerange = [res[s][d] for d in range(n_steps) for s in range(n_series)]

df = pd.DataFrame(rerange)
df.to_csv("output_round2/{}.csv".format(feature), index=False, sep=',')
# Kept under its own name for the submission assembly further down.
Death_df = pd.DataFrame(rerange)

In [9]:
# ============= CONFIRM Training ============

In [10]:
# Settings read by the 'Confirmed' training cell.
degree = 3    # polynomial order handed to savgol_filter
window = 23   # window length handed to savgol_filter
start = 219   # first index of the smoothed series used for fitting
alpha = 3.2   # Ridge regularisation strength

states = getStates()
# Map each state name to its position in `states`.
states2idx = {name: pos for pos, name in enumerate(states)}

In [11]:
feature = 'Confirmed'
res = []  # one forecast array per state, in the order of `states`

# Parse every input file once; they were previously re-read for each state.
# The read of 'data/datatest_round2.csv' is dropped: its values were loaded
# but never used.
train = pd.read_csv('data/train_round2.csv')
move_in_all = pd.read_csv('data/move_in_data_round2.csv')
move_out_all = pd.read_csv('data/move_out_data_round2.csv')
horizon = 21  # number of future steps to predict

for state in states:
    raw = train[train['Province_State'] == state][feature].values
    value = savgol_filter(raw.reshape(-1), window, degree)  # smooth data
    # Series length; the earlier code hard-coded 225 here, which only fits
    # (x and y lengths agree) when every state has exactly 225 rows.
    n_obs = len(value)

    # Baseline: regularised linear fit of the smoothed tail value[start:]
    # on its position index, extrapolated `horizon` steps ahead.
    y = value[start:].reshape(-1, 1)
    x = [[t] for t in range(start, n_obs)]
    model = Pipeline([
        ("poly", PolynomialFeatures(degree=1)),
        # The step is named "lasso_reg" but the estimator is Ridge.
        ("lasso_reg", Ridge(alpha=alpha))
    ])
    model.fit(x, y)
    x_test = [[n_obs + k] for k in range(horizon)]
    y_hat = model.predict(x_test)

    # Mobility signal for this state.
    # NOTE(review): the row is picked by position, so this assumes both
    # mobility files list states in the same order as `states`, and that
    # the first two columns are non-numeric identifiers — confirm.
    idx = states2idx[state]
    move_in = move_in_all.iloc[[idx]].values[0][2:]
    move_out = move_out_all.iloc[[idx]].values[0][2:]
    move_diff = move_in - move_out
    std_move_diff = standardization(move_diff)

    # Compare two 7-entry windows of the standardised net flow; a shift of
    # more than one standard deviation triggers a refit.
    fst_mean = np.mean(std_move_diff[-28:-21])
    snd_mean = np.mean(std_move_diff[-14:-7])
    if abs(fst_mean - snd_mean) > 1:
        # Smooth a second time and fit plain Ridge on a tail that starts
        # 7 points earlier.
        value = savgol_filter(value.reshape(-1), window, degree)
        lookback = start - 7
        y = value[lookback:].reshape(-1, 1)
        x = [[t] for t in range(lookback, n_obs)]
        model = Ridge(alpha=alpha)
        model.fit(x, y)
        y_hat = model.predict(x_test)

    res.append(y_hat)

In [12]:
# Flatten the per-state forecasts step-major: every state's value for
# step 0, then every state's value for step 1, and so on.
n_steps = len(res[0])
n_series = len(res)
rerange = [res[s][d] for d in range(n_steps) for s in range(n_series)]

df = pd.DataFrame(rerange)
df.to_csv("output_round2/{}.csv".format(feature), index=False, sep=',')
# Kept under its own name for the submission assembly further down.
Confirm_df = pd.DataFrame(rerange)

In [13]:
# One ForecastID per forecast row. The count comes from the forecast table
# itself rather than a hard-coded 21*50, so it follows the actual number of
# states and forecast steps.
forecast_ids = pd.DataFrame({"ForecastID": np.arange(len(Confirm_df))})
submission_df = pd.concat([forecast_ids, Confirm_df, Death_df], axis=1)
submission_df.columns = ["ForecastID", "Confirmed", "Deaths"]

In [14]:
# Display the combined table (columns: ForecastID, Confirmed, Deaths).
submission_df

Unnamed: 0,ForecastID,Confirmed,Deaths
0,0,2.339492e+05,3498.276983
1,1,2.804430e+04,105.404668
2,2,3.006775e+05,6479.172783
3,3,1.459886e+05,2372.585536
4,4,1.121545e+06,18753.999430
...,...,...,...
1045,1045,2.565172e+05,4322.572763
1046,1046,1.736147e+05,2896.887567
1047,1047,5.876896e+04,876.712715
1048,1048,4.946568e+05,4427.386921


In [15]:
# Re-index the submission by ForecastID (the column becomes the index) and
# write every forecast row to disk.
formatted_df = submission_df.set_index('ForecastID')
print(formatted_df)
formatted_df.to_csv('round2_all.csv')

               Confirmed        Deaths
ForecastID                            
0           2.339492e+05   3498.276983
1           2.804430e+04    105.404668
2           3.006775e+05   6479.172783
3           1.459886e+05   2372.585536
4           1.121545e+06  18753.999430
...                  ...           ...
1045        2.565172e+05   4322.572763
1046        1.736147e+05   2896.887567
1047        5.876896e+04    876.712715
1048        4.946568e+05   4427.386921
1049        4.208872e+04    282.938675

[1050 rows x 2 columns]


In [16]:
round2_df = pd.read_csv('data/test_round2.csv')

# Rows from position 700 onward of the full submission supply the values;
# positional columns 1 and 2 are 'Confirmed' and 'Deaths'.
submission_tail = submission_df.iloc[700:]
last_confirm_ser = submission_tail.iloc[:, 1].values
last_death_ser = submission_tail.iloc[:, 2].values
round2_df['Confirmed'] = last_confirm_ser
round2_df['Deaths'] = last_death_ser

# Keep only the two forecast columns, indexed by ForecastID.
formatted_df = (
    round2_df
    .set_index('ForecastID')
    .drop(columns=['Province_State', 'Date'])
)
print(formatted_df)
formatted_df.to_csv('round2.csv')

               Confirmed        Deaths
ForecastID                            
0           2.616859e+05   3894.000783
1           3.476201e+04    150.397059
2           3.446915e+05   6786.960757
3           1.666222e+05   2666.533191
4           1.272321e+06  19543.340485
...                  ...           ...
345         2.565172e+05   4322.572763
346         1.736147e+05   2896.887567
347         5.876896e+04    876.712715
348         4.946568e+05   4427.386921
349         4.208872e+04    282.938675

[350 rows x 2 columns]
