In [None]:
# Imports
import numpy as np
import pandas as pd
import math
import importlib as imp
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.losses import Huber

In [None]:
# User module imports
from utils import district_daily_data as dd
# Re-import the module so edits made to it on disk are picked up without
# restarting the kernel (`imp` is the alias given to importlib above).
dd = imp.reload(dd)

In [None]:
# Flags
# Switches for the different experiments. In the cells visible here only
# `lstm_model` is read (it gates the LSTM training cell); the other three
# flags are not referenced in the cells shown.
linear_reg = False
sv_reg = True
episodes = True
lstm_model = True

In [None]:
# Directory variables
# Prefix that is string-concatenated with each CSV file name when reading or
# writing data, so the trailing slash is required.
data_dir = 'data/'

In [None]:
# Read state data
df_state = pd.read_csv(data_dir + 'state-date-total-data.csv')

# Per the original notes the CSV rows are in reverse order and hold daily
# counts: reverse the rows (axis 0) first, then take the running total down
# each column to obtain cumulative counts.
arr_state = df_state.to_numpy()[::-1].cumsum(axis=0)

# Persist the cumulative table as integer CSV next to the input file.
np.savetxt(
    data_dir + 'state-date-total-data-cumulative.csv',
    arr_state.astype(int),
    fmt='%i',
    delimiter=",",
)

In [None]:
# Read population density data
df_population_density = pd.read_csv(data_dir + 'district_wise_population_density.csv')
# Array form of the table; get_district_population_density() addresses it by
# position: column 1 is matched against the district name and column 7 is
# read as the density.
np_population_density = df_population_density.to_numpy()
data_found_count = 0 # no of districts for which we have population density data

def get_district_population_density(d):
    """Return the population density recorded for district ``d``.

    Every row of the module-level ``np_population_density`` table whose name
    (column 1) contains ``d`` as a case-insensitive substring counts as a
    match; the largest usable density (column 7) among the matches is
    returned.

    A row is ignored when its name cell is not a string (e.g. NaN for a
    blank CSV cell) or when its density is missing, NaN, not numeric or not
    positive. Previously a NaN density was returned as if it were valid and
    a NaN name raised ``AttributeError``.

    Side effect: increments the global ``data_found_count`` each time a real
    density is found (i.e. the fallback is not used).

    Args:
        d: District name. Case and surrounding whitespace are ignored. An
            empty/blank name never matches.

    Returns:
        The matched density as a float, or 368 (population density of
        India) when no usable match exists.
    """
    global data_found_count
    fallback_density = 368  # population density of INDIA

    needle = d.lower().strip()
    best_density = None
    if needle:  # "" is a substring of everything; treat it as "no match"
        for row in np_population_density:
            name = row[1]
            if not isinstance(name, str) or needle not in name.lower():
                continue
            try:
                density = float(row[7])
            except (TypeError, ValueError):
                continue  # density cell is missing or not numeric
            if not density > 0:
                continue  # rejects NaN as well as zero/negative values
            if best_density is None or density > best_density:
                best_density = density

    if best_density is None:
        # district not matched || area not found || population data missing
        return fallback_density

    data_found_count = data_found_count + 1
    return best_density

In [None]:
# Read district data
districts = dd.get_all_districts()
dist_series = []  # [(start_date, series, population_density), ...]
max_number = 0    # largest value found in any collected series

# Note: start_date might itself be a feature
for d in districts:
    # Only these four districts are used; everything else is skipped.
    if d not in ("Mumbai", "Thane", "Delhi", "Lucknow"):
        continue

    d_start_date = dd.get_infection_start(d)
    district_pop_density = get_district_population_density(d)
    district_time_series = dd.get_district_time_series(d, d_start_date)
    district_max = max(district_time_series)

    dist_series.append((d_start_date, district_time_series, district_pop_density))
    # Keep the running maximum over all collected series.
    max_number = max(max_number, district_max)

print("data_found_count:", data_found_count, " tot dists:", len(districts))

In [None]:
# Upper end of the scaled range: a value equal to max_number maps to this.
feature_range = 5

def fit_transform(series):
    """Scale ``series`` so that ``max_number`` maps to ``feature_range``.

    This is a plain divide-by-maximum scaling against the module-level
    ``max_number`` (no offset is subtracted, and nothing is fitted on
    ``series`` itself). The computation is broadcast over the whole input,
    so any array shape is accepted and preserved.

    Args:
        series: Array-like of numbers.

    Returns:
        A new ``numpy.ndarray`` with the same shape as ``series`` holding
        ``series / max_number * feature_range``.

    Raises:
        ValueError: If ``max_number`` is 0, which would otherwise yield
            NaN/inf values silently.
    """
    if max_number == 0:
        raise ValueError("max_number is 0; cannot scale the series")
    return np.asarray(series) / max_number * feature_range

# Revert the transform to get actual series
def inverse_transform(series):
    """Map scaled values back to the original scale, rounded to integers.

    Inverse of ``fit_transform``: multiplies by the module-level
    ``max_number``, divides by ``feature_range`` and rounds to the nearest
    integer with ``numpy.rint``. The computation is broadcast over the whole
    input, so any array shape is accepted and preserved.

    Args:
        series: Array-like of scaled numbers.

    Returns:
        A new floating-point ``numpy.ndarray`` with the same shape as
        ``series`` whose entries are integral values.
    """
    return np.rint(np.asarray(series) * max_number / feature_range)

In [None]:
# Get separate train and test sets with data points from each of the districts
def divide_series(dist_series, train_percent, look_ahead=1, episode_length=14):
    """Build sliding-window train/test samples from every district series.

    Each series is reshaped to a column, scaled with ``fit_transform`` and cut
    into overlapping windows of ``episode_length`` consecutive values. The
    target of a window is the value ``look_ahead`` steps after the window's
    last element. Within each district the first ``train_percent`` fraction
    of the samples (in series order) goes to the training set and all
    remaining samples go to the test set; the districts' samples are then
    concatenated.

    Args:
        dist_series: Iterable of tuples whose element at index 1 is the
            district's series (a 1-D sequence of numbers). Other tuple
            elements are not used here.
        train_percent: Fraction (0..1) of each district's samples used for
            training.
        look_ahead: How many steps past the end of the window the target
            lies (1 = the very next value).
        episode_length: Number of consecutive values in one input window.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` as numpy arrays. When
        non-empty, the x arrays have shape ``(samples, episode_length, 1)``
        and the y arrays ``(samples, 1)``.
    """
    x_train, y_train = [], []
    x_test, y_test = [], []

    for tup in dist_series:
        raw = np.array(tup[1])
        series = fit_transform(raw.reshape(raw.shape[0], 1))

        num_episodes = len(series) - episode_length + 1
        if num_episodes < 2:
            continue  # series too short to yield a window plus a target

        # Sample s uses window [s, s + episode_length) and the value
        # look_ahead steps after the window's last element as its target.
        starts = range(num_episodes - look_ahead)
        dist_x = [series[s:s + episode_length] for s in starts]
        dist_y = [series[s + episode_length + look_ahead - 1] for s in starts]

        train_length = int(train_percent * len(dist_x))
        x_train.extend(dist_x[:train_length])
        y_train.extend(dist_y[:train_length])
        # Everything after the training part is test data. (The earlier
        # `[train_length:-1]` slice silently dropped the last sample.)
        x_test.extend(dist_x[train_length:])
        y_test.extend(dist_y[train_length:])

    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    print(x_train.shape, x_test.shape)
    return (x_train, y_train), (x_test, y_test)

In [None]:
# Train and evaluate the LSTM. Everything assigned here (model, trainY,
# trainPredict, testY, testPredict, ...) exists only when `lstm_model` is
# True; the plotting cell below reads those names unconditionally.
if lstm_model:
    # Windowed samples pooled over every district in dist_series; each
    # target lies 6 steps after the end of its input window.
    train, test = divide_series(dist_series, 0.67, look_ahead=6)

    # Seeds NumPy's global RNG only.
    # NOTE(review): whether this also fixes Keras' weight initialisation and
    # shuffling depends on the backend — confirm if reproducibility matters.
    np.random.seed(7)
    # Window length passed to the LSTM's input_shape; has to agree with the
    # window length divide_series used to build train[0] / test[0].
    look_back = 14
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(train[0], (train[0].shape[0], train[0].shape[1], 1))
    testX = np.reshape(test[0], (test[0].shape[0], test[0].shape[1], 1))

    # create and fit the LSTM network: one LSTM layer with 7 units followed
    # by a single-output Dense layer.
    model = Sequential()
    # model.add(LSTM(2, input_shape=(look_back, 1), return_sequences=True))
    model.add(LSTM(7, input_shape=(look_back, 1)))
    model.add(Dense(1))
    # NOTE(review): targets are scaled by fit_transform (max_number maps to
    # feature_range = 5), so with delta=10 the Huber loss presumably stays
    # in its quadratic regime for most errors — confirm this is intended.
    model.compile(loss=Huber(delta=10), optimizer='adam')
    model.fit(trainX, train[1], epochs=100, batch_size=1, verbose=2)

    # make predictions (still in the scaled space)
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)

    # invert predictions and targets back to the original scale (rounded to
    # integers by inverse_transform)
    trainPredict = inverse_transform(trainPredict)
    trainY = inverse_transform(train[1])
    testPredict = inverse_transform(testPredict)
    testY = inverse_transform(test[1])

    # calculate root mean squared error on the original scale; column 0 is
    # selected from each array before comparison
    trainScore = math.sqrt(mean_squared_error(trainY[:,0], trainPredict[:,0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY[:,0], testPredict[:,0]))
    print('Test Score: %.2f RMSE' % (testScore))

    # Series of the first collected district as a column array. Not read by
    # any cell visible here; presumably kept for later cells.
    dataset = dist_series[0][1]
    a = np.array(dataset)
    dataset = a.reshape(a.shape[0], 1)

In [None]:
# Original author's scratch notes (kept verbatim; meaning not verifiable here):
# [0-13]-> 14, 43 = trainPredict, 44, 51
# 52= 30 8 (38)

print(trainY.shape, trainPredict.shape)
print(testY.shape, testPredict.shape)

# The x axis is a sample index, not a date: training samples come first and
# test samples follow directly after. divide_series pools the samples of all
# districts, so one curve may contain several districts back to back.
x1 = np.arange(1, trainY.shape[0]+1)
x2 = np.arange(trainY.shape[0]+1, trainY.shape[0]+1+testY.shape[0])

# Time Series
plt.plot(x1, trainY, label='train: actual')
plt.plot(x1, trainPredict, label='train: predicted')
plt.plot(x2, testY, label='test: actual')
plt.plot(x2, testPredict, label='test: predicted')
plt.xlabel('sample index')
plt.ylabel('cases (original scale)')
plt.title('LSTM predictions vs. actual values')
plt.legend()
plt.show()

# Cumulative
# np.append flattens the (samples, 1) arrays into 1-D before the running sum.
x = np.append(x1, x2)
y_true = np.append(trainY, testY)
y_pred = np.append(trainPredict, testPredict)
plt.plot(x, np.cumsum(y_true), label='actual (running sum)')
plt.plot(x, np.cumsum(y_pred), label='predicted (running sum)')
plt.xlabel('sample index')
plt.ylabel('running sum of cases')
plt.title('Running sum of actual vs. predicted values')
plt.legend()
plt.show()