In [None]:
# Imports
import numpy as np
import pandas as pd
import math
import importlib as imp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


In [None]:
# User module imports
from utils import district_daily_data as dd
# `imp` is the alias given to importlib in the imports cell; reloading here
# re-executes the helper module so edits to it are picked up without
# restarting the kernel.
dd = imp.reload(dd)

In [None]:
# Flags: each one switches one of the experiment cells further down on or off.
linear_reg = False  # linear regression on the day index, one fit per column of arr_state
sv_reg = True  # SVR on the day index; that cell only runs when `episodes` is False
episodes = True  # SVR on sliding-window episodes built from dist_series
lstm_model = True  # LSTM on sliding-window episodes built from dist_series

In [None]:
# Directory variables
# Prefix that is string-concatenated with each CSV file name below, so it must
# keep its trailing slash.
data_dir = 'data/'

In [None]:
# Read state data
df_state = pd.read_csv(data_dir + 'state-date-total-data.csv')

# Per the original notes the file holds daily counts with the rows in reverse
# order: flip the rows first, then accumulate down the rows so that every row
# becomes a running (cumulative) total.
arr_state = np.cumsum(np.flipud(df_state.to_numpy()), axis=0)

# Persist the cumulative table as integer CSV next to the raw file.
np.savetxt(
    data_dir + 'state-date-total-data-cumulative.csv',
    arr_state.astype(int),
    fmt='%i',
    delimiter=",",
)

In [None]:
# Read population density data
df_population_density = pd.read_csv(data_dir + 'district_wise_population_density.csv')
# Plain ndarray copy of the table; get_district_population_density reads
# column 1 as the district name and column 7 as the density.
np_population_density = df_population_density.to_numpy() 
# Incremented by get_district_population_density on every successful lookup.
data_found_count = 0 # no of districts for which we have population density data

def get_district_population_density(d):
    """Return the population density for district `d`, with a national fallback.

    Every row of the module-level `np_population_density` table whose name
    (column 1) contains `d` as a case-insensitive substring is a candidate, and
    the largest density (column 7) among the candidates wins.  Rows with a
    missing/non-string name, or with a density that is non-numeric or NaN, are
    skipped instead of crashing the lookup or leaking NaN into the features.

    Parameters
    ----------
    d : str
        District name; surrounding whitespace and letter case are ignored.

    Returns
    -------
    float or int
        The matched density when it is positive, otherwise 368 (the fallback
        the notebook uses for the population density of India).

    Side effects
    ------------
    Increments the global `data_found_count` when a positive density is found.
    """
    global data_found_count
    needle = d.lower().strip()
    dist_pop_density = -2  # sentinel: no usable row seen yet
    for row in np_population_density:
        name = row[1]
        # pandas turns blank cells into float NaN, which has no .lower()
        if not isinstance(name, str) or needle not in name.lower():
            continue
        try:
            density = float(row[7])
        except (TypeError, ValueError):
            continue  # density cell is not numeric
        if math.isnan(density):
            continue  # NaN would survive max() and pass the `<= 0` check below
        dist_pop_density = max(density, dist_pop_density)
    if dist_pop_density <= 0:  # district not matched || area not found || population data missing
        dist_pop_density = 368  # population density of INDIA
    else:
        data_found_count = data_found_count + 1
    return dist_pop_density



In [None]:
# Read district data
districts = dd.get_all_districts()
dist_series = []  # [(start_date, series, population_density), ...]

# Note: start_date might itself be a feature
for d in districts:
    # Only Mumbai is collected; every other district is skipped.
    if d != "Mumbai":
        continue
    d_start_date = dd.get_infection_start(d)
    district_pop_density = get_district_population_density(d)
    d_series = dd.get_district_time_series(d, d_start_date)
    dist_series.append((d_start_date, d_series, district_pop_density))

print("data_found_count:", data_found_count,  " tot dists:", len(districts))

In [None]:
if linear_reg:
    # Linear regression on the day index: one straight-line fit per column of
    # the cumulative state table, extrapolated a few days past the data.
    # The number of days is taken from the data instead of being hard-coded,
    # so the cell keeps working when the CSV gains or loses rows.
    n_days = arr_state.shape[0]
    forecast_days = 3  # how far past the last observed day the fit is drawn
    X = np.arange(1, n_days + 1).reshape(-1, 1)
    x_test = np.arange(1, n_days + forecast_days + 1).reshape(-1, 1)
    print (arr_state.shape)

    for i in range(arr_state.shape[1]):
        y = arr_state[:, i].reshape(-1, 1)
        reg = LinearRegression().fit(X, y)
        y_test = reg.predict(x_test)
        plt.scatter(X, y, color='black')  # observed values
        plt.plot(x_test, y_test)          # fitted line incl. extrapolation
        plt.show()

In [None]:
if sv_reg and not episodes:
    # Support vector regression on the day index: one fit per column of the
    # cumulative state table, evaluated a few days past the data.
    # The number of days is taken from the data instead of being hard-coded,
    # so the cell keeps working when the CSV gains or loses rows.
    n_days = arr_state.shape[0]
    forecast_days = 3  # how far past the last observed day the fit is drawn
    X = np.arange(1, n_days + 1).reshape(-1, 1)
    x_test = np.arange(1, n_days + forecast_days + 1).reshape(-1, 1)

    for i in range(arr_state.shape[1]):
        y = arr_state[:, i]
        clf = SVR(C=100.0, gamma=100)
        clf.fit(X, y)
        y_test = clf.predict(x_test)

        plt.scatter(X, y, color='black')  # observed values
        plt.plot(x_test, y_test)          # SVR prediction incl. extrapolation
        plt.show()

In [None]:
if episodes:
    # Build sliding-window "episodes" from every district series and fit a
    # Support Vector Regression on them.
    #   features: [population density, v[t], ..., v[t + episode_length - 1]]
    #   target:   v[t + episode_length]   (the value right after the window)
    x = []
    y = []
    episode_length = 14
    for tup in dist_series:
        series = tup[1]
        dist_pop_density = tup[2]
        print (len(series))
        num_episodes = len(series) - episode_length + 1
        if num_episodes < 2: continue  # too short for even one (window, target) pair
        print (num_episodes)
        for _in in range(num_episodes-1):
            # list(...) keeps `+` a concatenation even if the series is an ndarray
            x.append([dist_pop_density] + list(series[_in:_in+episode_length]))
            y.append(series[_in+episode_length])
    print (len(x))
    x = np.array(x)
    y = np.array(y)

    # Split in order: first 80% of the episodes for training, the rest held out.
    train_length = int(0.8*len(x))
    if train_length == 0:
        raise ValueError("not enough episodes to train on; dist_series is empty or too short")
    x_train = x[:train_length]
    y_train = y[:train_length]
    x_test = x[train_length:]
    y_true = y[train_length:]

    # Fit on the training part only, so the held-out episodes stay unseen.
    clf = SVR(C=100.0, gamma='scale')
    clf.fit(x_train, y_train)

    # Hold-out error, same RMSE measure as the LSTM cell reports.
    if len(x_test) > 0:
        holdout_rmse = math.sqrt(mean_squared_error(y_true, clf.predict(x_test)))
        print('Test Score: %.2f RMSE' % (holdout_rmse))

    # Predictions for every episode: in-sample to the left of the dashed line,
    # out-of-sample to the right of it.
    y_test = clf.predict(x)
    X = np.arange(len(y)).reshape(-1, 1)

    plt.plot(X, y, color='black')     # actual targets
    plt.plot(X, y_test, color='red')  # SVR predictions
    plt.axvline(x=train_length - 0.5, color='gray', linestyle='--')  # train / test boundary

In [None]:
# Quick look at the shape of the training inputs built by the cell above.
# NOTE(review): `x_train` is only assigned inside the flag-guarded experiment
# cells, so on a fresh kernel this cell fails unless `episodes` is True.
# print (x_train[0])
# type(np.array(series))
x_train.shape

In [None]:
if lstm_model:
    # Build sliding-window episodes from every district series and fit a small
    # LSTM that predicts the value `horizon` steps after the step that follows
    # each window.
    x = []
    y = []
    sample_scalers = []  # scaler that produced each episode, needed to undo the scaling later
    episode_length = 14
    horizon = 6  # target index = window end + horizon
    for tup in dist_series:
        series = tup[1]
        a = np.array(series)
        series = a.reshape(a.shape[0], 1)

        # Every district is scaled to [0, 1] on its own.
        scaler = MinMaxScaler(feature_range=(0, 1))
        series = scaler.fit_transform(series)

        print (len(series))
        num_episodes = len(series) - episode_length + 1
        if num_episodes < 2: continue
        print (num_episodes)
        for _in in range(num_episodes-1-horizon):
            x.append(series[_in:_in+episode_length])
            y.append(series[_in+episode_length+horizon])
            sample_scalers.append(scaler)
    print (len(x))
    x = np.array(x)
    y = np.array(y)

    # Split in order: first 67% of the episodes for training, the rest held out.
    train_length = int(0.67*len(x))
    if train_length == 0:
        raise ValueError("not enough episodes to train on; dist_series is empty or too short")
    x_train = x[:train_length]
    y_train = y[:train_length]
    x_test = x[train_length:]
    y_true = y[train_length:]
    train_scalers = sample_scalers[:train_length]
    test_scalers = sample_scalers[train_length:]

    # reshape input to be [samples, time steps, features]:
    # each window is presented as one time step with `look_back` features
    # NOTE(review): this seeds NumPy only; the Keras backend may need its own
    # seed for reproducible weights - confirm.
    np.random.seed(7)
    look_back = episode_length
    trainX = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1]))
    testX = np.reshape(x_test, (x_test.shape[0], 1, x_test.shape[1]))

    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(4, input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, y_train, epochs=100, batch_size=1, verbose=2)

    # make predictions
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)

    def _unscale(values, scalers):
        """Undo the scaling row by row with the scaler that scaled that row."""
        return np.vstack([
            s.inverse_transform(np.asarray(row).reshape(1, -1))
            for s, row in zip(scalers, values)
        ])

    # invert predictions: each sample goes back through its own district's
    # scaler rather than whichever scaler the loop happened to create last
    trainPredict = _unscale(trainPredict, train_scalers)
    trainY = _unscale(y_train, train_scalers)
    testPredict = _unscale(testPredict, test_scalers)
    testY = _unscale(y_true, test_scalers)
    # calculate root mean squared error
    trainScore = math.sqrt(mean_squared_error(trainY[:,0], trainPredict[:,0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(mean_squared_error(testY[:,0], testPredict[:,0]))
    print('Test Score: %.2f RMSE' % (testScore))

    # Unscaled series of the first district as a column vector, kept for
    # ad-hoc plotting in later cells.
    dataset = dist_series[0][1]
    a = np.array(dataset)
    dataset = a.reshape(a.shape[0], 1)


In [None]:
# Plot the held-out LSTM targets against the model's predictions.
# Guarded by the same flag as the training cell, because every array used here
# is created there.
if lstm_model:
    print(trainY.shape, trainPredict.shape)
    print(testY.shape, testPredict.shape)

    # Sample positions: training samples occupy 1..n_train, test samples follow.
    n_train = trainY.shape[0]
    n_test = testY.shape[0]
    x1 = np.arange(1, n_train+1)
    x2 = np.arange(n_train+1, n_train+1+n_test)

    # To inspect the fit on the training part as well, plot against x1:
    # plt.plot(x1, trainY)
    # plt.plot(x1, trainPredict)
    plt.plot(x2, testY, label='actual (testY)')
    plt.plot(x2, testPredict, label='predicted (testPredict)')
    plt.xlabel('sample index')
    plt.legend()

    plt.show()

In [None]:
# print(trainY[:,0])
# print(trainPredict[:, 0])
# dataset.shape
# trainX.shape
# testX.shape
# plt.plot(dataset)
# plt.plot(trainPredictPlot)
# plt.plot(testPredictPlot)
# plt.show()
# trainPredictPlot