In [18]:
import numpy as np
import json
from tqdm import tqdm
import random
from os.path import exists

import torch
import torch.nn as nn
import torch.optim as optim

from datetime import datetime, timedelta
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [19]:
# Forecasting configuration shared by the cells below.
test_size = 7  # trailing time steps of every test city that are held out for evaluation
seq_len = 7    # number of consecutive time steps fed to the GRU for one prediction

# Training shuffles samples, initialises weights randomly and (for the deep
# models) uses dropout; seed every RNG involved so that Restart & Run All
# reproduces the reported numbers.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [20]:
# Initial feature width; feature_engineering() overwrites this module-level
# value with the width of its PCA output.
input_size = 300

# Per data file: (train_threshold, test_threshold). A city with more records
# than train_threshold is used for training; one with more records than
# test_threshold is additionally used for testing.
train_test_threshold = {
	'sample.json': (30, 60),
	'classified_data_0.json': (100, 425),
	'classified_data_1.json': (100, 440),
	'classified_data_2.json': (60, 150),
}


In [21]:
def _daily_series(city, n_features):
	"""Aggregate one city's records into a per-day matrix.

	Column 0 of the result holds the number of records on each calendar day
	between the earliest and the latest record (inclusive). The remaining
	``n_features`` columns hold the mean of that day's reduced vectors, or
	zeros when the day has no record carrying a vector.
	"""
	date_format = '%Y:%m:%d'
	dates = [datetime.strptime(p['time'], date_format) for p in city]
	first_day = min(dates)
	period = (max(dates) - first_day).days + 1

	counts = np.zeros(period)
	vectors = np.zeros((period, n_features))
	vectors_per_day = {}
	for p, date in zip(city, dates):
		# Index days by their offset from the first day instead of walking a
		# date cursor, so record order and zero padding no longer matter.
		day_idx = (date - first_day).days
		counts[day_idx] += 1
		# Only records whose vector went through PCA are ndarrays at this point.
		if isinstance(p['patient_vector'], np.ndarray):
			vectors_per_day.setdefault(day_idx, []).append(p['patient_vector'])
	for day_idx, day_vectors in vectors_per_day.items():
		vectors[day_idx] = np.mean(np.array(day_vectors), axis=0)
	return np.hstack((counts.reshape(-1, 1), vectors))


def feature_engineering(filename: str, n_components: int = 8, random_state=None):
	"""Load patient records, reduce their vectors with PCA and build daily series.

	The JSON file is read as a mapping from a city key to a list of record
	dicts; each record is accessed through its 'true_city', 'time'
	('%Y:%m:%d') and 'patient_vector' entries. The '上海市' entry is dropped.
	Cities are selected by comparing their record count against
	``train_test_threshold[filename]``.

	Side effect: the module-level ``input_size`` is set to the number of PCA
	output columns.

	Args:
		filename: path of the JSON file; also the key into ``train_test_threshold``.
		n_components: number of PCA components to keep (default 8).
		random_state: forwarded to ``PCA``; ``None`` keeps scikit-learn's default.

	Returns:
		(datas, train_city, test_city) where ``datas`` maps each training city
		to an array of shape (days, 1 + n_components) with the daily record
		count in column 0, and ``train_city`` / ``test_city`` are sets of city
		names (every test city is also a training city).

	Raises:
		KeyError: if ``filename`` has no entry in ``train_test_threshold``.
		ValueError: if no selected record carries a list-valued vector.
	"""
	global input_size
	# Binary mode lets json.load detect the encoding itself instead of relying
	# on the platform default, which matters for the non-ASCII city names.
	with open(filename, 'rb') as f:
		data = json.load(f)
	# Tolerate files that do not contain this city at all.
	data.pop('上海市', None)

	train_threshold, test_threshold = train_test_threshold[filename]
	train_city = set()
	test_city = set()
	for city in data.values():
		if len(city) > test_threshold:
			test_city.add(city[0]['true_city'])
			train_city.add(city[0]['true_city'])
		elif len(city) > train_threshold:
			train_city.add(city[0]['true_city'])

	# Gather the raw vectors in exactly the order used below to write the
	# reduced vectors back. The width comes from the data, not from the global
	# input_size, so the function can be called again after it changed it.
	raw_vectors = [
		p['patient_vector']
		for city_string in train_city
		for p in data[city_string]
		if isinstance(p['patient_vector'], list)
	]
	if not raw_vectors:
		raise ValueError(f'no usable patient vectors found in {filename!r}')

	pca = PCA(n_components=n_components, random_state=random_state)
	pca_result = pca.fit_transform(np.asarray(raw_vectors, dtype=float))

	reduced_rows = iter(pca_result)
	for city_string in train_city:
		for p in data[city_string]:
			if isinstance(p['patient_vector'], list):
				p['patient_vector'] = next(reduced_rows)

	input_size = pca_result.shape[1]
	datas = {
		city_string: _daily_series(data[city_string], input_size)
		for city_string in train_city
	}

	return datas, train_city, test_city

# Build the per-city daily series and the train/test city sets from the chosen data file.
datas, train_city, test_city = feature_engineering(filename='classified_data_0.json')

In [22]:
def remove_toomany_0(rdata:np.ndarray, seq_len:int):
	"""Drop rows that lie inside long runs of zero counts.

	A window is the ``seq_len`` consecutive rows starting at row ``i``; it is
	"active" when any value in column 0 of those rows is non-zero. Row 0 and
	the last ``seq_len`` rows are always kept. Any other row ``i`` is kept
	only if the window starting at ``i - 1`` or the one starting at ``i`` is
	active.

	Args:
		rdata: 2-D array whose column 0 is tested for zeros.
		seq_len: window length.

	Returns:
		A new float array with the retained rows in their original order.
		Input with at most ``seq_len`` rows (or a window length below 1) has
		nothing to compress and is returned as an unchanged copy.
	"""
	n_rows = len(rdata)
	n_windows = n_rows - seq_len
	if seq_len < 1 or n_windows <= 0:
		# No complete window plus target fits; copying avoids the duplicated
		# first row / shape mismatch the row-by-row fill would produce here.
		return np.array(rdata, dtype=float)

	# running[k] = number of non-zero counts among the first k rows, so the
	# difference over a window tells whether that window contains any case.
	nonzero = np.asarray(rdata[:, 0] != 0, dtype=int)
	running = np.concatenate(([0], np.cumsum(nonzero)))
	active = (running[seq_len:seq_len + n_windows] - running[:n_windows]) > 0

	keep = np.ones(n_rows, dtype=bool)  # row 0 and the tail stay True
	keep[1:n_windows] = active[:-1] | active[1:]
	return np.array(rdata[keep], dtype=float)

# Compress the long zero runs of every city series and store it as a float
# tensor of shape (rows, 1, input_size + 1), i.e. with a batch axis of size 1.
for city in datas:
	series = datas[city]
	if torch.is_tensor(series):
		# Already converted by an earlier run of this cell; skipping keeps the
		# cell re-runnable instead of failing on the tensor input.
		continue
	compact = remove_toomany_0(series, seq_len)
	datas[city] = torch.tensor(compact.reshape(-1, 1, input_size + 1)).float()

In [23]:
class Model_concat_gru(nn.Module):
    """Single-layer GRU over the concatenated features with a scalar head.

    Args:
        input_size: number of feature columns excluding the count column; the
            GRU is built for ``input_size + 1`` inputs.
        hidden_size: width of the GRU hidden state.
    """

    def __init__(self,input_size,hidden_size):
        super().__init__()
        # concat: the count column and the vector columns share one GRU
        self.gru = nn.GRU(input_size=input_size+1, hidden_size=hidden_size)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Predict one value per sequence.

        ``x`` is passed to ``nn.GRU`` in its default layout
        (seq_len, batch, input_size + 1). Returns a 0-d tensor for a batch of
        one, otherwise a tensor of shape (batch,).
        """
        output, _ = self.gru(x)
        # Take the last time step directly. A blanket squeeze() before the
        # indexing would also remove the time or hidden axis whenever it has
        # length 1 and make output[-1] select the wrong thing.
        return self.linear(output[-1]).squeeze()

In [24]:
class Model_concat_deep_gru(nn.Module):
    """Two-layer GRU (dropout 0.2) over the concatenated features with a scalar head.

    Args:
        input_size: number of feature columns excluding the count column; the
            GRU is built for ``input_size + 1`` inputs.
        hidden_size: width of the GRU hidden state.
    """

    def __init__(self,input_size,hidden_size):
        super().__init__()
        # concat: the count column and the vector columns share one GRU
        self.gru = nn.GRU(input_size=input_size+1, hidden_size=hidden_size,
                          num_layers=2, dropout=0.2)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Predict one value per sequence.

        ``x`` is passed to ``nn.GRU`` in its default layout
        (seq_len, batch, input_size + 1). Returns a 0-d tensor for a batch of
        one, otherwise a tensor of shape (batch,).
        """
        output, _ = self.gru(x)
        # Take the last time step directly. A blanket squeeze() before the
        # indexing would also remove the time or hidden axis whenever it has
        # length 1 and make output[-1] select the wrong thing.
        return self.linear(output[-1]).squeeze()
        

In [25]:
class Model_add_gru(nn.Module):
    """Two single-layer GRU branches whose scalar outputs are added.

    ``gru1``/``linear1`` process every feature column except column 0;
    ``gru2``/``linear2`` process column 0 alone.
    """

    def __init__(self,input_size,hidden_size_1,hidden_size_2):
        super().__init__()
        # branch for the vector (word embedding) columns
        self.gru1 = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size_1,
        )
        # branch for the single number column
        self.gru2 = nn.GRU(
            input_size=1,
            hidden_size=hidden_size_2,
        )
        self.linear1 = nn.Linear(hidden_size_1, 1)
        self.linear2 = nn.Linear(hidden_size_2, 1)

    def forward(self, x):
        """Return the sum of both branch predictions for the last time step."""
        vector_columns = x[:, :, 1:]
        number_column = x[:, :, :1]

        vector_states, _ = self.gru1(vector_columns)
        number_states, _ = self.gru2(number_column)

        vector_score = self.linear1(vector_states[-1]).squeeze()
        number_score = self.linear2(number_states[-1]).squeeze()
        return vector_score + number_score


In [26]:
class Model_add_deep_gru(nn.Module):
    """Two 2-layer GRU branches (dropout 0.2) whose scalar outputs are added.

    ``gru1``/``linear1`` process every feature column except column 0;
    ``gru2``/``linear2`` process column 0 alone.
    """

    def __init__(self,input_size,hidden_size_1,hidden_size_2):
        super().__init__()
        # branch for the vector (word embedding) columns
        self.gru1 = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size_1,
            num_layers=2,
            dropout=0.2,
        )
        # branch for the single number column
        self.gru2 = nn.GRU(
            input_size=1,
            hidden_size=hidden_size_2,
            num_layers=2,
            dropout=0.2,
        )
        self.linear1 = nn.Linear(hidden_size_1, 1)
        self.linear2 = nn.Linear(hidden_size_2, 1)

    def forward(self, x):
        """Return the sum of both branch predictions for the last time step."""
        vector_columns = x[:, :, 1:]
        number_column = x[:, :, :1]

        vector_states, _ = self.gru1(vector_columns)
        number_states, _ = self.gru2(number_column)

        vector_score = self.linear1(vector_states[-1]).squeeze()
        number_score = self.linear2(number_states[-1]).squeeze()
        return vector_score + number_score
        

In [27]:
def data_iter(data,seq_len,test_size,in_test):
	"""Yield (window, target) pairs from one series in random order.

	Each window is ``seq_len`` consecutive rows of ``data``; its target is
	element ``[0][0]`` of the row right after the window. When ``in_test`` is
	true, the last ``test_size`` possible windows are left out.
	"""
	held_out = test_size if in_test else 0
	starts = np.arange(len(data) - seq_len - held_out)
	random.shuffle(starts)
	for start in starts:
		stop = start + seq_len
		yield data[start:stop], data[stop][0][0]
		

In [28]:
def train(model, datas, test_size, epoches=100, lr=2e-3):
    """Fit ``model`` on every city series with Adam and an MSE loss.

    One optimisation step is taken per (window, target) pair produced by
    ``data_iter``. Cities listed in the module-level ``test_city`` have their
    last ``test_size`` windows left out. The window length is the
    module-level ``seq_len``, the same value the evaluation uses. The summed
    loss of the epoch is printed every 10th epoch.

    Args:
        model: network called as ``model(window)`` and returning the prediction.
        datas: mapping from city name to its series tensor.
        test_size: number of trailing windows withheld for test cities.
        epoches: number of passes over all cities (default 100).
        lr: Adam learning rate (default 2e-3).
    """
    # Make sure dropout layers are active even if the model was left in
    # evaluation mode by an earlier call.
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in tqdm(range(epoches)):  # loop over the dataset multiple times
        running_loss = 0.0
        city_keys = list(datas.keys())
        random.shuffle(city_keys)
        for city in city_keys:
            for inputs, labels in data_iter(datas[city], seq_len, test_size, city in test_city):
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward + backward + optimize
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

        if epoch%10 == 9:
            print(f'{epoch+1}: {running_loss:.2f}')

    print('Finished Training')


In [29]:
def test(model, datas, test_size):
	"""Evaluate ``model`` on the last ``test_size`` steps of every test city.

	For each of the last ``test_size`` rows of a city in the module-level
	``test_city``, the ``seq_len`` rows before it are fed to the model. The
	output is clipped at zero and rounded, then compared with the integer
	value at ``[0][0]`` of the target row. One line per city is printed (name,
	clipped prediction, rounded prediction, true value), followed by the
	squared-error sum of the step, and finally the total over all steps.

	The model is switched to evaluation mode for the duration of the call, so
	dropout does not perturb the predictions, and gradients are not tracked.
	The previous training/evaluation mode is restored afterwards.
	"""
	was_training = model.training
	model.eval()
	total_loss = 0.0
	try:
		with torch.no_grad():
			for i in range(test_size, 0, -1):
				daily_loss = 0.0
				for city_string in test_city:
					inputs = datas[city_string][-i-seq_len:-i]
					outputs = model(inputs)
					y = float(outputs.relu())
					y_pred = [round(y)]
					y_true = [int(datas[city_string][-i][0][0])]
					print(city_string, f'{y:.3f}', y_pred[0], y_true[0])
					daily_loss += mean_squared_error(y_true, y_pred)
				print(daily_loss)
				total_loss += daily_loss
	finally:
		# Leave the model in whatever mode the caller had it in.
		model.train(was_training)
	print(total_loss)


In [30]:
# based on 'sample.json'
# hidden_size = (4, 8, 12)
# test_size = 2
# for hz in hidden_size:
# 	model = Model_concat_gru(input_size, hz)
# 	train(model, datas, test_size)
# 	test(model, datas, test_size)
# for hz in hidden_size:
# 	model = Model_concat_deep_gru(input_size, hz)
# 	train(model, datas, test_size)
# 	test(model, datas, test_size)
# hidden_size = ((2,4), (2,8), (4,4), (4,8))
# for hz1,hz2 in hidden_size:
# 	model = Model_add_gru(input_size, hz1, hz2)
# 	train(model, datas, test_size)
# 	test(model, datas, test_size)
# for hz1,hz2 in hidden_size:
# 	model = Model_add_deep_gru(input_size, hz1, hz2)
# 	train(model, datas, test_size)
# 	test(model, datas, test_size)


In [33]:
# Experiment 1: single-layer GRU on the concatenated features, hidden size 8.
test_size = 7
model1 = Model_concat_gru(input_size, hidden_size=8)
train(model1, datas, test_size)
test(model1, datas, test_size)


石家庄市 0.329 0 0
北京市 21.487 21 38
西安市 0.514 1 0
呼和浩特市 1.476 1 6
郑州市 1.380 1 18
604.0
石家庄市 0.285 0 0
北京市 33.268 33 10
西安市 0.815 1 10
呼和浩特市 3.253 3 5
郑州市 20.633 21 44
1143.0
石家庄市 0.725 1 0
北京市 15.607 16 34
西安市 7.926 8 0
呼和浩特市 5.432 5 1
郑州市 26.085 26 56
1305.0
石家庄市 0.260 0 0
北京市 42.604 43 14
西安市 12.358 12 6
呼和浩特市 0.654 1 4
郑州市 27.503 28 36
950.0
石家庄市 0.074 0 0
北京市 21.431 21 14
西安市 5.610 6 1
呼和浩特市 3.394 3 1
郑州市 20.796 21 47
754.0
石家庄市 0.373 0 0
北京市 10.929 11 24
西安市 1.186 1 11
呼和浩特市 0.242 0 1
郑州市 32.067 32 38
306.0
石家庄市 2.151 2 1
北京市 19.646 20 14
西安市 9.772 10 3
呼和浩特市 2.357 2 1
郑州市 36.429 36 17
448.0
5510.0


In [32]:
# Experiment 2: two-layer GRU on the concatenated features, hidden size 4.
model2 = Model_concat_deep_gru(input_size, hidden_size=4)
train(model2, datas, test_size)
test(model2, datas, test_size)


 10%|█         | 10/100 [01:06<10:12,  6.80s/it]

10: 89053.32


 20%|██        | 20/100 [02:11<09:16,  6.95s/it]

20: 86591.38


 30%|███       | 30/100 [03:14<07:12,  6.18s/it]

30: 79440.64


 40%|████      | 40/100 [04:19<06:39,  6.66s/it]

40: 80217.01


 50%|█████     | 50/100 [05:24<05:23,  6.46s/it]

50: 75721.38


 60%|██████    | 60/100 [06:32<04:38,  6.96s/it]

60: 76709.68


 70%|███████   | 70/100 [07:40<03:18,  6.61s/it]

70: 72639.87


 80%|████████  | 80/100 [08:52<02:28,  7.43s/it]

80: 75189.96


 90%|█████████ | 90/100 [10:04<01:08,  6.86s/it]

90: 73308.64


100%|██████████| 100/100 [11:15<00:00,  6.75s/it]

100: 75347.45
Finished Training
石家庄市 2.813 3 0
北京市 15.364 15 38
西安市 1.297 1 0
呼和浩特市 2.622 3 6
郑州市 3.128 3 18
773.0
石家庄市 1.161 1 0
北京市 32.921 33 10
西安市 0.613 1 10
呼和浩特市 3.681 4 5
郑州市 17.944 18 44
1288.0
石家庄市 1.005 1 0
北京市 18.859 19 34
西安市 4.849 5 0
呼和浩特市 4.064 4 1
郑州市 30.049 30 56
936.0
石家庄市 0.558 1 0
北京市 29.044 29 14
西安市 7.353 7 6
呼和浩特市 1.905 2 4
郑州市 30.575 31 36
256.0
石家庄市 0.653 1 0
北京市 13.081 13 14
西安市 7.853 8 1
呼和浩特市 7.633 8 1
郑州市 18.632 19 47
884.0
石家庄市 0.605 1 0
北京市 15.188 15 24
西安市 0.909 1 11
呼和浩特市 1.767 2 1
郑州市 31.282 31 38
232.0
石家庄市 1.080 1 1
北京市 13.696 14 14
西安市 6.608 7 3
呼和浩特市 2.140 2 1
郑州市 32.560 33 17
273.0
4642.0





In [34]:
# Experiment 3: separate GRU branches for the vector columns (hidden size 2)
# and the number column (hidden size 4), predictions added.
model3 = Model_add_gru(input_size, hidden_size_1=2, hidden_size_2=4)
train(model3, datas, test_size)
test(model3, datas, test_size)


 10%|█         | 10/100 [01:00<09:15,  6.17s/it]

10: 83326.76


 20%|██        | 20/100 [02:07<09:03,  6.79s/it]

20: 77912.75


 30%|███       | 30/100 [03:06<06:27,  5.54s/it]

30: 75227.79


 40%|████      | 40/100 [04:01<05:47,  5.79s/it]

40: 75150.91


 50%|█████     | 50/100 [05:05<05:31,  6.62s/it]

50: 70768.58


 60%|██████    | 60/100 [06:13<04:18,  6.45s/it]

60: 70076.97


 70%|███████   | 70/100 [07:17<03:08,  6.29s/it]

70: 68789.20


 80%|████████  | 80/100 [08:20<02:03,  6.16s/it]

80: 67849.62


 90%|█████████ | 90/100 [09:22<01:02,  6.20s/it]

90: 66481.19


100%|██████████| 100/100 [10:30<00:00,  6.31s/it]

100: 65993.05
Finished Training
石家庄市 1.267 1 0
北京市 17.828 18 38
西安市 0.507 1 0
呼和浩特市 5.336 5 6
郑州市 4.611 5 18
572.0
石家庄市 1.011 1 0
北京市 25.435 25 10
西安市 0.668 1 10
呼和浩特市 6.078 6 5
郑州市 16.975 17 44
1037.0
石家庄市 1.156 1 0
北京市 10.071 10 34
西安市 6.849 7 0
呼和浩特市 5.329 5 1
郑州市 27.060 27 56
1483.0
石家庄市 0.823 1 0
北京市 27.810 28 14
西安市 4.746 5 6
呼和浩特市 0.974 1 4
郑州市 30.493 30 36
243.0
石家庄市 0.542 1 0
北京市 16.708 17 14
西安市 6.404 6 1
呼和浩特市 4.129 4 1
郑州市 32.633 33 47
240.0
石家庄市 0.576 1 0
北京市 11.885 12 24
西安市 3.112 3 11
呼和浩特市 2.442 2 1
郑州市 45.931 46 38
274.0
石家庄市 1.154 1 1
北京市 15.862 16 14
西安市 9.031 9 3
呼和浩特市 2.213 2 1
郑州市 37.340 37 17
441.0
4290.0



