In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px
from datetime import date, timedelta
from sklearn.cluster import KMeans
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, add_changepoints_to_plot
import plotly.offline as py
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from keras.models import Sequential
from keras.layers import LSTM,Dense
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator


IPython.utils.traitlets has moved to a top-level traitlets package.

Using TensorFlow backend.


In [2]:
path = './Data/'
patient_data_path = path + 'patient.csv'
route_data_path = path + 'route.csv'
time_data_path = path + 'time.csv'
case_data_path = path + 'case.csv'

df_case = pd.read_csv(case_data_path)
df_patient = pd.read_csv(patient_data_path)
df_route = pd.read_csv(route_data_path)
df_time = pd.read_csv(time_data_path)

# 1. Chinese brought virus to Korea

In [3]:
df_patient.head()

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
0,1,female,1984.0,China,filtered at airport,,,visit to Wuhan,1.0,,45.0,2020-01-20,2020-02-06,,released
1,2,male,1964.0,Korea,filtered at airport,,,visit to Wuhan,1.0,,75.0,2020-01-24,2020-02-05,,released
2,3,male,1966.0,Korea,capital area,,,visit to Wuhan,1.0,,16.0,2020-01-26,2020-02-12,,released
3,4,male,1964.0,Korea,capital area,,,visit to Wuhan,1.0,,95.0,2020-01-27,2020-02-09,,released
4,5,male,1987.0,Korea,capital area,,,visit to Wuhan,1.0,,31.0,2020-01-30,2020-03-02,,released


In [10]:
df_patient = df_patient.dropna(how='all')

In [11]:
df_patient.shape

(7869, 15)

# 1. Foreign income to Korea

In [24]:
df_patient['country'].value_counts()

Korea       7860
China          8
Mongolia       1
Name: country, dtype: int64

# 1.1 China income to Korea

In [13]:
df_patient[df_patient['country'] == 'China'].shape

(8, 15)

In [14]:
china_income = df_patient[df_patient['country'] == 'China']

In [15]:
china_income = china_income.reset_index(drop=True)

In [16]:
china_income

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
0,1,female,1984.0,China,filtered at airport,,,visit to Wuhan,1.0,,45.0,2020-01-20,2020-02-06,,released
1,12,male,1971.0,China,capital area,,,contact with patient in Japan,2.0,,422.0,2020-02-01,2020-02-18,,released
2,14,female,1980.0,China,capital area,,,contact with patient,3.0,12.0,3.0,2020-02-02,2020-02-18,,released
3,23,female,1962.0,China,capital area,,,visit to Wuhan,1.0,,23.0,2020-02-06,2020-02-29,,released
4,27,female,1982.0,China,capital area,,,visit to China,1.0,,40.0,2020-02-09,,,isolated
5,28,female,1989.0,China,capital area,,,contact with patient,2.0,3.0,1.0,2020-02-10,2020-02-17,,released
6,755,male,1954.0,China,capital area,,Eunpyeong St. Mary's Hospital,,,,,2020-02-24,,,isolated
7,924,female,1945.0,China,capital area,,,visit to China,,,,2020-02-25,,,isolated


In [17]:
infectedby = df_patient[df_patient['infected_by'].notna()]

In [18]:
infectedby = infectedby.reset_index(drop=True)

# patients infected by income Chinese

# 8 -> 3

In [19]:
infectedbyList = infectedby['infected_by'].isin(china_income['patient_id']).replace({False:np.nan}).dropna().index

In [20]:
infectedbyChinese = pd.DataFrame(infectedby, index=infectedbyList)

In [21]:
infectedbyChinese

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
4,14,female,1980.0,China,capital area,,,contact with patient,3.0,12.0,3.0,2020-02-02,2020-02-18,,released
10,25,female,1946.0,Korea,capital area,,,contact with patient,2.0,27.0,12.0,2020-02-09,2020-03-05,,released
11,26,male,1968.0,Korea,capital area,,,contact with patient,1.0,27.0,0.0,2020-02-09,,,isolated


In [22]:
infectedbyList2 = infectedby['infected_by'].isin(infectedbyChinese['patient_id']).replace({False:np.nan}).dropna().index

In [23]:
infectedbyList2

Int64Index([], dtype='int64')

# No patients infected by patients infected by income Chinese
# The most chinese patients are released state.

# = How systematic and good the Korean prevention of epidemics

# 1.2 Mongolia income to Korea

In [25]:
mongolia_income = df_patient[df_patient['country'] == 'Mongolia']

In [26]:
mongolia_income

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
874,875,male,1984.0,Mongolia,capital area,1.0,,,,,,2020-02-25,,2020-02-25,deceased


In [27]:
infectedbyList = infectedby['infected_by'].isin(mongolia_income['patient_id']).replace({False:np.nan}).dropna().index

In [28]:
infectedbyList

Int64Index([], dtype='int64')

# No patients infected by Mongolia patients

# = No big group infection or spread from foreign patients(China, Mongolia)

# 2. Koreans income from abroad to Korea

In [29]:
koreanPatient = df_patient[df_patient['country'] == 'Korea']

In [30]:
koreanPatient['infection_reason'].value_counts()

contact with patient                 74
visit to Daegu                       50
visit to Wuhan                        6
pilgrimage to Israel                  6
contact with patient in Singapore     2
residence in Wuhan                    2
visit to Thailand                     2
visit to Italy                        1
visit to Vietnam                      1
visit to ooo                          1
visit to Japan                        1
contact with patient in Daegu         1
Name: infection_reason, dtype: int64

In [63]:
foreignVisit = ['visit to Wuhan', 'pilgrimage to Israel', 'visit to Thailand', 'residence in Wuhan', 'contact with patient in Singapore', 'visit to Italy', 'visit to Japan', 'visit to Vietnam', 'visit to ooo']

In [64]:
korea_income = koreanPatient.loc[koreanPatient['infection_reason'].isin(foreignVisit)]

In [65]:
korea_income['patient_id'].shape

(22,)

In [66]:
korea_income

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
1,2,male,1964.0,Korea,filtered at airport,,,visit to Wuhan,1.0,,75.0,2020-01-24,2020-02-05,,released
2,3,male,1966.0,Korea,capital area,,,visit to Wuhan,1.0,,16.0,2020-01-26,2020-02-12,,released
3,4,male,1964.0,Korea,capital area,,,visit to Wuhan,1.0,,95.0,2020-01-27,2020-02-09,,released
4,5,male,1987.0,Korea,capital area,,,visit to Wuhan,1.0,,31.0,2020-01-30,2020-03-02,,released
6,7,male,1991.0,Korea,capital area,,,visit to Wuhan,1.0,,9.0,2020-01-30,2020-02-15,,released
7,8,female,1957.0,Korea,Jeollabuk-do,,,visit to Wuhan,1.0,,113.0,2020-01-31,2020-02-12,,released
12,13,male,1992.0,Korea,filtered at airport,,,residence in Wuhan,1.0,,0.0,2020-02-02,2020-02-24,,released
15,16,female,1977.0,Korea,Gwangju,,,visit to Thailand,1.0,,450.0,2020-02-04,2020-02-19,,released
16,17,male,1982.0,Korea,capital area,,,contact with patient in Singapore,2.0,,290.0,2020-02-05,2020-02-12,,released
18,19,male,1983.0,Korea,capital area,,,contact with patient in Singapore,2.0,,68.0,2020-02-05,2020-02-21,,released


In [67]:
infectedbyList = infectedby['infected_by'].isin(korea_income['patient_id']).replace({False:np.nan}).dropna().index

In [68]:
infectedbyKorean = pd.DataFrame(infectedby, index=infectedbyList)

In [69]:
infectedbyKorean

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
0,6,male,1964.0,Korea,capital area,,,contact with patient,2.0,3.0,17.0,2020-01-30,2020-02-19,,released
1,9,female,1992.0,Korea,capital area,,,contact with patient,2.0,5.0,2.0,2020-01-31,2020-02-24,,released
5,15,male,1977.0,Korea,capital area,,,contact with patient,2.0,4.0,15.0,2020-02-02,2020-02-24,,released
6,18,female,1999.0,Korea,Gwangju,,,contact with patient,2.0,16.0,8.0,2020-02-05,2020-02-19,,released
9,22,male,1973.0,Korea,Gwangju,,,contact with patient,2.0,16.0,1.0,2020-02-06,2020-02-15,,released
12,28,female,1989.0,China,capital area,,,contact with patient,2.0,3.0,1.0,2020-02-10,2020-02-17,,released


# 22->6

In [70]:
infectedbyList2 = infectedby['infected_by'].isin(infectedbyKorean['patient_id']).replace({False:np.nan}).dropna().index

In [71]:
infectedbyKorean2 = pd.DataFrame(infectedby, index=infectedbyList2)

In [72]:
infectedbyKorean2

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
2,10,female,1966.0,Korea,capital area,,,contact with patient,3.0,6.0,43.0,2020-01-31,2020-02-19,,released
3,11,male,1995.0,Korea,capital area,,,contact with patient,3.0,6.0,0.0,2020-01-31,2020-02-10,,released
7,20,female,1978.0,Korea,capital area,,,contact with patient,3.0,15.0,2.0,2020-02-05,2020-02-24,,released
8,21,female,1960.0,Korea,capital area,,,contact with patient,3.0,6.0,6.0,2020-02-05,2020-02-29,,released
23,83,male,1944.0,Korea,capital area,,,contact with patient,3.0,6.0,,2020-02-20,2020-03-01,,released
54,1252,female,1980.0,Korea,Daejeon,,,,,15.0,,2020-02-26,,,isolated
56,1257,female,1981.0,Korea,Daejeon,,,visit to Daegu,,6.0,,2020-02-26,,,isolated


# 22->6->7

In [73]:
infectedbyList3 = infectedby['infected_by'].isin(infectedbyKorean2['patient_id']).replace({False:np.nan}).dropna().index

In [74]:
infectedbyKorean3 = pd.DataFrame(infectedby, index=infectedbyList3)
infectedbyKorean3

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
13,29,male,1938.0,Korea,capital area,,,contact with patient,4.0,83.0,117.0,2020-02-16,,,isolated
15,32,female,2009.0,Korea,capital area,,,contact with patient,,20.0,,2020-02-18,2020-03-04,,released
22,56,male,1945.0,Korea,capital area,,,contact with patient,4.0,83.0,,2020-02-19,,,isolated
59,1568,male,1985.0,Korea,Daejeon,,,contact with patient,,1252.0,2.0,2020-02-27,,,isolated
63,1856,male,1985.0,Korea,Daejeon,,,contact with patient,,1252.0,2.0,2020-02-28,,,isolated
65,1913,male,1975.0,Korea,Daejeon,,,contact with patient,,1257.0,61.0,2020-02-28,,,isolated


# 22->6->7->6

In [75]:
infectedbyList4 = infectedby['infected_by'].isin(infectedbyKorean3['patient_id']).replace({False:np.nan}).dropna().index

In [76]:
infectedbyKorean4 = pd.DataFrame(infectedby, index=infectedbyList4)
infectedbyKorean4

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
14,30,female,1952.0,Korea,capital area,,,contact with patient,5.0,29.0,27.0,2020-02-16,,,isolated
26,136,male,1936.0,Korea,capital area,,,contact with patient,5.0,56.0,,2020-02-21,,,isolated


# 22->6->7->6->2

In [77]:
infectedbyList5 = infectedby['infected_by'].isin(infectedbyKorean4['patient_id']).replace({False:np.nan}).dropna().index

In [78]:
infectedbyKorean5 = pd.DataFrame(infectedby, index=infectedbyList5)
infectedbyKorean5

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
24,112,female,1941.0,Korea,capital area,,,contact with patient,5.0,136.0,,2020-02-21,,,isolated
33,362,male,1956.0,Korea,capital area,,,contact with patient,6.0,30.0,,2020-02-22,,,isolated


# 22->6->7->6->2->2

In [79]:
infectedbyList6 = infectedby['infected_by'].isin(infectedbyKorean5['patient_id']).replace({False:np.nan}).dropna().index

In [80]:
infectedbyKorean6 = pd.DataFrame(infectedby, index=infectedbyList6)
infectedbyKorean6

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state


In [81]:
frames = [korea_income, infectedbyKorean, infectedbyKorean2, infectedbyKorean3, infectedbyKorean4, infectedbyKorean5]

In [82]:
korea_income = pd.concat(frames)

In [86]:
korea_income.shape

(45, 15)

In [85]:
korea_income.reset_index(drop=True)

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
0,2,male,1964.0,Korea,filtered at airport,,,visit to Wuhan,1.0,,75.0,2020-01-24,2020-02-05,,released
1,3,male,1966.0,Korea,capital area,,,visit to Wuhan,1.0,,16.0,2020-01-26,2020-02-12,,released
2,4,male,1964.0,Korea,capital area,,,visit to Wuhan,1.0,,95.0,2020-01-27,2020-02-09,,released
3,5,male,1987.0,Korea,capital area,,,visit to Wuhan,1.0,,31.0,2020-01-30,2020-03-02,,released
4,7,male,1991.0,Korea,capital area,,,visit to Wuhan,1.0,,9.0,2020-01-30,2020-02-15,,released
5,8,female,1957.0,Korea,Jeollabuk-do,,,visit to Wuhan,1.0,,113.0,2020-01-31,2020-02-12,,released
6,13,male,1992.0,Korea,filtered at airport,,,residence in Wuhan,1.0,,0.0,2020-02-02,2020-02-24,,released
7,16,female,1977.0,Korea,Gwangju,,,visit to Thailand,1.0,,450.0,2020-02-04,2020-02-19,,released
8,17,male,1982.0,Korea,capital area,,,contact with patient in Singapore,2.0,,290.0,2020-02-05,2020-02-12,,released
9,19,male,1983.0,Korea,capital area,,,contact with patient in Singapore,2.0,,68.0,2020-02-05,2020-02-21,,released


# 22+6+7+6+2+2 = 45
# Total 45 patients are infected by Koreans income frome abroad
# assuming that more patients might be infected by Koreans income

# 3. Shincheonji church group infection

신천지 환자들로부터 infected by 출력해보고,

신천지 contact_number가 많은 환자들 region
df_patient daugu 

df_patient infection_reason이 visit to daugu 출력

In [87]:
df_SCJChurch = df_patient[df_patient['group'] == 'Shincheonji Church']

In [88]:
df_SCJChurch.shape

(58, 15)

In [89]:
df_SCJChurch.head()

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
30,31,female,1959.0,Korea,Daegu,,Shincheonji Church,,,,1160.0,2020-02-18,,,isolated
33,34,male,1996.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-18,,,isolated
34,35,female,1994.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-18,,,isolated
35,36,female,1972.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-18,,,isolated
38,39,female,1959.0,Korea,Gyeongsangbuk-do,,Shincheonji Church,,,,31.0,2020-02-18,2020-03-03,,released


In [90]:
infectedbyChurchList = infectedby['infected_by'].isin(df_SCJChurch['patient_id']).replace({False:np.nan}).dropna().index

In [91]:
infectedbyChurch = pd.DataFrame(infectedby, index=infectedbyChurchList)

In [92]:
infectedbyChurch.shape

(18, 15)

# 58->18

In [93]:
infectedbyChurch

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
16,33,female,1980.0,Korea,Daegu,,,contact with patient,,31.0,,2020-02-18,,,isolated
17,47,female,1957.0,Korea,Daegu,,Shincheonji Church,contact with patient,,31.0,,2020-02-19,,,isolated
18,48,female,1948.0,Korea,Daegu,,Shincheonji Church,contact with patient,,31.0,,2020-02-19,,,isolated
19,49,male,1962.0,Korea,Daegu,,Shincheonji Church,contact with patient,,31.0,,2020-02-19,,,isolated
20,50,male,1944.0,Korea,Daegu,,Shincheonji Church,contact with patient,,31.0,,2020-02-19,2020-03-02,,released
21,51,female,1959.0,Korea,Daegu,,Shincheonji Church,contact with patient,,31.0,,2020-02-19,2020-02-26,,released
27,140,female,1988.0,Korea,Daegu,,,contact with patient,,31.0,,2020-02-21,,,isolated
28,162,male,1987.0,Korea,Daegu,,,contact with patient,,31.0,,2020-02-21,,,isolated
29,164,male,1989.0,Korea,Gwangju,,Shincheonji Church,contact with patient,,126.0,,2020-02-21,,,isolated
35,441,female,1959.0,Korea,capital area,,,contact with patient,,246.0,,2020-02-23,,,isolated


In [94]:
infectedbyChurchList2 = infectedby['infected_by'].isin(infectedbyChurch['patient_id']).replace({False:np.nan}).dropna().index

In [97]:
infectedbyChurch2 = pd.DataFrame(infectedby, index=infectedbyChurchList2)

# 55->18->2

In [98]:
infectedbyChurch2.shape

(2, 15)

In [99]:
infectedbyChurch2

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
41,611,male,1985.0,Korea,Gwangju,,,contact with patient,,164.0,,2020-02-24,,,isolated
50,1227,male,1981.0,Korea,capital area,,,contact with patient,,835.0,,2020-02-26,,,isolated


In [100]:
infectedbyChurchList3 = infectedby['infected_by'].isin(infectedbyChurch2['patient_id']).replace({False:np.nan}).dropna().index

In [101]:
infectedbyChurch3 = pd.DataFrame(infectedby, index=infectedbyChurchList3)

In [102]:
infectedbyChurch3.shape

(0, 15)

# 58+18+2 = total 78 patients are infected from SCJ Church.

In [103]:
frames = [df_SCJChurch, infectedbyChurch, infectedbyChurch2]

In [104]:
SCJChurch_infect = pd.concat(frames)

In [106]:
SCJChurch_infect.shape

(78, 15)

In [107]:
SCJChurch_infect

Unnamed: 0,patient_id,sex,birth_year,country,region,disease,group,infection_reason,infection_order,infected_by,contact_number,confirmed_date,released_date,deceased_date,state
30,31,female,1959.0,Korea,Daegu,,Shincheonji Church,,,,1160.0,2020-02-18,,,isolated
33,34,male,1996.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-18,,,isolated
34,35,female,1994.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-18,,,isolated
35,36,female,1972.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-18,,,isolated
38,39,female,1959.0,Korea,Gyeongsangbuk-do,,Shincheonji Church,,,,31.0,2020-02-18,2020-03-03,,released
40,41,female,1951.0,Korea,Gyeongsangbuk-do,,Shincheonji Church,,,,,2020-02-19,,,isolated
41,42,female,1991.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-19,,,isolated
42,43,female,1962.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-19,,,isolated
43,44,female,1974.0,Korea,Daegu,,Shincheonji Church,,,,,2020-02-19,,,isolated
46,47,female,1957.0,Korea,Daegu,,Shincheonji Church,contact with patient,,31.0,,2020-02-19,,,isolated


In [109]:
df_patient['group'].value_counts()

Shincheonji Church               58
Eunpyeong St. Mary's Hospital    13
Cheongdo Daenam Hospital          9
Pilgrimage                        6
Name: group, dtype: int64

In [114]:
df_patient['region'].value_counts()

capital area           191
Gyeongsangbuk-do       140
Daegu                   57
Daejeon                 13
Gwangju                 11
Gangwon-do               5
Jeju-do                  4
filtered at airport      4
Jeollabuk-do             3
Jeollanam-do             3
Ulsan                    2
Chungcheongbuk-do        2
Chungcheongnam-do        1
Busan                    1
Name: region, dtype: int64