# Colab 前置處理(一定要先執行以下所有程式)

- 取得雲端存取權限
  - 會出現網址要求登入(2次)，此時要把驗證碼複製並貼回。
- 建立相關暫存資料夾
- 將目錄移至FDA_final
- 下載套件

In [9]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

E: Package 'python-software-properties' has no installation candidate


KeyboardInterrupt: ignored

In [0]:
!mkdir -p Drive
!google-drive-ocamlfuse Drive

In [0]:
import os
#將資料目錄移至FDA_final
os.chdir("Drive/Colab Notebooks/FDA_final")

In [12]:
pip install -r requirements.txt



# 讀取病患個人資料

In [13]:
import time
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

EXECUTION_START_TIME = time.time() # 計算執行時間

# 讀取資料
training_set = pd.read_csv('datasets_527325_1157664_PatientInfo.csv')
print(training_set)
training_set.info()

      patient_id  global_num     sex  ...  released_date deceased_date     state
0     1000000001         2.0    male  ...     2020-02-05           NaN  released
1     1000000002         5.0    male  ...     2020-03-02           NaN  released
2     1000000003         6.0    male  ...     2020-02-19           NaN  released
3     1000000004         7.0    male  ...     2020-02-15           NaN  released
4     1000000005         9.0  female  ...     2020-02-24           NaN  released
...          ...         ...     ...  ...            ...           ...       ...
3514  7000000010         NaN  female  ...     2020-04-21           NaN  released
3515  7000000011         NaN    male  ...            NaN           NaN  isolated
3516  7000000012         NaN  female  ...     2020-05-05           NaN  released
3517  7000000013         NaN  female  ...     2020-04-26           NaN  released
3518  7000000014         NaN  female  ...            NaN           NaN  isolated

[3519 rows x 18 columns]
<c

# 資料前處理

以下是資料前處理的步驟:

 - 1.選取需要的欄位。
 - 2.處理缺失值，並將特定欄位的資料型態轉成浮點數。

In [14]:
# 資料分析與前處理

# 選取需要的欄位
df = training_set[['patient_id','sex','age','country','province','city','infection_case','infection_order','contact_number','confirmed_date','infected_by']]

# 處理缺失值並以"NONE"填充
df['sex'] = df['sex'].fillna("NONE")
df['age'] = df['age'].fillna("NONE")
df['country'] = df['country'].fillna("NONE")
df['province'] = df['province'].fillna("NONE")
df['city'] = df['city'].fillna("NONE")
df['infection_case'] = df['infection_case'].fillna("NONE")
df['confirmed_date'] = df['confirmed_date'].fillna("NONE")



# 處理缺失值並以算數平均數填充
df['infection_order'] = pd.to_numeric(df['infection_order'], errors = 'coerce')
df['infection_order'] = df['infection_order'].fillna(df['contact_number'].mean())
df['contact_number'] = pd.to_numeric(df['contact_number'], errors = 'coerce')
df['contact_number'] = df['contact_number'].fillna(df['contact_number'].mean())
df['infected_by'] = df['infected_by'].fillna(0).astype('int')

print("Result:")
df = df.sort_values(by=['confirmed_date'])
df.info()
print("\n\n")
print(df)

Result:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3519 entries, 948 to 3110
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   patient_id       3519 non-null   int64  
 1   sex              3519 non-null   object 
 2   age              3519 non-null   object 
 3   country          3519 non-null   object 
 4   province         3519 non-null   object 
 5   city             3519 non-null   object 
 6   infection_case   3519 non-null   object 
 7   infection_order  3519 non-null   float64
 8   contact_number   3519 non-null   float64
 9   confirmed_date   3519 non-null   object 
 10  infected_by      3519 non-null   int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 329.9+ KB



      patient_id     sex   age  ... contact_number confirmed_date infected_by
948   1400000001  female   30s  ...      18.044753     2020-01-20           0
0     1000000001    male   50s  ...      75.000000     2020-01

# 每日確診人數性別統計

In [15]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as offline
%matplotlib inline

# 累積確診數量
plt.figure(figsize = (17,7))
result_dict = {}
male_dict = {}
female_dict = {}
info_dict = {}
result_avg = []
result_male = []
result_female = []

for i in df['confirmed_date']:
  if i in result_dict.keys():
    result_dict[i] += 1
  else:
    result_dict[i] = 0

del result_dict["NONE"]

temp = list(result_dict.keys())
for i in range(0,len(temp)):
  temp[i] = temp[i].replace('-','_')

# male
for i,j in zip(df['confirmed_date'], df['sex']):
  if i in male_dict.keys():
    if j == "male":
      male_dict[i] += 1
    elif j == "female":
      female_dict[i] += 1
  else:
    male_dict[i] = 0
    female_dict[i] = 0


del male_dict["NONE"]
del female_dict["NONE"]

result_avg = np.array(list(result_dict.values()))/2
result_male =  np.array(list(male_dict.values()))
result_female =  np.array(list(female_dict.values()))
fig = go.Figure()
fig = go.Figure(data=[
    go.Bar(name='male', x=temp, y=result_male),
    go.Bar(name='female', x=temp, y=result_female),
    go.Scatter(name='center', x=temp, y=result_avg)
])
# Change the bar mode
fig.update_layout(barmode='stack')

fig.show()
offline.plot(fig)

<Figure size 1224x504 with 0 Axes>

![A.PNG](graph/A.PNG?raw=true)

https://aaron-chang-ac.github.io/FDA_final/graph/A.html

In [16]:
import plotly.graph_objects as go
import plotly.express as px

# 累積確診數量
plt.figure(figsize = (17,7))
result_dict = {}
male_dict = {}
female_dict = {}
info_dict = {}
result_avg = []
result_male = []
result_female = []

for i in df['confirmed_date']:
  if i in result_dict.keys():
    result_dict[i] += 1
  else:
    result_dict[i] = 0

del result_dict["NONE"]

temp = list(result_dict.keys())
for i in range(0,len(temp)):
  temp[i] = temp[i].replace('-','_')

# male
for i,j in zip(df['confirmed_date'], df['sex']):
  if i in male_dict.keys():
    if j == "male":
      male_dict[i] += 1
    elif j == "female":
      female_dict[i] += 1
  else:
    male_dict[i] = 0
    female_dict[i] = 0


del male_dict["NONE"]
del female_dict["NONE"]

result_avg = np.array(list(result_dict.values()))/2
result_male =  np.array(list(male_dict.values()))
result_female =  np.array(list(female_dict.values()))

for i in range(1,len(result_avg)):
  result_avg[i] += result_avg[i-1]
  result_male[i] += result_male[i-1]
  result_female[i] += result_female[i-1]

fig = go.Figure()
fig = go.Figure(data=[
    go.Bar(name='male', x=temp, y=result_male),
    go.Bar(name='female', x=temp, y=result_female),
    go.Scatter(name='center', x=temp, y=result_avg)
])
# Change the bar mode
fig.update_layout(barmode='stack')

fig.show()
offline.plot(fig)

'temp-plot.html'

<Figure size 1224x504 with 0 Axes>

![B.PNG](graph/B.PNG?raw=true)

https://aaron-chang-ac.github.io/FDA_final/graph/B.html

# 群聚感染network graph

In [0]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

source = []
target = []
date = []
all_node = []
patient_date_dict={}

plt.figure(figsize = (100,50))

for i, j, k in zip(df['patient_id'], df['infected_by'], df['confirmed_date']):
  if j != 0:
    source.append(j)
    target.append(i)
  if k != "NONE":
    m = int(k.split("-")[1])
    patient_date_dict[i] = str(m)

edge_df = pd.DataFrame({ 'source': source, 'target': target})

G=nx.from_pandas_edgelist(edge_df, 'source', 'target', create_using=nx.Graph())
print(G.nodes())

for i in G.nodes():
  if i in patient_date_dict:
    date.append(int(patient_date_dict[i]))
  else:
    date.append(1)


carac = pd.DataFrame({ 'ID':G.nodes(), 'myvalue':date})
carac = carac.set_index('ID')
carac = carac.reindex(G.nodes())

nx.draw(G, with_labels=True, node_size=700, node_color=date, cmap=plt.cm.Blues)
plt.show()

Output hidden; open in https://colab.research.google.com to view.

![C.png](graph/C.png?raw=true)

https://aaron-chang-ac.github.io/FDA_final/graph/C.png

# 讀取股市資料

In [17]:
# 讀取資料
KOSPI = pd.read_csv('KOSPI-Historical-Data.csv')
print(KOSPI)
KOSPI.info()

             Date     Price      Open      High       Low     Vol. Change %
0    Jun 05, 2020  2,181.87  2,151.17  2,187.25  2,138.18  840.07K    1.43%
1    Jun 04, 2020  2,151.18  2,181.64  2,191.00  2,139.68    1.39B    0.19%
2    Jun 03, 2020  2,147.00  2,108.55  2,156.55  2,107.69    1.15B    2.87%
3    Jun 02, 2020  2,087.19  2,061.45  2,089.43  2,059.14    1.00B    1.07%
4    Jun 01, 2020  2,065.08  2,037.04  2,065.38  2,035.63  997.35M    1.75%
..            ...       ...       ...       ...       ...      ...      ...
101  Jan 08, 2020  2,151.31  2,156.27  2,162.32  2,137.72  913.83M   -1.11%
102  Jan 07, 2020  2,175.54  2,166.60  2,181.62  2,164.27  568.24M    0.95%
103  Jan 06, 2020  2,155.07  2,154.97  2,164.42  2,149.95  592.67M   -0.98%
104  Jan 03, 2020  2,176.46  2,192.58  2,203.38  2,165.39  631.56M    0.06%
105  Jan 02, 2020  2,175.17  2,201.21  2,202.32  2,171.84  494.68M   -1.02%

[106 rows x 7 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0

In [18]:
import plotly.graph_objects as go
import plotly.express as px

# 累積確診數量
KOSPI_df = KOSPI[['Date','Price']]
x = KOSPI_df['Date']
y = KOSPI_df['Price']
x = x.reindex(index=x.index[::-1])
y = y.reindex(index=y.index[::-1])
plt.figure(figsize = (17,7))
fig = go.Figure()
fig = go.Figure(data=[
    go.Scatter(name='center', x=x, y=y)
])
fig.show()
offline.plot(fig)

'temp-plot.html'

<Figure size 1224x504 with 0 Axes>

![D.PNG](graph/D.PNG?raw=true)

https://aaron-chang-ac.github.io/FDA_final/graph/D.html

In [19]:
EXECUTION_END_TIME = time.time() # 計算執行時間
print('total execution time: {}'.format(EXECUTION_END_TIME - EXECUTION_START_TIME))

total execution time: 278.03699016571045
