In [17]:
import pandas as pd
import matplotlib as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
from datetime import date, timedelta

In [2]:
# Column names of the per-day dataset, in the order shown when df_covid_by_day
# is displayed. The cleaning cell addresses columns by their position in this list.
header_by_day = [
    "#",
    "Country",
    "Date",
    "TotalCases",
    "NewCases",
    "TotalDeaths",
    "NewDeaths",
    "TotalRecovered",
    "NewRecovered",
    "ActiveCases",
    "Serious,Critical",
    "Tot Cases/1M pop",
    "Deaths/1M pop",
    "TotalTests",
    "Tests/1M pop",
    "Population",
    "Continent",
]

# Column names of the per-week dataset; also addressed by position during cleaning.
header_by_week = [
    "#",
    "Country",
    "Cases in the last 7 days",
    "Cases in the preceding 7 days",
    "Weekly case /%/ change",
    "Cases in the last 7 days/1M pop",
    "Deaths in the last 7 days",
    "Deaths in the preceding 7 days",
    "Weekly Death /%/ change",
    "Deaths in the last 7 days/1M pop",
    "Population",
    "Continent",
]

In [3]:
def RepresentsInt(s):
    """Report whether ``int(s)`` succeeds.

    Returns:
        True when ``int(s)`` parses without error, False when it raises
        ``ValueError``. Any other exception raised by ``int`` propagates.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True

def convert_numeric(num):
    """Parse a raw table cell into an ``int`` or ``float``.

    Thousands separators (``,``), explicit plus signs (``+``) and percent
    signs (``%``) are removed before parsing. Missing-value markers yield 0.

    Args:
        num: The cell value. Usually a string; any other value is converted
            with ``str()`` first, and ``None`` counts as missing.

    Returns:
        ``0`` for a missing cell (empty or whitespace-only text, ``nan`` or
        ``N/A`` in any letter case, or a cell holding only stripped symbols);
        otherwise an ``int`` when the cleaned text is a valid integer literal,
        else a ``float``.

    Raises:
        ValueError: If the cleaned text is neither a missing marker nor a number.
    """
    if num is None:
        return 0

    missing_markers = {"", "nan", "n/a"}

    cleaned = str(num)
    for symbol in (",", "+", "%"):
        cleaned = cleaned.replace(symbol, "")
    # Strip after removing symbols so padded markers such as " N/A" or "   "
    # are recognised whatever the amount of surrounding whitespace.
    cleaned = cleaned.strip()

    if cleaned.lower() in missing_markers:
        return 0

    # Prefer an exact int; fall back to float for values such as "42.5".
    try:
        return int(cleaned)
    except ValueError:
        return float(cleaned)

In [4]:
# Load the raw per-day dataset (tab-separated file next to the notebook).
daily_tsv_path = 'Corona_by_day.tsv'
df_covid_by_day = pd.read_csv(daily_tsv_path, sep='\t')

In [5]:
# Positions 3..15 of header_by_day ("TotalCases" through "Population") hold the
# numeric columns; each cell is stringified and parsed by convert_numeric.
numeric_day_columns = header_by_day[3:16]
for column in numeric_day_columns:
    df_covid_by_day[column] = df_covid_by_day[column].apply(
        lambda cell: convert_numeric(str(cell))
    )

# Position 16 is "Continent": remove spaces from the names. A missing value
# becomes the literal string "nan" because of str().
continent_column = header_by_day[16]
df_covid_by_day[continent_column] = df_covid_by_day[continent_column].apply(
    lambda cell: str(cell).replace(" ", "")
)

# Dates are parsed with the day-month-year format.
df_covid_by_day['Date'] = pd.to_datetime(df_covid_by_day['Date'], format="%d-%m-%Y")

In [6]:
# Load the raw per-week dataset (tab-separated file next to the notebook).
weekly_tsv_path = 'Corona_by_week.tsv'
df_covid_by_week = pd.read_csv(weekly_tsv_path, sep='\t')

In [7]:
# Positions 2..10 of header_by_week ("Cases in the last 7 days" through
# "Population") hold the numeric columns; parse each cell with convert_numeric.
numeric_week_columns = header_by_week[2:11]
for column in numeric_week_columns:
    df_covid_by_week[column] = df_covid_by_week[column].apply(
        lambda cell: convert_numeric(str(cell))
    )

# The last header is "Continent": remove spaces from the names. A missing value
# becomes the literal string "nan" because of str().
week_continent_column = header_by_week[-1]
df_covid_by_week[week_continent_column] = df_covid_by_week[week_continent_column].apply(
    lambda cell: str(cell).replace(" ", "")
)

In [8]:
# Display the cleaned per-day frame to check the result of the conversions above.
df_covid_by_day

Unnamed: 0,#,Country,Date,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent
0,1,USA,2022-02-28,80656025,34828,975951,843,53500262,278579,26179812,7770,241322,2920,950614834,2844228,334225917,NorthAmerica
1,2,India,2022-02-28,42931045,6915,514054,211,42324550,16864,92441,8944,30610,367,767481346,547211,1402532051,Asia
2,3,Brazil,2022-02-28,28787620,19516,649443,248,26336373,152750,1801804,8318,133853,3020,63776166,296539,215068475,SouthAmerica
3,4,France,2022-02-28,22702815,13483,138367,232,20901504,235191,1662944,2484,346537,2112,246629975,3764574,65513384,Europe
4,5,UK,2022-02-28,18886701,27312,161361,46,17449650,98709,1275690,289,275806,2356,480341745,7014519,68478212,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1125,222,Western Sahara,2022-03-04,10,0,1,0,8,0,1,0,16,2,0,0,621823,Africa
1126,223,MS Zaandam,2022-03-04,9,0,2,0,7,0,0,0,0,0,0,0,0,
1127,224,Marshall Islands,2022-03-04,7,0,0,0,7,0,0,0,117,0,0,0,59856,Australia/Oceania
1128,225,Saint Helena,2022-03-04,2,0,0,0,2,0,0,0,327,0,0,0,6107,Africa


In [9]:
# Inspect the per-day column dtypes after cleaning.
df_covid_by_day.dtypes

#                            int64
Country                     object
Date                datetime64[ns]
TotalCases                   int64
NewCases                     int64
TotalDeaths                  int64
NewDeaths                    int64
TotalRecovered               int64
NewRecovered                 int64
ActiveCases                  int64
Serious,Critical             int64
Tot Cases/1M pop             int64
Deaths/1M pop                int64
TotalTests                   int64
Tests/1M pop                 int64
Population                   int64
Continent                   object
dtype: object

In [10]:
# Display the cleaned per-week frame to check the result of the conversions above.
df_covid_by_week

Unnamed: 0,#,Country,Cases in the last 7 days,Cases in the preceding 7 days,Weekly case /%/ change,Cases in the last 7 days/1M pop,Deaths in the last 7 days,Deaths in the preceding 7 days,Weekly Death /%/ change,Deaths in the last 7 days/1M pop,Population,Continent
0,1,S. Korea,1293249,909271,42.0,25189.0,797,500,59,16.0,51342824,Asia
1,2,Germany,1075329,1143573,-6.0,12766.0,1387,1437,-3,16.0,84231046,Europe
2,3,Vietnam,734744,427980,72.0,7437.0,682,604,13,7.0,98799997,Asia
3,4,Russia,722996,1031455,-30.0,4951.0,5422,5408,0,37.0,146038899,Europe
4,5,Japan,459767,508696,-10.0,3654.0,1594,1557,2,13.0,125830265,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...
212,213,Sierra Leone,0,4,-100.0,0.0,0,0,0,0.0,8247500,Africa
213,214,Tajikistan,0,3,-100.0,0.0,0,0,0,0.0,9894835,Asia
214,215,Tanzania,0,71,-100.0,0.0,0,2,-100,0.0,62570078,Africa
215,216,Ukraine,0,137426,-100.0,0.0,0,1399,-100,0.0,43295254,Europe


In [11]:
# Inspect the per-week column dtypes after cleaning.
df_covid_by_week.dtypes

#                                     int64
Country                              object
Cases in the last 7 days              int64
Cases in the preceding 7 days         int64
Weekly case /%/ change              float64
Cases in the last 7 days/1M pop     float64
Deaths in the last 7 days             int64
Deaths in the preceding 7 days        int64
Weekly Death /%/ change               int64
Deaths in the last 7 days/1M pop    float64
Population                            int64
Continent                            object
dtype: object

### Phân chia dataset theo ngày thành 6 châu lục

Dataset theo tuần không cần phân chia, vì các chỉ số của nó (tỷ lệ % thay đổi, số ca trên 1 triệu dân) không thể cộng gộp bằng groupby; nếu groupby thì số liệu sẽ bị sai lệch.

In [56]:
# The snapshot date is relative to the day the notebook is run.
today = date.today()
one_day = timedelta(days=1)
# Kept as a "YYYY-MM-DD" string; later cells compare it with the Date column
# and parse it back with the same format.
yesterday = (today - one_day).strftime("%Y-%m-%d")

Nhớ pull dữ liệu mới về hoặc kiểm tra xem dataset đã có dữ liệu của ngày hôm qua chưa; nếu chưa có, thông báo gấp cho Lê Minh Trí!

In [57]:
# Keep only the rows dated yesterday; the per-continent snapshot is built from them.
df_covid_yesterday = df_covid_by_day[df_covid_by_day['Date'] == yesterday]

# Fail fast when the dataset has not been refreshed: without this check the
# frame is silently empty and the failure only surfaces later as an unrelated
# KeyError in the continent aggregation.
if df_covid_yesterday.empty:
    raise ValueError(
        f"No rows dated {yesterday} in df_covid_by_day; "
        "pull the latest data before running the rest of the notebook."
    )

In [58]:
# Per-continent totals for yesterday.
continent_yesterday_df = (
    df_covid_yesterday
    .groupby("Continent")
    # numeric_only=True keeps the aggregation to the numeric columns; without
    # it, pandas >= 2.0 also tries to sum the Country and Date columns.
    .sum(numeric_only=True)
    # Rows without a continent were stringified to "nan" during cleaning.
    # errors="ignore" keeps the cell working when no such group exists.
    .drop("nan", errors="ignore")
    .reset_index()
    # The "#" column and the per-1M-population ratios are dropped, presumably
    # because they are not meaningful once summed across countries.
    .drop(columns=["#", "Tot Cases/1M pop", "Deaths/1M pop", "Tests/1M pop"])
)
# Re-attach the snapshot date, which the numeric aggregation removed.
continent_yesterday_df['Date'] = yesterday
continent_yesterday_df['Date'] = pd.to_datetime(continent_yesterday_df['Date'], format="%Y-%m-%d")

In [61]:
# Display the per-continent totals for yesterday.
continent_yesterday_df

Unnamed: 0,Continent,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Population,Date
0,Africa,11561635,5306,249960,55,10679433,7062,591880,1750,96419113,1394112109,2022-03-04
1,Asia,119447965,697132,1356497,2245,109084470,355344,9006998,30536,1946303804,4665933277,2022-03-04
2,Australia/Oceania,3756493,47851,7970,46,3298105,38944,382635,158,71630068,43394804,2022-03-04
3,Europe,159074158,720211,1719876,2552,137902769,962173,19451513,13619,2503221280,748383519,2022-03-04
4,NorthAmerica,95260400,69621,1413492,2031,66933528,234855,26903577,12699,1069588160,596812419,2022-03-04
5,SouthAmerica,54606125,106315,1261535,1093,46570176,153257,2659839,13100,210575346,436690699,2022-03-04


In [68]:
# Totals per date across every row of the per-day dataset. Despite its name,
# this frame is grouped by Date, not by continent.
continent_df = (
    df_covid_by_day
    .groupby("Date")
    # numeric_only=True keeps the aggregation to the numeric columns; without
    # it, pandas >= 2.0 also "sums" the Country and Continent strings.
    .sum(numeric_only=True)
    .reset_index()
    # The "#" column and the per-1M-population ratios are dropped, presumably
    # because they are not meaningful once summed across countries.
    .drop(columns=["#", "Tot Cases/1M pop", "Deaths/1M pop", "Tests/1M pop"])
)
continent_df

Unnamed: 0,Date,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Population
0,2022-02-28,437173429,1128262,5976416,6045,366621406,2154754,60360176,74759,5864353102,7884712500
1,2022-03-01,438510827,1337398,5983229,6813,368695782,2074376,59615435,74737,5869547415,7884712500
2,2022-03-02,440290515,1590173,5992432,7756,370783860,1939726,59289172,75953,5881445441,7884917270
3,2022-03-03,440377632,1599844,5993186,7872,370787000,1942866,59371827,75953,5881556015,7885122041
4,2022-03-04,443707497,1646436,6009345,8022,374469187,1751635,58996442,71862,5897737771,7885326827


### Câu 1,4,5 + World Chart (Toàn)

### Câu 2,3,6 (Trí)

### Pie Chart

Câu 2:
Pie Chart để so sánh tỷ lệ % giữa 6 châu lục về các thông số sau:
- Số ca nhiễm MỚI
- Số ca tử vong MỚI
- Số ca hồi phục MỚI
- Số ca trong cộng đồng

Mỗi thông số được vẽ thành một biểu đồ riêng; 100% của mỗi biểu đồ là tổng của 6 châu lục cho thông số đó.

In [62]:
# Continent labels plus the four metrics, one per pie chart.
pie_columns = ['Continent', 'NewCases', 'NewDeaths', 'NewRecovered', 'ActiveCases']
piechart_plots = continent_yesterday_df[pie_columns]

In [63]:
# 2x2 grid in which every cell is a 'domain' subplot holding one pie trace.
specs = [[{'type': 'domain'} for _ in range(2)] for _ in range(2)]
fig = make_subplots(
    rows=2,
    cols=2,
    specs=specs,
    subplot_titles=['New Cases', 'New Deaths', 'New Recovered', 'Active Cases'],
)

# (metric column, grid row, grid column), in the same order as the subplot titles.
pie_layout = [
    ('NewCases', 1, 1),
    ('NewDeaths', 1, 2),
    ('NewRecovered', 2, 1),
    ('ActiveCases', 2, 2),
]
continent_labels = piechart_plots['Continent']
for metric, grid_row, grid_col in pie_layout:
    fig.add_trace(
        go.Pie(labels=continent_labels, values=piechart_plots[metric]),
        grid_row,
        grid_col,
    )

# Last expression of the cell, so the notebook renders the figure.
fig

### Line Chart

Line chart để trực quan hóa theo thời gian 3 thông số NEW (cases/deaths/recovered).

Lúc đầu vẽ một biểu đồ chung cho cả 3 thông số, sau đó tách riêng thành 3 biểu đồ để xem xu hướng theo thời gian của chúng có giống nhau hay không.

In [71]:
# Date plus the three daily "New*" metrics drawn on the line chart.
line_columns = ['Date', 'NewCases', 'NewDeaths', 'NewRecovered']
linechart_plots = continent_df[line_columns]

In [72]:
# Display the frame that feeds the line chart.
linechart_plots

Unnamed: 0,Date,NewCases,NewDeaths,NewRecovered
0,2022-02-28,1128262,6045,2154754
1,2022-03-01,1337398,6813,2074376
2,2022-03-02,1590173,7756,1939726
3,2022-03-03,1599844,7872,1942866
4,2022-03-04,1646436,8022,1751635


In [73]:
# One line per metric on a shared axis, with a marker at each date.
world_metrics = ['NewCases', 'NewDeaths','NewRecovered']
fig = px.line(
    linechart_plots,
    x="Date",
    y=world_metrics,
    markers=True,
    title="World Overview",
)
fig.show()