# Get Ready

## Install required libraries
Uncomment the lines below to install the required libraries.

In [1]:
# !pip install nest_asyncio
# !pip install pyecharts

## Import libraries

In [2]:
import warnings
import pandas as pd
import nest_asyncio
from snapshot_pyppeteer import snapshot

from pyecharts import *
from pyecharts.charts import Map
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from pyecharts.render import make_snapshot
from pyecharts.globals import CurrentConfig, NotebookType

from IPython.core.interactiveshell import InteractiveShell

## Some settings for this code file

In [12]:
# Shell command: sign this notebook as trusted.
# NOTE(review): presumably needed so the JavaScript chart outputs render when the notebook is reopened - confirm.
!jupyter trust Data-Analysis.ipynb
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Tell pyecharts which notebook front end it is rendering into.
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
# NOTE(review): the chart functions visible in this notebook pass ThemeType.LIGHT directly
# instead of reading this variable - confirm whether it is still needed.
theme=ThemeType.LIGHT

# pandas display options: show every column, widen the output, cap cell text at 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 50)

# Silences every FutureWarning, including pandas deprecation notices.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Patch asyncio to allow nested event loops.
# NOTE(review): presumably required by snapshot_pyppeteer inside Jupyter's already-running loop - confirm.
nest_asyncio.apply()

# Root folder that read_file() prefixes to every CSV path.
data_file_root_path = "Data"

# Provenance of the CSV files (Kaggle dataset page).
data_source = "https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset/data"

Notebook already signed: Data-Analysis.ipynb


# Data Analysis

In [4]:
def read_file(file_name):
    """Load one CSV of the novel-corona-virus-2019 dataset as a DataFrame.

    The file is read from
    ``{data_file_root_path}/novel-corona-virus-2019-dataset/{file_name}``.
    Every missing value is replaced with 0, and the columns
    ``Country/Region`` / ``Province/State`` (when present) are renamed to
    ``CountryRegion`` / ``ProvinceState``.

    Args:
        file_name: CSV file name inside the dataset folder.

    Returns:
        The loaded, NaN-filled and renamed DataFrame.
    """
    csv_path = f"{data_file_root_path}/novel-corona-virus-2019-dataset/{file_name}"
    slash_free_names = {
        'Country/Region': 'CountryRegion',
        'Province/State': "ProvinceState",
    }
    return pd.read_csv(csv_path).fillna(0).rename(columns=slash_free_names)

In [5]:
def cal_new_confirmed(a_df):
    """Add a ``NewConfirmed`` column holding the row-over-row increase of ``Confirmed``.

    The first row keeps its own ``Confirmed`` value (there is no previous row
    to subtract); every later row gets ``int(current - previous)``.

    Rows are processed by position, so the frame may carry any index (for
    example one left over from filtering) and may be empty.

    Args:
        a_df: DataFrame with a ``Confirmed`` column, one row per observation,
            already ordered the way the differences should be taken.

    Returns:
        The same DataFrame object, modified in place with the new column.
    """
    confirmed = a_df['Confirmed'].tolist()

    # Pair each value with its predecessor; zip stops at the shorter sequence,
    # so there is no out-of-range lookup to catch.
    increments = [int(current - previous)
                  for previous, current in zip(confirmed, confirmed[1:])]

    a_df["NewConfirmed"] = confirmed[:1] + increments
    return a_df


## Basic Summary

In [6]:
# Load the per-region daily records (read_file replaces missing values with 0).
covid_19_data = read_file("covid_19_data.csv")
# Active cases = confirmed cases that are neither deaths nor recoveries.
covid_19_data["Active"] = covid_19_data['Confirmed'] - covid_19_data['Deaths'] - covid_19_data['Recovered']
# First / last row of the file are taken as the observation period shown in chart subtitles.
# NOTE(review): assumes the CSV rows are ordered chronologically - confirm.
start_obser_date = covid_19_data["ObservationDate"].iloc[0]
latest_obser_date = covid_19_data["ObservationDate"].iloc[-1]
covid_19_data

Unnamed: 0,SNo,ObservationDate,ProvinceState,CountryRegion,Last Update,Confirmed,Deaths,Recovered,Active
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0,1.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0,14.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0,6.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0,1.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
24823,24824,05/14/2020,Wyoming,US,2020-05-15 02:33:02,701.0,7.0,0.0,694.0
24824,24825,05/14/2020,Xinjiang,Mainland China,2020-05-15 02:33:02,76.0,3.0,73.0,0.0
24825,24826,05/14/2020,Yukon,Canada,2020-05-15 02:33:02,11.0,0.0,11.0,0.0
24826,24827,05/14/2020,Yunnan,Mainland China,2020-05-15 02:33:02,185.0,2.0,183.0,0.0


### Overall cases count

In [7]:
# Worldwide totals per observation date.
# The columns are selected with a list: selecting several GroupBy columns with a bare
# tuple was deprecated and then removed from pandas.
grouped_static = (
    covid_19_data
    .groupby('ObservationDate')[['Confirmed', 'Deaths', 'Recovered', 'Active']]
    .sum()
    .reset_index()
)
# NOTE(review): groupby orders the string dates lexicographically, which matches
# chronological order only while every date falls in the same year - confirm.
grouped_static = cal_new_confirmed(grouped_static)

# Pick the latest date by comparing parsed dates; max() over 'MM/DD/YYYY' strings
# would compare text and break once the data spans more than one year.
grouped_dates = pd.to_datetime(grouped_static["ObservationDate"], format='%m/%d/%Y')
total_static = grouped_static[grouped_dates == grouped_dates.max()].reset_index(drop=True)
total_static.style.background_gradient(cmap='Pastel1').format({
    "Confirmed": "{:,.0f}",
    "Deaths": "{:,.0f}",
    "Recovered": "{:,.0f}",
    "Active": "{:,.0f}",
    "NewConfirmed": "{:,.0f}",
    })


Unnamed: 0,ObservationDate,Confirmed,Deaths,Recovered,Active,NewConfirmed
0,05/14/2020,4442163,302418,1587893,2551852,95145


### Cases per country/Region

In [8]:
print(f"Latest Record of Data: {latest_obser_date} \n")

# Keep only the rows of the most recent observation date. Dates are parsed first because
# max() over 'MM/DD/YYYY' strings compares text, not time.
observation_dates = pd.to_datetime(covid_19_data["ObservationDate"], format='%m/%d/%Y')
latest_covid_19_data = covid_19_data[observation_dates == observation_dates.max()].reset_index()

# Sum the provinces/states of each country. The columns are selected with a list:
# tuple selection on a GroupBy was deprecated and then removed from pandas.
basic_static = (
    latest_covid_19_data
    .groupby(["CountryRegion"])[['Confirmed', 'Deaths', 'Recovered', 'Active']]
    .sum()
    .reset_index()
)
basic_static = basic_static.sort_values(by='Confirmed', ascending=False).reset_index(drop=True)
# Shift the index so it reads as a 1-based rank.
basic_static.index += 1
basic_static.style.background_gradient(cmap='Reds').format(
    {"Confirmed": "{:,.0f}",
     "Deaths": "{:,.0f}",
     "Recovered": "{:,.0f}",
     "Active": "{:,.0f}",}
    )


Latest Record of Data: 05/14/2020 



Unnamed: 0,CountryRegion,Confirmed,Deaths,Recovered,Active
1,US,1417774,85898,246414,1085462
2,Russia,252245,2305,53530,196410
3,UK,234440,33693,1043,199704
4,Spain,229540,27321,143374,58845
5,Italy,223096,31368,115288,76440
6,Brazil,203165,13999,79479,109687
7,France,178994,27428,59719,91847
8,Germany,174478,7884,150300,16294
9,Turkey,144749,4007,104030,36712
10,Iran,114533,6854,90539,17140


### Plot top 15 countries data

In [13]:
def plot_ebar(a_df, type_str: str) -> charts.Bar:
    """Draw one case type as a bar chart, one bar per country/region.

    The chart is also written to ``Images/{type_str}-bar.html`` and that HTML
    file is handed to ``make_snapshot`` to produce ``Images/{type_str}-bar.png``.
    The subtitle reads the notebook-level ``start_obser_date`` and
    ``latest_obser_date``.

    Args:
        a_df: DataFrame with a ``CountryRegion`` column and a ``type_str`` column.
        type_str: Column to plot; must be "Confirmed", "Recovered" or "Deaths"
            (the only keys of the colour table, any other value raises KeyError).

    Returns:
        The configured ``charts.Bar`` instance.
    """
    # Fixed bar colour per case type.
    series_colors = {
        "Confirmed": "#FF5252",
        "Recovered": "#00BFA5",
        "Deaths": "#FF6D00"
    }
    countries = a_df["CountryRegion"].to_list()
    counts = a_df[type_str].to_list()

    canvas_opts = opts.InitOpts(
        theme=ThemeType.LIGHT,
        width="1350px",
        height="800px"
    )
    heading_opts = opts.TitleOpts(
        title=f'{type_str} Number of Top 15 Confirmed Count Countries',
        subtitle=f"from {start_obser_date} to {latest_obser_date}",
        pos_top=0
    )
    # interval=0 asks for every country label to be drawn; rotation keeps them from overlapping.
    country_axis_opts = opts.AxisOpts(
        name="Country/\nRegion",
        axislabel_opts=opts.LabelOpts(
            interval=0,
            rotate=25,
            margin=10
        )
    )

    bar = charts.Bar(init_opts=canvas_opts)
    bar.add_xaxis(countries)
    bar.add_yaxis(
        type_str, counts,
        itemstyle_opts=opts.ItemStyleOpts(color=series_colors[type_str]),
    )
    bar.set_global_opts(
        title_opts=heading_opts,
        yaxis_opts=opts.AxisOpts(name_location="center"),
        xaxis_opts=country_axis_opts,
        legend_opts=opts.LegendOpts(is_show=True, pos_right=100),
    )

    # Render to HTML first; make_snapshot turns that file into the PNG.
    output_stem = f"{type_str}-bar"
    html_path = bar.render(f"Images/{output_stem}.html")
    make_snapshot(snapshot, html_path, f"Images/{output_stem}.png")

    return bar

In [10]:
# basic_static is sorted by Confirmed (descending), so these are the 15 countries with the most confirmed cases.
top_15_countries = basic_static.head(15)

In [None]:
# Confirmed cases of the top 15 countries; plot_ebar also saves the chart under Images/.
top_15_confirmed_bar = plot_ebar(top_15_countries, "Confirmed")
top_15_confirmed_bar.load_javascript()
top_15_confirmed_bar.render_notebook()

In [None]:
# Deaths of the same 15 countries (still ranked by confirmed cases).
top_15_deaths_bar = plot_ebar(top_15_countries, "Deaths")
top_15_deaths_bar.load_javascript()
top_15_deaths_bar.render_notebook()

In [None]:
# Recoveries of the same 15 countries (still ranked by confirmed cases).
top_15_recovered_bar = plot_ebar(top_15_countries, "Recovered")
top_15_recovered_bar.load_javascript()
top_15_recovered_bar.render_notebook()

## Tendency

### Functions for trend analysis

#### Make trend table

In [None]:
def make_trend_table(country: str):
    """Build the per-date case totals of one country from ``covid_19_data``.

    All provinces/states of the country are summed for each observation date.

    Args:
        country: Value to match exactly in the ``CountryRegion`` column.

    Returns:
        DataFrame with a datetime ``ObservationDate`` column and the summed
        ``Confirmed``, ``Deaths``, ``Recovered`` and ``Active`` columns, one
        row per date in chronological order.
    """
    country_rows = covid_19_data[covid_19_data['CountryRegion'] == country]
    # Parse the dates before grouping so groupby orders the groups by time rather
    # than by the text of 'MM/DD/YYYY'. assign() returns a new frame, so the
    # filtered slice of covid_19_data is never written to.
    country_rows = country_rows.assign(
        ObservationDate=pd.to_datetime(country_rows["ObservationDate"], format='%m/%d/%Y')
    )
    # Columns are selected with a list: tuple selection on a GroupBy was
    # deprecated and then removed from pandas.
    line_data = (
        country_rows
        .groupby(["ObservationDate"])[['Confirmed', 'Deaths', 'Recovered', 'Active']]
        .sum()
        .reset_index()
    )

    return line_data

#### Plot trend data

In [None]:
def plot_line_trend(trend_df, country: str) -> charts.Line:
    """Draw the case trend of one country as a multi-series line chart.

    Five series are plotted against ``ObservationDate``: Confirmed, Recovered,
    Deaths, Active and New Confirmed. The chart is also written to
    ``Images/{country}-trend.html`` and that HTML file is handed to
    ``make_snapshot`` to produce ``Images/{country}-trend.png``. The subtitle
    reads the notebook-level ``start_obser_date`` and ``latest_obser_date``.

    Args:
        trend_df: DataFrame with ``ObservationDate``, ``Confirmed``, ``Deaths``,
            ``Recovered``, ``Active`` and ``NewConfirmed`` columns.
        country: Name used in the chart title and in the output file names.

    Returns:
        The configured ``charts.Line`` instance.
    """
    dates = trend_df["ObservationDate"].astype(str).to_list()

    # (legend name, source column) in the order the series are added to the chart.
    series_layout = [
        ("Confirmed", "Confirmed"),
        ("Recovered", "Recovered"),
        ("Deaths", "Deaths"),
        ("Active", "Active"),
        ("New Confirmed", "NewConfirmed"),
    ]
    # Pull every column up front so a missing column fails before any chart is built.
    series_values = [
        (legend_name, trend_df[column].to_list())
        for legend_name, column in series_layout
    ]

    line = charts.Line(init_opts=opts.InitOpts(
        theme=ThemeType.LIGHT,
        width="1350px",
        height="800px"
    ))
    line.add_xaxis(xaxis_data=dates)
    for legend_name, values in series_values:
        line.add_yaxis(
            series_name=legend_name,
            y_axis=values,
            label_opts=opts.LabelOpts(is_show=False),
            linestyle_opts=opts.LineStyleOpts(width=3),
        )
    line.set_global_opts(
        title_opts=opts.TitleOpts(
            title=f"COVID-19 Trend of {country}",
            subtitle=f"from {start_obser_date} to {latest_obser_date}"
        ),
        # One tooltip for the hovered date, listing all series.
        tooltip_opts=opts.TooltipOpts(trigger="axis"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
        xaxis_opts=opts.AxisOpts(type_="category", boundary_gap=False),
    )

    # Render to HTML first; make_snapshot turns that file into the PNG.
    output_stem = f"{country}-trend"
    html_path = line.render(f"Images/{output_stem}.html")
    make_snapshot(snapshot, html_path, f"Images/{output_stem}.png")

    return line
    

### China Trend

In [None]:
# Per-date totals for Mainland China plus the day-over-day NewConfirmed column; displayed as a table.
china_line_data = make_trend_table('Mainland China')
china_line_data = cal_new_confirmed(china_line_data)
china_line_data

In [None]:
# Plot the Mainland China trend (plot_line_trend also saves it under Images/) and render it inline.
china_line = plot_line_trend(china_line_data, 'Mainland China')
china_line.load_javascript()
china_line.render_notebook()

### Singapore Trend

In [None]:
# Singapore: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
singapore_line_data = make_trend_table('Singapore')
singapore_line_data = cal_new_confirmed(singapore_line_data)
singapore_line = plot_line_trend(singapore_line_data, 'Singapore')
singapore_line.load_javascript()
singapore_line.render_notebook()

### Japan Trend

In [None]:
# Japan: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
japan_line_data = make_trend_table('Japan')
japan_line_data = cal_new_confirmed(japan_line_data)
japan_line = plot_line_trend(japan_line_data, 'Japan')
japan_line.load_javascript()
japan_line.render_notebook()

### South Korea Trend

In [None]:
# South Korea: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
south_korea_line_data = make_trend_table('South Korea')
south_korea_line_data = cal_new_confirmed(south_korea_line_data)
south_korea_line = plot_line_trend(south_korea_line_data, 'South Korea')
south_korea_line.load_javascript()
south_korea_line.render_notebook()

### India Trend

In [None]:
# India: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
india_line_data = make_trend_table('India')
india_line_data = cal_new_confirmed(india_line_data)
india_line = plot_line_trend(india_line_data, 'India')
india_line.load_javascript()
india_line.render_notebook()

### United Kingdom Trend

In [None]:
# United Kingdom (stored as 'UK' in CountryRegion): build totals, add NewConfirmed, plot, save and render inline.
uk_line_data = make_trend_table('UK')
uk_line_data = cal_new_confirmed(uk_line_data)
uk_line = plot_line_trend(uk_line_data, 'UK')
uk_line.load_javascript()
uk_line.render_notebook()

### Italy Trend

In [None]:
# Italy: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
italy_line_data = make_trend_table('Italy')
italy_line_data = cal_new_confirmed(italy_line_data)
italy_line = plot_line_trend(italy_line_data, 'Italy')
italy_line.load_javascript()
italy_line.render_notebook()

### Spain Trend

In [None]:
# Spain: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
spain_line_data = make_trend_table('Spain')
spain_line_data = cal_new_confirmed(spain_line_data)
spain_line = plot_line_trend(spain_line_data, 'Spain')
spain_line.load_javascript()
spain_line.render_notebook()

### Australia Trend

In [None]:
# Australia: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
australia_line_data = make_trend_table('Australia')
australia_line_data = cal_new_confirmed(australia_line_data)
australia_line = plot_line_trend(australia_line_data, 'Australia')
australia_line.load_javascript()
australia_line.render_notebook()

### France Trend

In [None]:
# France: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
france_line_data = make_trend_table('France')
france_line_data = cal_new_confirmed(france_line_data)
france_line = plot_line_trend(france_line_data, 'France')
france_line.load_javascript()
france_line.render_notebook()

### United States Trend

In [None]:
# United States (stored as 'US' in CountryRegion): build totals, add NewConfirmed, plot, save and render inline.
us_line_data = make_trend_table('US')
us_line_data = cal_new_confirmed(us_line_data)
us_line = plot_line_trend(us_line_data, 'US')
us_line.load_javascript()
us_line.render_notebook()

### Hong Kong Trend

In [None]:
# Hong Kong: build the per-date totals, add NewConfirmed, then plot, save under Images/ and render inline.
hk_line_data = make_trend_table('Hong Kong')
hk_line_data = cal_new_confirmed(hk_line_data)
hk_line = plot_line_trend(hk_line_data, 'Hong Kong')
hk_line.load_javascript()
hk_line.render_notebook()

## Make Pie Chart function

In [None]:
def plot_grouping_pie_chart(grouped_df, group_name: str) -> charts.Pie:
    """Draw the share of each group as a ring-shaped pie chart.

    The chart is also written to
    ``Images/{group_name}-grouping-percentage.html`` and that HTML file is
    handed to ``make_snapshot`` to produce the matching ``.png``. The subtitle
    reads the notebook-level ``start_obser_date`` and ``latest_obser_date``.

    Args:
        grouped_df: DataFrame with a ``group_name`` column (slice labels) and a
            ``Percent`` column (slice values).
        group_name: Name of the label column; also used in the title and in
            the output file names.

    Returns:
        The configured ``charts.Pie`` instance.
    """
    # One [label, value] pair per slice.
    slices = [
        [label, share]
        for label, share in zip(grouped_df[group_name], grouped_df["Percent"])
    ]

    pie = charts.Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    pie.add(
        "", slices,
        radius=["40%", "75%"],
        rosetype="percentages"
    )
    pie.set_global_opts(
        title_opts=opts.TitleOpts(
            title=f"COVID-19 Confirmed {group_name} Group",
            subtitle=f"from {start_obser_date} to {latest_obser_date}"
        ),
        legend_opts=opts.LegendOpts(
            orient='vertical',
            is_show=True,
            pos_right=10,
            pos_top=50
        ),
    )
    # Slice labels read "<name>: <value>".
    pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))

    # Render to HTML first; make_snapshot turns that file into the PNG.
    output_stem = f"Images/{group_name}-grouping-percentage"
    html_path = pie.render(f"{output_stem}.html")
    make_snapshot(snapshot, html_path, f"{output_stem}.png")
    return pie

In [None]:
def make_pie_df(a_df):
    """Turn a Series of group sizes into a frame with ``Count`` and ``Percent``.

    Args:
        a_df: Series of counts whose (named) index holds the group labels,
            e.g. the result of ``groupby(...).size()``.

    Returns:
        DataFrame with the former index as a column, a ``Count`` column and a
        ``Percent`` column holding each count's share of the total as a
        fraction (0-1) rounded to 3 decimals.
    """
    new_grouped = a_df.reset_index(name="Count")
    # Vectorised share of the total; this also avoids shadowing the builtin sum().
    total_count = new_grouped["Count"].sum()
    new_grouped["Percent"] = (new_grouped["Count"] / total_count).round(3)
    return new_grouped

## Age Group

### Process Data

In [None]:
# Load the per-patient line list (read_file replaces missing values with 0).
COVID19_open_line_list = read_file("COVID19_open_line_list.csv")
# Rows whose ID is 0 had no ID in the file (0 is the fill value), so they are dropped.
null_index = COVID19_open_line_list[COVID19_open_line_list["ID"] == 0.0].index
COVID19_open_line_list.drop(null_index, inplace=True)
# Drop every column whose name starts with "Unnamed" or "admin".
COVID19_open_line_list = COVID19_open_line_list.loc[:, ~COVID19_open_line_list.columns.str.contains('^Unnamed')]
COVID19_open_line_list = COVID19_open_line_list.loc[:, ~COVID19_open_line_list.columns.str.contains('^admin')]
# Normalise the capitalisation of the sex labels (applied to matching cells in any column).
COVID19_open_line_list = COVID19_open_line_list.replace({
    "male" : "Male",
    "female" : "Female",
})

In [None]:
def _age_to_float(raw_age):
    """Convert one raw age entry to a number; a range "A-B" becomes int((A + B) / 2)."""
    age_text = str(raw_age)
    if '-' in age_text:
        age_range = age_text.split('-')
        lower = int(age_range[0])
        upper = int(age_range[1])
        return float(int((lower + upper) / 2))
    return float(age_text)


# Convert every entry in a single pass. The earlier loop called Series.replace once per
# range value, rescanning the whole column each time.
# Entries that are neither a number nor an "A-B" range still raise ValueError.
age_series = COVID19_open_line_list["age"].map(_age_to_float)

COVID19_open_line_list["age"] = age_series
COVID19_open_line_list

In [None]:
# Bin edges and the label of each resulting interval (one label per pair of adjacent edges).
list_bins = [1, 10, 20, 30, 40, 50, 60, 70, 80, 100]
list_label = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-100']

# include_lowest=True makes the first interval include its left edge (age 1).
# NOTE(review): ages below 1, including the 0 used as fill value for missing ages,
# fall outside every bin and are left ungrouped even though the first label reads '0-10' - confirm this is intended.
age_grouped = pd.cut(COVID19_open_line_list["age"], bins=list_bins, labels=list_label, include_lowest=True)

# Count the patients in each age bin.
age_group_df = pd.Series(age_grouped, name=("Age")).to_frame()
age_group_df = age_group_df.groupby(["Age"]).size()

# Add the Count / Percent columns expected by plot_grouping_pie_chart.
new_age_grouped = make_pie_df(age_group_df)
new_age_grouped

### Plot Pie Chart

In [None]:
# Pie chart of the age groups; plot_grouping_pie_chart also saves it under Images/.
age_pie = plot_grouping_pie_chart(new_age_grouped, "Age")
age_pie.load_javascript()
age_pie.render_notebook()

## Gender Group

In [None]:
# Count the patients per value of the "sex" column.
gender_group = COVID19_open_line_list.groupby("sex")
gender_group_count = gender_group.size()
gender_group_count.index.name = "Gender"
gender_group_count = make_pie_df(gender_group_count)
# A label of 0 is the fill value read_file uses for a missing sex. Relabel it in the
# Gender column only: a frame-wide replace would also overwrite any Count or Percent
# that happens to be 0 (for example a tiny share rounded down to 0.0).
gender_group_count["Gender"] = gender_group_count["Gender"].replace(0, "Not Reported")
gender_group_count

In [None]:
# Pie chart of the gender groups; plot_grouping_pie_chart also saves it under Images/.
gender_pie = plot_grouping_pie_chart(gender_group_count, "Gender")
gender_pie.load_javascript()
gender_pie.render_notebook()

## Map
Reference: [A Complete Guide to an Interactive Geographical Map using Python](https://towardsdatascience.com/a-complete-guide-to-an-interactive-geographical-map-using-python-f4c5197e23e0)

In [None]:
# Translate the dataset's country names to the names used by the "world" map so that
# every country is matched to its region on the chart.
plot_geo_df = basic_static
plot_geo_df = plot_geo_df.replace({
    "US" : "United States",
    "Mainland China" : "China",
    "UK" : "United Kingdom",
    # Kinshasa is the capital of the Democratic Republic of the Congo and Brazzaville
    # that of the Republic of the Congo; these two targets were previously swapped.
    "Congo (Kinshasa)" : "Dem. Rep. Congo",
    "Congo (Brazzaville)" : "Congo",
    "Burma" : "Myanmar",
    "South Sudan" : "S. Sudan",
    "Central African Republic" : "Central African Rep.",
    "Western Sahara" : "W. Sahara",
    "South Korea" : "Korea",
    "Czech Republic" : "Czech Rep.",
    "Dominican Republic" : "Dominican Rep.",
    "Ivory Coast" : "Côte d'Ivoire",
    "Laos" : "Lao PDR",
    "North Macedonia" : "Macedonia",
    "Bosnia and Herzegovina" : "Bosnia and Herz.",
    "Equatorial Guinea" : "Eq. Guinea"
})

# Region names and the three case counts consumed by plot_map.
locate = plot_geo_df["CountryRegion"]
confirmed_cases = plot_geo_df["Confirmed"].astype(int)
recovered_cases = plot_geo_df["Recovered"].astype(int)
deaths_cases = plot_geo_df["Deaths"].astype(int)



In [None]:
def plot_map(cases, type_str, visual_max=300000):
    """Draw one case type on a world map, coloured by case count.

    Region names come from the notebook-level ``locate`` Series and are paired
    with ``cases`` by position. The chart is also written to
    ``Images/COVID-19-Global-{type_str}-Cases.html`` and that HTML file is
    handed to ``make_snapshot`` to produce the matching ``.png``. The subtitle
    reads the notebook-level ``start_obser_date`` and ``latest_obser_date``.

    Args:
        cases: Series of case counts, in the same row order as ``locate``.
        type_str: Case type; used in the series name, title and file names.
        visual_max: Upper bound of the colour scale. Defaults to 300000, the
            value that used to be hard-coded; pass a smaller bound for case
            types with lower counts so the colours remain distinguishable.

    Returns:
        The configured ``charts.Map`` instance.
    """
    file_name = f"COVID-19-Global-{type_str}-Cases"

    # Named world_map so the builtin map() is not shadowed.
    world_map = (
        charts.Map(init_opts=opts.InitOpts(
                theme=ThemeType.LIGHT,
                width="1350px",
                height="800px"
        ))
        .add(
            f"World {type_str} Cases",
            [list(z) for z in zip(locate.to_list(), cases.to_list())],
            "world",
            is_map_symbol_show=False,
            zoom=1.2
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title=file_name.replace('-', " "),
                subtitle=f"from {start_obser_date} to {latest_obser_date}"
            ),
            visualmap_opts=opts.VisualMapOpts(max_=visual_max),
        )
    )

    # Render to HTML first; make_snapshot turns that file into the PNG.
    make_snapshot(
            snapshot,
            world_map.render(f"Images/{file_name}.html"),
            f"Images/{file_name}.png"
        )
    return world_map

In [None]:
# World map of confirmed cases. The label is spelled "Confirmed" (it was "Comfirmed"),
# because it ends up in the chart title and in the saved file names.
world_confirmed_map = plot_map(confirmed_cases, "Confirmed")
world_confirmed_map.load_javascript()
world_confirmed_map.render_notebook()

In [None]:
# World map of recovered cases; plot_map also saves it under Images/.
world_recovered_map = plot_map(recovered_cases, "Recovered")
world_recovered_map.load_javascript()
world_recovered_map.render_notebook()

In [None]:
# World map of deaths. Render the deaths map itself; this cell previously displayed
# world_recovered_map again.
world_deaths_map = plot_map(deaths_cases, "Deaths")
world_deaths_map.load_javascript()
world_deaths_map.render_notebook()

#### Calculate US time series confirmed cases

In [None]:
def cal_us_series_sum(a_df):
    """Add a ``Sum`` column with the row-wise total of the non-metadata columns.

    Every column other than the identifier/location columns listed below is
    included in the total.
    NOTE(review): any extra non-date numeric column in the input would be
    summed as well - confirm against the actual CSV headers.

    Args:
        a_df: US time-series DataFrame containing all listed metadata columns
            (a missing one raises KeyError).

    Returns:
        The same DataFrame object, modified in place with the ``Sum`` column.
    """
    metadata_columns = [
        "UID", "iso2", "iso3", "code3", "FIPS", "Admin2",
        "Province_State", "Country_Region", "Lat", "Long_", "Combined_Key",
    ]
    a_df["Sum"] = a_df.drop(labels=metadata_columns, axis="columns").sum(axis="columns")
    return a_df

In [None]:
# Load the US confirmed / deaths time series and add a per-row "Sum" column to each.
time_series_covid_19_confirmed_US = cal_us_series_sum(read_file("time_series_covid_19_confirmed_US.csv"))
time_series_covid_19_deaths_US = cal_us_series_sum(read_file("time_series_covid_19_deaths_US.csv"))

#### Combine the global and US time series data

In [None]:
# Reload both confirmed-case time series. The US file is re-read, so this variable no
# longer carries the "Sum" column that cal_us_series_sum may have added to it.
time_series_covid_19_confirmed = read_file("time_series_covid_19_confirmed.csv")
time_series_covid_19_confirmed_US = read_file("time_series_covid_19_confirmed_US.csv")

# Stack the global rows on top of the US rows. pd.concat replaces Series.append (removed
# in pandas 2.0), and ignore_index=True gives every stacked row a unique label so the
# location columns and the date columns line up row by row.
lat = pd.concat(
    [time_series_covid_19_confirmed["Lat"], time_series_covid_19_confirmed_US["Lat"]],
    ignore_index=True)
lon = pd.concat(
    [time_series_covid_19_confirmed["Long"], time_series_covid_19_confirmed_US["Long_"]],
    ignore_index=True)
ProvinceState = pd.concat(
    [time_series_covid_19_confirmed["ProvinceState"].astype(str),
     time_series_covid_19_confirmed_US["Province_State"].astype(str)],
    ignore_index=True)
CountryRegion = pd.concat(
    [time_series_covid_19_confirmed["CountryRegion"].astype(str),
     time_series_covid_19_confirmed_US["Country_Region"].astype(str)],
    ignore_index=True)

# Date columns are the ones whose header contains '20'.
time_series_o = time_series_covid_19_confirmed.loc[:, time_series_covid_19_confirmed.columns.str.contains('20')]
time_series_US = time_series_covid_19_confirmed_US.loc[:, time_series_covid_19_confirmed_US.columns.str.contains('20')]
# Stack the daily counts too. Adding the two frames ("+") summed row i of one file with
# the unrelated row i of the other instead of combining the two data sets.
# Dates present in only one file become 0.
time_series = pd.concat([time_series_o, time_series_US], ignore_index=True, sort=False).fillna(0)
# NOTE(review): if the global file already contains a row for the US, US cases are
# represented twice in the combined table - confirm.

all_time_series = {
    "CountryRegion": CountryRegion,
    "ProvinceState": ProvinceState,
    "Latitude": lat,
    "Longitude": lon,
    }
all_time_series_df = pd.DataFrame(all_time_series)
all_time_series_df = all_time_series_df.join(time_series)

# Row-wise total over the date columns only.
all_number_time_series_df = all_time_series_df.drop(
    columns=[
             "ProvinceState", "CountryRegion", "Latitude", "Longitude"
             ]
)
all_time_series_df["Sum"] = all_number_time_series_df.sum(axis=1).astype(int)



In [None]:
# Display the combined global + US time-series table.
all_time_series_df