In [None]:
import geopandas as gpd
import geoplot as gplt
import pandas as pd 

## Pre-work: combine two datasets and produce a grouped dataset

In [None]:
# Boundary polygons for China (the source is not recorded in the notebook; the
# file name suggests GADM v3.6, level 1 -- TODO confirm).
gdf_chprov = gpd.read_file('gadm36_CHN_1.json')

# COVID-19 time series, downloaded from
# https://www.kaggle.com/datasets/lihyalan/2020-corona-virus-timeseries
# Only the rows for China are kept and the 'country_flag' column is dropped.
df_chncovd = (
    pd.read_csv('COVID-19_geo_timeseries_ver_0311.csv')
    .loc[lambda covid: covid['country'] == 'China']
    .drop(columns=['country_flag'])
)

In [None]:
# Group the China records by province; the bare name on the last line shows
# the resulting GroupBy object.
provgrp = df_chncovd.groupby('province')
provgrp

In [None]:
# Replace four NAME_1 spellings in the boundary table with alternative names;
# the following cells compare NAME_1 against the COVID table's province names.
province_renames = {
    'Ningxia Hui': 'Ningxia',
    'Xinjiang Uygur': 'Xinjiang',
    'Xizang': 'Tibet',
    'Nei Mongol': 'Inner Mongolia',
}
gdf_chprov['NAME_1'] = gdf_chprov['NAME_1'].replace(province_renames)

In [None]:
# Print every province name from the COVID table that does not appear among
# the NAME_1 values of the boundary table.
provinces = df_chncovd['province'].unique()
geo_povs = gdf_chprov['NAME_1'].unique()
for unmatched in (name for name in provinces if name not in geo_povs):
    print(unmatched)

In [None]:
# The reverse check: print every NAME_1 value of the boundary table that does
# not appear among the COVID table's province names.
for unmatched in (name for name in geo_povs if name not in provinces):
    print(unmatched)

In [None]:
# Display the boundary table.
gdf_chprov

In [None]:
# Convert the 'update_time' column with pd.to_datetime, overwriting the column
# on the existing frame.
df_chncovd['update_time'] = pd.to_datetime(df_chncovd['update_time'])

In [None]:
# Hong Kong records in chronological order; preview the first rows.
hk_rows = df_chncovd['province'].eq('Hong Kong')
dfhk = df_chncovd.loc[hk_rows].sort_values('update_time')
dfhk.head()

In [None]:
# Count how many Hong Kong rows each data source contributes.
dfhk['data_source'].value_counts()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot confirmed cases over time for the Hong Kong rows whose data_source is 'jhu'.
dfjhu = dfhk.loc[dfhk['data_source'] == 'jhu']

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(dfjhu['update_time'], dfjhu['confirmed_cases'])
ax.set_xlabel('Date')
ax.set_ylabel('Confirmed Cases')
plt.show()

### Exercise 1: Create a line chart showing the confirmed-case trend lines of Shanghai and Hong Kong from the jhu data source; add a legend to make clear which line is which (2 mins)

In [None]:
# Records for each place, in chronological order.
dfhk = df_chncovd.loc[df_chncovd['province'] == 'Hong Kong'].sort_values('update_time')
dfsh = df_chncovd.loc[df_chncovd['province'] == 'Shanghai'].sort_values('update_time')

# Only the rows whose data_source is 'jhu'.
dfhkjhu = dfhk.loc[dfhk['data_source'] == 'jhu']
dfshjhu = dfsh.loc[dfsh['data_source'] == 'jhu']

# One labelled line per place on a shared set of axes.
fig, ax = plt.subplots(figsize=(14, 7))
for place, records in (('Hong Kong', dfhkjhu), ('Shanghai', dfshjhu)):
    ax.plot(records['update_time'], records['confirmed_cases'], label=place)
ax.set_xlabel('Date')
ax.set_ylabel('Confirmed Cases')
ax.legend()
plt.show()

## Introduction to Geopandas and Shapely
Based on the geoplot user guide, "Working with Geospatial Data": https://residentmario.github.io/geoplot/user_guide/Working_with_Geospatial_Data.html

In [None]:
# Display the geometry column of the boundary table.
gdf_chprov.geometry

In [None]:
# Keep a handle on the geometry column; the export cells below write it to disk.
geosrs = gdf_chprov.geometry

In [None]:
# Write the geometries to 'chnprov.geojson' using the GeoJSON driver.
geosrs.to_file('chnprov.geojson', driver='GeoJSON')

In [None]:
# Display the coordinate reference system attached to the boundary table.
gdf_chprov.crs

In [None]:
# Read the GeoJSON file back in and display it.
gpd.read_file('chnprov.geojson')

In [None]:
# Display fiona's `supported_drivers`.
import fiona
fiona.supported_drivers

In [None]:
# Write the same geometries to 'geoshp.shp' using the ESRI Shapefile driver.
geosrs.to_file('geoshp.shp', driver='ESRI Shapefile')

In [None]:
# Read the shapefile back in and display it.
gpd.read_file('geoshp.shp')

### More on Geometric Processing
- Spherical Coordinates
- Cartesian Coordinates
<br> Parallel postulate

In [None]:
# Resolve the path of geoplot's 'nyc_map_pluto_sample' dataset, load it and
# display the result.
nyc_sample_path = gplt.datasets.get_path('nyc_map_pluto_sample')
nyc_map_pluto_sample = gpd.read_file(nyc_sample_path)
nyc_map_pluto_sample

In [None]:
# Display the coordinate reference system of the NYC sample.
nyc_map_pluto_sample.crs

In [None]:
# Preview the first four rows of the boundary table.
gdf_chprov.head(4)

In [None]:
# Take the geometry stored under row label 4 and compute its convex hull.
thepoly = gdf_chprov.geometry.loc[4]
thecvh = thepoly.convex_hull

In [None]:
# Display the selected geometry.
thepoly

In [None]:
# Show which geometry class the selected shape is an instance of.
type(thepoly)

- Point
- MultiPoint
- LineString
- Polygon
- MultiPolygon

In [None]:
# Display the convex hull computed from `thepoly`.
thecvh

In [None]:
import geoplot.crs as gcrs

# Geometry under row label 0 and its centroid (neither is used by the plot below).
poly = gdf_chprov.geometry[0]
ctr = poly.centroid

# Points: the centroids of `thepoly` and of its convex hull.
# Outlines: `thepoly` itself and the hull.
centroid_points = gpd.GeoDataFrame(geometry=[thepoly.centroid, thecvh.centroid])
outline_shapes = gpd.GeoDataFrame(geometry=[thepoly, thecvh])

# Draw the points first, then overlay the outlines on the same axes.
ax = gplt.pointplot(centroid_points, figsize=(7, 7))
ax = gplt.polyplot(outline_shapes, ax=ax)
plt.show()

In [None]:
# gdf_chprov[gdf_chprov['NAME_1'].isin(['Shanghai', 'Jiangsu'])].geometry

In [None]:
# Select each province's geometry by its NAME_1 value rather than by a
# hard-coded row label, so the cell keeps returning the intended shapes even if
# the row order or index of the boundary table changes.
Shply = gdf_chprov.loc[gdf_chprov['NAME_1'] == 'Shanghai'].geometry.iloc[0]
Jsply = gdf_chprov.loc[gdf_chprov['NAME_1'] == 'Jiangsu'].geometry.iloc[0]

In [None]:
# Display the geometry bound to `Shply`.
Shply

In [None]:
# Display the geometry bound to `Jsply`.
Jsply

In [None]:
# Draw the outlines of the two selected geometries together in one figure.
two_province_outlines = gpd.GeoDataFrame(geometry=[Shply, Jsply])
gplt.polyplot(two_province_outlines, figsize=(7, 7))
plt.show()

In [None]:
from shapely.ops import triangulate

In [None]:
# Display `Shply` again for comparison with the triangulated plot in the next cell.
Shply

In [None]:
# Triangulate `Shply` with shapely.ops.triangulate and draw the resulting triangles.
shply_triangles = triangulate(Shply)
gplt.polyplot(gpd.GeoDataFrame(geometry=shply_triangles), figsize=(7, 7))
plt.show()

## Make Choropleth for the total number of confirmed cases

In [None]:
import numpy as np

In [None]:
# Display the China COVID table.
df_chncovd

In [None]:
# For every province keep only the row(s) stamped with that province's most
# recent update_time, and retain the case columns plus the province name.
#
# groupby().transform('max') broadcasts each province's latest timestamp back
# onto its rows, so a plain boolean mask does the selection. This avoids
# groupby().apply() with a lambda that relies on the grouping column being
# present in the result, and it ignores missing timestamps when taking the max.
latest_update = df_chncovd.groupby('province')['update_time'].transform('max')
df_confirmed = df_chncovd.loc[
    df_chncovd['update_time'] == latest_update,
    ['confirmed_cases', 'deaths', 'recovered', 'province'],
]

In [None]:
# Keep rows with a positive confirmed-case count, then index the table by
# province name.
has_cases = df_confirmed['confirmed_cases'] > 0
df_confirmed = df_confirmed.loc[has_cases].set_index('province')

In [None]:
# Put the boundary table (indexed by NAME_1) and the case table (indexed by
# province) side by side, aligned on the province name.
gdf_ch = pd.concat(
    [gdf_chprov.set_index('NAME_1'), df_confirmed], axis=1
)
# Drop provinces that are missing on either side: a row must have a geometry
# and a value in every case column. Restricting dropna() to these columns keeps
# provinces that merely have a blank in an unrelated boundary attribute column.
gdf_ch = gdf_ch.dropna(subset=[gdf_chprov.geometry.name, *df_confirmed.columns])

In [None]:
# Add the natural log of the confirmed-case counts as a new column; it is the
# hue variable of the map.
gdf_ch['cfmed_log'] = np.log(gdf_ch['confirmed_cases'])

# Choropleth coloured by the log counts, with a legend.
fig, ax = plt.subplots(figsize=(18, 10))
gplt.choropleth(gdf_ch, hue='cfmed_log', legend=True, ax=ax)
plt.title("Log Confirmed Cases", fontsize=20)
plt.show()

### Exercise 2: Create a choropleth of confirmed deaths using a log scale