In [3]:
import datetime
import pandas as pd

# Build one province-level table for China from the daily report CSVs of the
# CSSEGISandData/COVID-19 GitHub repository (one file per day, MM-DD-YYYY.csv).

# Date range of the data source; both ends are included.
start = datetime.date(2021, 1, 1)
end = datetime.date(2023, 3, 9)

# Folder holding the daily reports (GitHub raw content).
base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"

# Only these columns of each daily report are used below; reading just them
# keeps every per-day frame small.
report_columns = ["Province_State", "Country_Region", "Confirmed", "Deaths", "Recovered"]

# One frame of China rows per day. Filtering each file before concatenating
# avoids holding every country's rows for the whole date range in memory.
china_frames = []

for res_date in pd.date_range(start, end, freq="D"):
    # The file name embeds the report date as MM-DD-YYYY.
    string_date = res_date.strftime("%m-%d-%Y")
    url = base_url + string_date + ".csv"

    # No try/except here on purpose: a missing or unreachable report stops the
    # run instead of leaving a silent gap in the time series.
    daily_df = pd.read_csv(url, usecols=report_columns)

    # assign() returns a new frame (it does not modify in place), so the result
    # has to be kept. The report date is attached directly as a timestamp.
    china_df = daily_df[daily_df["Country_Region"] == "China"].assign(Date=res_date)
    china_frames.append(china_df)

# Stack the per-day frames and fix the column order.
integrate_df = pd.concat(china_frames, ignore_index=True)
integrate_df = integrate_df[["Province_State", "Country_Region", "Date", "Confirmed", "Deaths", "Recovered"]]

# Sum per province and day. as_index=False keeps the keys as ordinary columns
# with a fresh 0..n-1 index, so no reset_index() is needed afterwards.
# NOTE(review): groupby drops rows whose Province_State is missing (default
# dropna=True), and sum() skips NaN, so a Recovered value that is absent in
# the source appears as 0.0 here - confirm this is the intended reading.
integrate_df = integrate_df.groupby(["Country_Region", "Province_State", "Date"], as_index=False).sum()

integrate_df.head()

Unnamed: 0,Country_Region,Province_State,Date,Confirmed,Deaths,Recovered
0,China,Anhui,2021-01-01,993,6,986.0
1,China,Anhui,2021-01-02,993,6,986.0
2,China,Anhui,2021-01-03,993,6,986.0
3,China,Anhui,2021-01-04,993,6,986.0
4,China,Anhui,2021-01-05,993,6,986.0


In [4]:
# The aggregated Recovered column is displayed as float (e.g. 986.0);
# store it as integers instead.
recovered_counts = integrate_df["Recovered"].astype(int)
integrate_df["Recovered"] = recovered_counts
integrate_df

Unnamed: 0,Country_Region,Province_State,Date,Confirmed,Deaths,Recovered
466,China,Anhui,2022-04-12,1056,6,0.0
1264,China,Beijing,2022-04-12,1826,9,0.0
2062,China,Chongqing,2022-04-12,693,6,0.0
2860,China,Fujian,2022-04-12,2929,1,0.0
3658,China,Gansu,2022-04-12,681,2,0.0
4456,China,Guangdong,2022-04-12,6749,8,0.0
5254,China,Guangxi,2022-04-12,1532,2,0.0
6052,China,Guizhou,2022-04-12,178,2,0.0
6850,China,Hainan,2022-04-12,280,6,0.0
7648,China,Hebei,2022-04-12,1982,7,0.0


In [None]:
# Write the result to disk. mode="w" is to_csv's default and is spelled out
# here: an existing file with this name is truncated and overwritten.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (to_csv default index=True); pass index=False if that column is not wanted.
integrate_df.to_csv("covid_data_china_Province.csv", mode="w")