In [10]:
import datetime
import pandas as pd

#Create a empty dataframe for combining all csvs
integrate_df = pd.DataFrame()

#Define start date and end date, which is the date range of the datasource
start = datetime.date(2021, 1, 1)
end = datetime.date(2023, 3, 9)
res_date = start

#loop each date within the range to extract the csv for the day, and do data cleaning
while res_date <= end:
    #print(res_date)
    #Turn the datetime object into string for url concatenation
    string_date = res_date.strftime("%m-%d-%Y")
    
    #url of github raw content
    url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/" + string_date + ".csv"
    # print(url)
    
    #Add 1 day to retrieve the next day's data
    res_date += datetime.timedelta(days=1)

#Data cleaning of csv for each day
    #Read csv file
    df1 = pd.read_csv(url)#.fillna(0) Since there is no NaN data, fillna function is commented
    
    #Only select columns that are needed, and extract China only
    df2 = df1[df1["Country_Region"] == "China"][["Country_Region", "Last_Update","Confirmed","Deaths","Recovered"]]
    
    #Group by country to obtain the overall data of China
    df = df2.groupby(['Country_Region','Last_Update'], as_index = False).agg('sum')
    
    #Modify the date format for analysis
    df['Last_Update'] = pd.to_datetime(df['Last_Update'])
    #Why df['Last_Update'] = pd.to_datetime(df['Last_Update'],format='%Y-%m-%d') would encounter error?

    df['Date'] = df['Last_Update'] - pd.Timedelta(1,"d")

    #Format the dates
    df['Last_Update'] = df['Last_Update'].dt.strftime('%Y-%m-%d')
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

    #Concat the csv files together
    integrate_df = pd.concat([integrate_df, df], ignore_index=True)

#Drop duplicates
integrate_df.drop_duplicates(subset = ['Date'],inplace = True)
integrate_df.reset_index()
    

integrate_df.head()
  
    

Unnamed: 0,Country_Region,Last_Update,Confirmed,Deaths,Recovered,Date
0,China,2021-01-02,102649,4783,90031.0,2021-01-01
1,China,2021-01-03,102731,4784,90099.0,2021-01-02
2,China,2021-01-04,102808,4784,90159.0,2021-01-03
3,China,2021-01-05,102930,4785,90213.0,2021-01-04
4,China,2021-01-06,103026,4787,90306.0,2021-01-05


In [5]:
integrate_df.info

<bound method DataFrame.info of     Country_Region Last_Update  Confirmed  Deaths  Recovered
0            China  2021-01-02     102649    4783    90031.0
1            China  2021-01-03     102731    4784    90099.0
2            China  2021-01-04     102808    4784    90159.0
3            China  2021-01-05     102930    4785    90213.0
4            China  2021-01-06     103026    4787    90306.0
..             ...         ...        ...     ...        ...
793          China  2023-03-06    4903524  101054        0.0
794          China  2023-03-07    4903524  101055        0.0
795          China  2023-03-08    4903524  101055        0.0
796          China  2023-03-09    4903524  101055        0.0
797          China  2023-03-10    4903524  101056        0.0

[798 rows x 5 columns]>

In [9]:
integrate_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 798 entries, 0 to 797
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country_Region  798 non-null    object 
 1   Last_Update     798 non-null    object 
 2   Confirmed       798 non-null    int64  
 3   Deaths          798 non-null    int64  
 4   Recovered       798 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 31.3+ KB
