#### Title: COVID-19 Data Visualisation - Data Wrangling
#### Author: Dariush Riazati

#### Revisions
- Set increment to zero if today's figure was revised down compared to yesterday.
- Reset nulls to 0 after joining the data sets

### Import and install necessary Python packages

In [7]:
import pandas as pd
import datetime
import sys
#!{sys.executable} -m pip install plotly

### Global Variables

In [8]:
'''
Generate consistent column names for each day that can later be parsed as valid dates.
The same day-column names are used for all three data sets.
'''
start_date = datetime.datetime(2020, 1, 22)
end_date = datetime.datetime(2020, 4, 13)
oneDay = datetime.timedelta(days=1)

# One "day-month-year" label (no zero padding) per day from start_date to
# end_date inclusive. The labels are derived from offsets so that start_date
# keeps its value after this cell has run.
date_col_names = [
    "{}-{}-{}".format(day.day, day.month, day.year)
    for day in (
        start_date + offset * oneDay
        for offset in range((end_date - start_date).days + 1)
    )
]

# Fixed descriptive columns followed by one column per day.
all_columns = ['State', 'Country', 'Lat', 'Long'] + date_col_names

### Re-usable functions

In [9]:
def get_cols_with_null(df):
    """Return the names of the columns of ``df`` that contain a null value.

    The column names are read from ``df`` itself, so the function works for
    any data frame regardless of how many columns it has.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        list of str: the names (converted with ``str``) of every column that
        holds at least one null value, in column order. Empty if there are
        no nulls.
    """
    # Number of null entries per column, labelled by column name.
    null_counts = df.isnull().sum()
    return [str(name) for name, n_null in null_counts.items() if n_null > 0]

In [10]:
def trans_data(input_df, date_col_names):
    """Reshape the per-day columns of ``input_df`` into one row per (id, day).

    Columns are read by position, so ``input_df`` must be laid out as:
    ``len(date_col_names)`` accumulated-count columns, then the id column,
    then ``len(date_col_names)`` increment columns.

    Args:
        input_df: wide DataFrame with the layout described above.
        date_col_names: day labels, one per accumulated-count column.

    Returns:
        DataFrame with columns ``id``, ``the_date``, ``acc_count`` and
        ``inc_count``; for each input row it holds one output row per day,
        in input order.
    """
    n_days = len(date_col_names)
    col_id = []
    col_date = []
    col_acc_cnt = []
    col_cnt = []

    # Walk each row once as a plain tuple; indexing the tuple is positional by
    # construction and avoids building a Series for every single cell.
    for row in input_df.itertuples(index=False, name=None):
        row_id = row[n_days]
        for pos, day in enumerate(date_col_names):
            col_id.append(row_id)
            col_date.append(day)
            col_acc_cnt.append(row[pos])
            # Increment columns start right after the id column.
            col_cnt.append(row[n_days + 1 + pos])

    return pd.DataFrame({
        'id': col_id,
        'the_date': col_date,
        'acc_count': col_acc_cnt,
        'inc_count': col_cnt,
    })

In [11]:
def get_increments(in_df, date_col_names):
    """Append one daily-increment column per day to ``in_df``.

    The given figures are accumulated counts; increments are needed so that
    values can be aggregated at visualisation time. For every day label ``d``
    a column ``'X-' + d`` is added (or overwritten) on ``in_df``:

    * the first day's increment is 0;
    * every later day's increment is that day's accumulated count minus the
      previous day's, with negative differences (figures revised down)
      replaced by 0.

    Args:
        in_df: DataFrame whose first ``len(date_col_names)`` columns hold the
            accumulated counts in day order. It is modified in place.
        date_col_names: day labels, in the same order as those columns.

    Returns:
        The same ``in_df`` object, with the ``'X-'`` columns added.
    """
    n_days = len(date_col_names)

    # Read the accumulated counts by position before any column is added.
    cumulative = in_df.iloc[:, :n_days].to_numpy()
    # Day-over-day differences; the subtraction yields a new array, so the
    # clamp below never touches the data held by in_df.
    deltas = cumulative[:, 1:] - cumulative[:, :-1]
    deltas[deltas < 0] = 0

    # Whole-column assignment writes directly into in_df; it does not depend
    # on a row selection being a view of the frame or on the index labels.
    for pos, day in enumerate(date_col_names):
        in_df['X-' + day] = 0 if pos == 0 else deltas[:, pos - 1]

    return in_df

### Read COVID-19 data sets

In [12]:
# Load the confirmed, deaths and recovered time series (in that order) from
# the CSV files in the current working directory.
c1, d1, r1 = map(
    pd.read_csv,
    (
        "time_series_covid19_confirmed_global.csv",
        "time_series_covid19_deaths_global.csv",
        "time_series_covid19_recovered_global.csv",
    ),
)

### Processing Confirmed data set: Data Wrangling

In [13]:
'''
1. Rename the columns (descriptive columns followed by one column per day).
2. Identify columns with a null value.
3. State happens to be the only one; replace its missing values with the name of the country.
4. Add an 'id' column with a sequential number (equal to the index).
'''
c1.columns = all_columns
print("Before update: Names of columns with null values: ", get_cols_with_null(c1))
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the column selected via c1.State acts on that selected object and is not
# guaranteed to update c1 itself.
c1['State'] = c1['State'].fillna(c1['Country'])
print("After update: Names of columns with null values: ", get_cols_with_null(c1))
c1['id'] = c1.index

Before update: Names of columns with null values:  ['State']
After update: Names of columns with null values:  []


In [14]:
# Display the column labels of c1 after the rename and the added 'id' column.
c1.columns

Index(['State', 'Country', 'Lat', 'Long', '22-1-2020', '23-1-2020',
       '24-1-2020', '25-1-2020', '26-1-2020', '27-1-2020', '28-1-2020',
       '29-1-2020', '30-1-2020', '31-1-2020', '1-2-2020', '2-2-2020',
       '3-2-2020', '4-2-2020', '5-2-2020', '6-2-2020', '7-2-2020', '8-2-2020',
       '9-2-2020', '10-2-2020', '11-2-2020', '12-2-2020', '13-2-2020',
       '14-2-2020', '15-2-2020', '16-2-2020', '17-2-2020', '18-2-2020',
       '19-2-2020', '20-2-2020', '21-2-2020', '22-2-2020', '23-2-2020',
       '24-2-2020', '25-2-2020', '26-2-2020', '27-2-2020', '28-2-2020',
       '29-2-2020', '1-3-2020', '2-3-2020', '3-3-2020', '4-3-2020', '5-3-2020',
       '6-3-2020', '7-3-2020', '8-3-2020', '9-3-2020', '10-3-2020',
       '11-3-2020', '12-3-2020', '13-3-2020', '14-3-2020', '15-3-2020',
       '16-3-2020', '17-3-2020', '18-3-2020', '19-3-2020', '20-3-2020',
       '21-3-2020', '22-3-2020', '23-3-2020', '24-3-2020', '25-3-2020',
       '26-3-2020', '27-3-2020', '28-3-2020', '29-3-2020', '

In [15]:
'''
5. Create two subsets of the data frame.
One that includes all columns up to and including longitude, plus the id,
and another that includes from the first date column to id.
6. Add increments to the accumulated figures.
'''
c11 = pd.concat((c1.loc[:, :'Long'], c1.loc[:, 'id']), axis=1)
# Take an explicit copy: get_increments adds columns to the frame it is given,
# and that frame should be independent of c1 rather than a slice of it.
# The first day label comes from date_col_names so it cannot drift from the
# generated column names.
c12 = c1.loc[:, date_col_names[0]:'id'].copy()

c12 = get_increments(c12, date_col_names)

In [16]:
'''
7. Reshape the per-day columns into one row per (id, day).
8. Give the generic output columns their confirmed-specific names.
9. Join the descriptive subset from step 5 with the reshaped rows on 'id' to get the full data set.
'''
c12_frame = trans_data(c12, date_col_names).rename(
    columns={
        'the_date': 'confirmed_date',
        'acc_count': 'confirmed_acc_count',
        'inc_count': 'confirmed_inc_count',
    }
)
df_confirmed = c11.merge(c12_frame, on='id')

In [17]:
# Write the confirmed data set to CSV with a header row and without the index.
# NOTE(review): the destination is a hard-coded absolute Windows path; confirm
# it exists on the machine running this cell.
df_confirmed.to_csv (r'E:\Personal_Files\Dariush\monash\visual\AS3\\new\df_confirmed.csv', index = False, header=True)

In [18]:
# Sanity check: the reshaped frame must hold one row per original row per day.
print(
    "Success: The number of rows in the resulting data frame matches the number of rows "
    "in the original data frame by the number of data"
    if len(df_confirmed) == len(c1) * len(date_col_names)
    else
    "Something wrong: The number of rows in the resulting data frame does not matches with "
    "number of rows in the original data frame by the number of data"
)

Success: The number of rows in the resulting data frame matches the number of rows in the original data frame by the number of data


### Perform same data wrangling for Deaths data set.

In [19]:
'''
1. Rename the columns (descriptive columns followed by one column per day).
2. Identify columns with a null value.
3. State happens to be the only one; replace its missing values with the name of the country.
4. Add an 'id' column with a sequential number (equal to the index).
'''
d1.columns = all_columns
print("Before update: Names of columns with null values: ", get_cols_with_null(d1))
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the column selected via d1.State acts on that selected object and is not
# guaranteed to update d1 itself.
d1['State'] = d1['State'].fillna(d1['Country'])
print("After update: Names of columns with null values: ", get_cols_with_null(d1))
d1['id'] = d1.index

Before update: Names of columns with null values:  ['State']
After update: Names of columns with null values:  []


In [20]:
'''
5. Create two subsets of the data frame.
   One that includes all columns up to and including longitude, plus the id,
   and another that includes from the first date column to id.
6. Add increments to the accumulated figures.
'''
d11 = pd.concat((d1.loc[:, :'Long'], d1.loc[:, 'id']), axis=1)
# Take an explicit copy: get_increments adds columns to the frame it is given,
# and that frame should be independent of d1 rather than a slice of it.
# The first day label comes from date_col_names so it cannot drift from the
# generated column names.
d12 = d1.loc[:, date_col_names[0]:'id'].copy()

d12 = get_increments(d12, date_col_names)

In [21]:
'''
7. Reshape the per-day columns into one row per (id, day).
8. Give the generic output columns their deaths-specific names.
9. Join the descriptive subset from step 5 with the reshaped rows on 'id' to get the full data set.
'''
d12_frame = trans_data(d12, date_col_names).rename(
    columns={
        'the_date': 'death_date',
        'acc_count': 'death_acc_count',
        'inc_count': 'death_inc_count',
    }
)

df_deaths = d11.merge(d12_frame, on='id')

# Sanity check: the reshaped frame must hold one row per original row per day.
print(
    "Success: The number of rows in the resulting (Deaths) data frame matches the number of rows "
    "in the original data frame by the number of data"
    if len(df_deaths) == len(d1) * len(date_col_names)
    else
    "Something wrong: The number of rows in the resulting (Deaths) data frame does not matches with "
    "number of rows in the original data frame by the number of data"
)

Success: The number of rows in the resulting (Deaths) data frame matches the number of rows in the original data frame by the number of data


In [22]:
# Disabled plotting snippet: the code below is wrapped in a string literal, so
# it is not executed (the notebook only echoes the string).
# NOTE(review): it refers to y='death_count', but df_deaths as built above has
# 'death_acc_count' and 'death_inc_count' — fix the column name before enabling.
'''
import plotly.express as px

fig = px.bar(df_deaths, x='death_date', y='death_count',
              color='death_date',
             labels={'Confirmed Cases':'Corona Stats'}, height=400)
fig.show()
'''

"\nimport plotly.express as px\n\nfig = px.bar(df_deaths, x='death_date', y='death_count',\n              color='death_date',\n             labels={'Confirmed Cases':'Corona Stats'}, height=400)\nfig.show()\n"

### Perform same data wrangling process for recovered data set.

In [23]:
'''
1. Rename the columns (descriptive columns followed by one column per day).
2. Identify columns with a null value.
3. State happens to be the only one; replace its missing values with the name of the country.
4. Add an 'id' column with a sequential number (equal to the index).
'''
r1.columns = all_columns
print("Before update: Names of columns with null values: ", get_cols_with_null(r1))
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the column selected via r1.State acts on that selected object and is not
# guaranteed to update r1 itself.
r1['State'] = r1['State'].fillna(r1['Country'])
print("After update: Names of columns with null values: ", get_cols_with_null(r1))
r1['id'] = r1.index

Before update: Names of columns with null values:  ['State']
After update: Names of columns with null values:  []


In [24]:
'''
5. Create two subsets of the data frame.
   One that includes all columns up to and including longitude, plus the id,
   and another that includes from the first date column to id.
6. Add increments to the accumulated figures.
'''
r11 = pd.concat((r1.loc[:, :'Long'], r1.loc[:, 'id']), axis=1)
# Take an explicit copy: get_increments adds columns to the frame it is given,
# and that frame should be independent of r1 rather than a slice of it.
# The first day label comes from date_col_names so it cannot drift from the
# generated column names.
r12 = r1.loc[:, date_col_names[0]:'id'].copy()

r12 = get_increments(r12, date_col_names)

In [25]:
'''
7. Reshape the per-day columns into one row per (id, day).
8. Give the generic output columns their recovered-specific names.
9. Join the descriptive subset from step 5 with the reshaped rows on 'id' to get the full data set.
'''
r12_frame = trans_data(r12, date_col_names).rename(
    columns={
        'the_date': 'recovered_date',
        'acc_count': 'recovered_acc_count',
        'inc_count': 'recovered_inc_count',
    }
)

df_recovered = r11.merge(r12_frame, on='id')

# Sanity check: the reshaped frame must hold one row per original row per day.
print(
    "Success: The number of rows in the resulting (Recovered) data frame matches the number of rows "
    "in the original data frame by the number of data"
    if len(df_recovered) == len(r1) * len(date_col_names)
    else
    "Something wrong: The number of rows in the resulting (Recovered) data frame does not matches with "
    "number of rows in the original data frame by the number of data"
)

Success: The number of rows in the resulting (Recovered) data frame matches the number of rows in the original data frame by the number of data


In [26]:
# Write the recovered data set to CSV with a header row and without the index.
# NOTE(review): the destination is a hard-coded absolute Windows path; confirm
# it exists on the machine running this cell.
df_recovered.to_csv (r'E:\Personal_Files\Dariush\monash\visual\AS3\\new\df_recovered.csv', index = False, header=True)

In [27]:
# Disabled plotting snippet: the code below is wrapped in a string literal, so
# it is not executed (the notebook only echoes the string).
'''
import plotly.express as px

fig = px.bar(df_recovered, x='recovered_date', y='recovered_inc_count',
              color='recovered_date',
             labels={'Recovered Cases':'Corona Stats'}, height=400)
fig.show()
'''

"\nimport plotly.express as px\n\nfig = px.bar(df_recovered, x='recovered_date', y='recovered_inc_count',\n              color='recovered_date',\n             labels={'Recovered Cases':'Corona Stats'}, height=400)\nfig.show()\n"

### Join all three data sets into one data set.

In [28]:
# Outer-join the three data sets on (Country, State, date), so rows present in
# only one of them are kept with nulls in the other's columns.
# In the first merge the non-key columns that exist on both sides (Lat, Long,
# id) get pandas' default '_x' (confirmed) / '_y' (deaths) suffixes.
df_confirmed_death = pd.merge(df_confirmed, df_deaths,  how='outer', left_on=['Country', 'State', 'confirmed_date'], right_on=['Country', 'State', 'death_date'])
# Here the left frame only has the suffixed names, so the Lat, Long and id
# columns of df_recovered keep their plain names in the result.
df_confirmed_death_recovered = pd.merge(df_confirmed_death, df_recovered, how='outer', left_on=['Country', 'State', 'confirmed_date'], right_on=['Country', 'State', 'recovered_date'])

In [29]:
# Select the output columns.
# The plain 'Long'/'Lat' columns of the merged frame are the ones contributed
# by df_recovered (the confirmed and deaths copies were suffixed '_x'/'_y' by
# the first merge), so they are null for every row that has no recovered
# match. Take the coordinates from confirmed first, then deaths, then
# recovered, so a row keeps its coordinates whenever any source has them.
df_final = df_confirmed_death_recovered.assign(
    Long=lambda m: m['Long_x'].fillna(m['Long_y']).fillna(m['Long']),
    Lat=lambda m: m['Lat_x'].fillna(m['Lat_y']).fillna(m['Lat']),
)[['Country', 'State', 'Long', 'Lat', 'confirmed_date', 'confirmed_acc_count', 'confirmed_inc_count', 'death_acc_count', 'death_inc_count', 'recovered_acc_count', 'recovered_inc_count']]

In [30]:
'''
Replace any null value resulting from the outer joins with 0.
'''
df_final_2 = df_final.fillna(0)

In [31]:
# Write the combined data set to CSV with a header row and without the index.
# NOTE(review): the destination is a hard-coded absolute Windows path; confirm
# it exists on the machine running this cell.
df_final_2.to_csv (r'E:\Personal_Files\Dariush\monash\visual\AS3\\new\time_series_covid19_ALL_global.csv', index = False, header=True)