In [120]:
import requests
import numpy as np
import pandas as pd

### Fetch Raw Data from the API

In [121]:
raw_data_url = "http://127.0.0.1:5000/raw_data"
response = requests.get(raw_data_url)
print(response.status_code)

200


In [122]:
data = pd.DataFrame(response.json())
data.tail()

Unnamed: 0,country,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,total_deaths_per_million,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,total_tests,new_tests,positive_rate,population,median_age,gdp_per_capita
378040,South Africa,2024-12-01,,,,,,,,,,,,,,62378419.0,27.919,13478.754
378041,South Africa,2024-12-08,,,,,,,,,,,,,,62378419.0,27.919,13478.754
378042,South Africa,2024-12-15,,,,,,,,,,,,,,62378419.0,27.919,13478.754
378043,South Africa,2024-12-22,,,,,,,,,,,,,,62378419.0,27.919,13478.754
378044,South Africa,2024-12-29,,,,,,,,,,,,,,62378419.0,27.919,13478.754


In [123]:
print("Rows: {}\nColumns: {}".format(*data.shape))

Rows: 10753
Columns: 18


### Handling Missing Values

In [124]:
data.isnull().sum().sort_values(ascending=False)

new_vaccinations            8403
people_fully_vaccinated     8297
people_vaccinated           8234
total_vaccinations          8082
new_tests                   7773
total_tests                 7638
positive_rate               7339
gdp_per_capita              1792
total_deaths_per_million      31
total_cases_per_million       31
new_deaths                    31
total_deaths                  31
new_cases                     31
total_cases                   31
date                           0
population                     0
median_age                     0
country                        0
dtype: int64

We see that there is a lot of missing values in most columns.<br>

Most features related to vaccinations are missing so we will drop them.

In [125]:
vaccinations_cols = ['new_vaccinations','people_fully_vaccinated','people_vaccinated','total_vaccinations']
data.drop(vaccinations_cols,axis=1,inplace=True)

Same thing for test related features.

In [126]:
test_cols = ['new_tests','total_tests','positive_rate']
data.drop(test_cols,axis=1,inplace=True)

**gdp_per_capita** is same in all dates for each country.

In [127]:
data.gdp_per_capita.nunique()

5

We can see that only **greenland** country doesnt have gdp_per_capita.<br>

In [128]:
data[data['gdp_per_capita'].isnull()].country.unique()

array(['Greenland'], dtype=object)

I will get it from ethernet and fill it:
https://data.worldbank.org/indicator/NY.GDP.PCAP.CD?locations=GL
#### Note:
this is the gdp_per_capita in 2020 not currently.

In [129]:
data.loc[data['gdp_per_capita'].isnull(),'gdp_per_capita'] = 54_693

In [130]:
rest_cols = ['total_deaths_per_million', 'total_cases_per_million', 'new_deaths', 'total_deaths', 'new_cases', 'total_cases']
dates = [data[data[col].isnull()].date.unique() for col in rest_cols]
np.unique(dates)

array(['2020-01-01', '2020-01-02', '2020-01-03', '2024-11-30',
       '2024-12-01', '2024-12-08', '2024-12-15', '2024-12-22',
       '2024-12-29', '2024-12-31'], dtype=object)

We see that for all columns with 31 missing values are is same dates.<br>
We will fill rows with dates 2020/01/01 to 2020/01/03 with 0 cause they are in begining of the date interval in this dataset.<br>
And for at the end of 2024 we will fill them with values before.

In [131]:
data.loc[data['date'] <= '2020-01-03',rest_cols] = 0

In [132]:
for i in range(len(rest_cols)):
    data[rest_cols[i]] = data[rest_cols[i]].fillna(method='ffill')

  data[rest_cols[i]] = data[rest_cols[i]].fillna(method='ffill')


In [133]:
data.isnull().sum()

country                     0
date                        0
total_cases                 0
new_cases                   0
total_deaths                0
new_deaths                  0
total_cases_per_million     0
total_deaths_per_million    0
population                  0
median_age                  0
gdp_per_capita              0
dtype: int64

Everything is cleaned now.

### Handling Duplicates

In [134]:
data.duplicated().unique()

array([False])

There is no duplicates.

### Feature Engineering

In [135]:
data.columns

Index(['country', 'date', 'total_cases', 'new_cases', 'total_deaths',
       'new_deaths', 'total_cases_per_million', 'total_deaths_per_million',
       'population', 'median_age', 'gdp_per_capita'],
      dtype='object')

- **new_cases / total_cases**: the proportion of new cases compared to the total reported cases on a given day.
- **new_deaths / total_deaths**: the proportion of new deaths compared to the total reported cases on a given day.
- **total_deaths_per_million / population**:  the fraction of total deaths normalized by population size.
- **total_cases_per_million / population**: the fraction of total deaths normalized by population size.
- **new_deaths / new_cases**: the proportion of daily reported cases that have resulted in death.
- **total_deaths / total_cases**: the proportion of total reported cases that have resulted in death.

In [136]:
epsilon = 1e-10
data['new_cases_ratio'] = data['new_cases'] / (data['total_cases']+epsilon) * 100
data['new_deaths_ratio'] = data['new_deaths'] / (data['total_deaths']+epsilon) * 100
data['deaths_per_million_ratio'] = data['total_deaths_per_million'] / (data['population']+epsilon) * 100
data['case_per_million_ratio'] = data['total_cases_per_million'] / (data['population']+epsilon) * 100
data['daily_case_fatality_rate'] = data['new_deaths'] / (data['new_cases']+epsilon) * 100
data['total_case_fatality_rate'] = data['total_deaths'] / (data['total_cases']+epsilon) * 100

In [137]:
# Check Random Obseration
data.iloc[110]

country                        Algeria
date                        2020-04-20
total_cases                     2629.0
new_cases                         95.0
total_deaths                     376.0
new_deaths                         8.0
total_cases_per_million      57.808945
total_deaths_per_million      8.267844
population                  45477391.0
median_age                      27.983
gdp_per_capita               11198.233
new_cases_ratio               3.613541
new_deaths_ratio               2.12766
deaths_per_million_ratio      0.000018
case_per_million_ratio        0.000127
daily_case_fatality_rate      8.421053
total_case_fatality_rate     14.302016
Name: 5479, dtype: object

### Upload the Processed Data to the API

In [138]:
# Convert the DataFrame to Json format for transmition
data_json = data.to_json()

In [139]:
processed_data_url = "http://127.0.0.1:5000/processed_data"

response = requests.post(processed_data_url,json=data_json)
print(response.status_code)

201
