In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# COVID-19 active worldwide situation in 12.2020

### Author: Ahmed Lotfi Alqnatri (2020)

#### Original data source: https://github.com/CSSEGISandData/COVID-19

In [None]:
import numpy as np 
import pandas as pd 
import datetime
import math
import operator
import random
import time

import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline

# 1. EDA: Global Data

In [None]:
# Load the observation table, shorten two column names, fold the
# 'Mainland China' label into 'China' and derive the active-case count.
df = (
    pd.read_csv('./data-covid19/covid_19_data.csv')
    .rename(columns={'Country/Region': 'Country', 'ObservationDate': 'Date'})
    .assign(
        Country=lambda frame: frame['Country'].replace('Mainland China', 'China'),
        Active_cases=lambda frame: frame['Confirmed'] - frame['Deaths'] - frame['Recovered'],
    )
)

df.head()

In [None]:
# Select the rows carrying the largest 'Date' value and total them.
# NOTE(review): the comparison uses the column values as loaded; if 'Date'
# holds strings the maximum is lexicographic -- confirm this is acceptable.
latest_date = max(df['Date'])
last_cases_update = df.loc[df['Date'] == latest_date].reset_index()
data = (
    last_cases_update
    .groupby(['Date'])
    .sum(numeric_only=True)
    .reset_index()
)
summary_columns = ['Confirmed', 'Recovered', 'Deaths', 'Active_cases', 'Date']
number_of_cases = data[summary_columns]
number_of_cases

In [None]:
# Sum the numeric columns per (Country, Date), order by 'Date' descending and
# keep the first row per country, i.e. the one with the largest 'Date'.
# Rows whose 'Confirmed' total is zero are dropped at the end.
df_cases_per_countries = (
    df.groupby(['Country', 'Date'])
    .sum(numeric_only=True)
    .reset_index()
    .sort_values('Date', ascending=False)
    .drop_duplicates(subset=['SNo'])
    .drop_duplicates(subset=['Country'])
    .loc[lambda totals: totals['Confirmed'] != 0]
)
df_cases_per_countries

In [None]:
# Totals per 'Date': take the 100 rows with the largest 'Date' values, then
# put them back in ascending 'Date' order for plotting.
totals_by_date = df.groupby("Date").sum(numeric_only=True).reset_index()
latest_hundred = totals_by_date.sort_values(by=['Date'], ascending=False).head(100)
df2 = latest_hundred.sort_values(by=['Date'])

In [None]:
plt.figure(figsize=(12, 5))
plt.xticks(rotation=90)

# One line per case type, drawn in this order: (column in df2, legend label).
series_to_plot = (
    ('Confirmed', 'CONFIRMED'),
    ('Deaths', 'DEATH'),
    ('Recovered', 'RECOVERED'),
)
for column, legend_label in series_to_plot:
    sns.lineplot(x=df2['Date'], y=df2[column].tolist(), label=legend_label)

plt.xlabel('Date')
plt.ylabel('Cases Number')
plt.title('All Cases Over Time')
plt.legend();

In [None]:
## https://towardsdatascience.com/visualizing-the-coronavirus-pandemic-with-choropleth-maps-7f30fccaecf5 ##

# Row with the largest 'Date' per country, restricted to positive 'Confirmed'.
# NOTE(review): the figure below reads df_cases_per_countries, not
# df_countries -- confirm which frame is meant to be plotted.
df_countries = (
    df.groupby(['Country', 'Date'])
    .sum(numeric_only=True)
    .reset_index()
    .sort_values('Date', ascending=False)
    .drop_duplicates(subset=['Country'])
    .loc[lambda totals: totals['Confirmed'] > 0]
)

# Create visualization
confirmed_trace = go.Choropleth(
    locations=df_cases_per_countries['Country'],
    locationmode='country names',
    z=df_cases_per_countries['Confirmed'],
    colorscale='Reds',
    marker_line_color='black',
    marker_line_width=0.5,
)
confirmed_layout = dict(
    title_text='Confirmed Cases',
    title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular',
    ),
)
fig = go.Figure(data=confirmed_trace)
fig.update_layout(**confirmed_layout)

In [None]:
# World map coloured by the 'Recovered' column of df_cases_per_countries.
recovered_trace = go.Choropleth(
    z=df_cases_per_countries['Recovered'],
    locations=df_cases_per_countries['Country'],
    locationmode='country names',
    colorscale='Reds',
    marker_line_color='black',
    marker_line_width=0.5,
)
recovered_geo = {
    'showframe': False,
    'showcoastlines': False,
    'projection_type': 'equirectangular',
}
fig = go.Figure(data=recovered_trace)
fig.update_layout(title_text='Recovered Cases', title_x=0.5, geo=recovered_geo)

In [None]:
# World map coloured by the 'Deaths' column of df_cases_per_countries.
deaths_trace_options = dict(
    locations=df_cases_per_countries['Country'],
    locationmode='country names',
    z=df_cases_per_countries['Deaths'],
    colorscale='Reds',
    marker_line_color='black',
    marker_line_width=0.5,
)
fig = go.Figure(data=go.Choropleth(**deaths_trace_options))
fig.update_layout(
    title_text='Dead Cases',
    title_x=0.5,
    geo=dict(showframe=False, showcoastlines=False, projection_type='equirectangular'),
)

In [None]:
# Totals per 'Date', limited to the first ten rows of the grouped result.
totals_per_date = df.groupby('Date').sum(numeric_only=True).reset_index()
df1 = totals_per_date.head(10)

fig = plt.figure(figsize=(6, 4))

# Main axes fill the whole figure and show the confirmed totals.
ax1 = fig.add_axes([0, 0, 1, 1])
ax1.set_title("Confirmed cases Death cases over time top 10")
ax1.plot(df1['Date'], df1['Confirmed'], color='blue')

# Smaller inset axes in the upper-left area show the death totals.
ax2 = fig.add_axes([0.05, 0.65, 0.5, 0.3])
ax2.plot(df1['Date'], df1['Deaths'], color='red')

# Applies to the current axes, which is the inset added last.
plt.xticks(rotation=90)

# 2. EDA: Time series data

In [None]:
# Read the three wide time-series tables (confirmed, deaths, recovered).
df_confirmed, df_deaths, df_recovered = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data-covid19/time_series_covid_19_confirmed.csv',
        './data-covid19/time_series_covid_19_deaths.csv',
        './data-covid19/time_series_covid_19_recovered.csv',
    )
)

In [None]:
# Keep the columns from the fifth one onward (presumably one per date).
# NOTE(review): the slice 4:-1 stops before the last column, so the final
# column is excluded -- confirm that is intended.
df_confirmed_dates = df_confirmed.iloc[:, 4:-1]
df_deaths_dates = df_deaths.iloc[:, 4:-1]
df_recovered_dates = df_recovered.iloc[:, 4:-1]

dates = df_confirmed_dates.keys()

# Column sums across all rows, one entry per label in `dates`.
total_confirmed_cases = [df_confirmed_dates[day].sum() for day in dates]
total_deaths = [df_deaths_dates[day].sum() for day in dates]
total_recovered = [df_recovered_dates[day].sum() for day in dates]

# Deaths divided by confirmed cases, per label in `dates`.
mortality_rate = [
    deaths / confirmed
    for deaths, confirmed in zip(total_deaths, total_confirmed_cases)
]

# Totals for the last label in `dates`, reported below.
confirmed_sum = total_confirmed_cases[-1]
death_sum = total_deaths[-1]
recovered_sum = total_recovered[-1]

print(f' Total Confirmed Cases: {confirmed_sum}, Total Deaths Cases: {death_sum}, Total Recovered Cases: {recovered_sum}')

In [None]:
# Turn each list of totals into a single-column 2-D array, plus a matching
# column of day indices 0..len(dates)-1.
total_confirmed_cases_np = np.array(total_confirmed_cases)[:, np.newaxis]
total_deaths_np = np.array(total_deaths)[:, np.newaxis]
total_recovered_np = np.array(total_recovered)[:, np.newaxis]
dates_list_np = np.arange(len(dates)).reshape(-1, 1)

In [None]:
# Day indices covering the observed range plus 30 further days, as a column.
forcast_of_next_month_np = np.arange(len(dates) + 30).reshape(-1, 1)

# forecasting for the next month: one 'YYYY-MM-DD' string per day index,
# counted from 2020-01-22.
first_day = datetime.datetime.strptime('2020-01-22', '%Y-%m-%d')
forcast_of_next_month_dates = [
    (first_day + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in range(len(forcast_of_next_month_np))
]

In [None]:
# Distinct 'Country/Region' values, in order of first appearance.
List_of_countries = df_confirmed['Country/Region'].unique().tolist()

In [None]:
# total_confirmed_cases_per_country
# Sum the numeric columns per country and keep the country label together
# with the last column of the grouped table.
y = df_confirmed.groupby("Country/Region").sum(numeric_only=True).reset_index()

# Look the last column up by position instead of hard-coding "12/6/20", so the
# 'Cases' column exists whatever the last column label is.
latest_column = y.columns[-1]
total_confirmed_cases_per_country = (
    y[["Country/Region", latest_column]]
    .rename(columns={"Country/Region": "Country", latest_column: "Cases"})
    # Series.replace returns a new Series; the original discarded that result,
    # so the relabelling never took effect. assign() stores it.
    .assign(Country=lambda frame: frame["Country"].replace('Mainland China', 'China'))
)
total_confirmed_cases_per_country

In [None]:
# World map coloured by the 'Cases' column of the per-country confirmed table.
ts_confirmed_trace = go.Choropleth(
    locations=total_confirmed_cases_per_country['Country'],
    z=total_confirmed_cases_per_country['Cases'],
    locationmode='country names',
    colorscale='Reds',
    marker_line_color='black',
    marker_line_width=0.5,
)
ts_confirmed_geo = dict(
    showframe=False,
    showcoastlines=False,
    projection_type='equirectangular',
)
fig = go.Figure(data=ts_confirmed_trace)
fig.update_layout(title_text='Confirmed Cases', title_x=0.5, geo=ts_confirmed_geo)

In [None]:
# total_recovered_cases_per_country
# Sum the numeric columns per country and keep the country label together
# with the last column of the grouped table.
y1 = df_recovered.groupby("Country/Region").sum(numeric_only=True).reset_index()

# Look the last column up by position instead of hard-coding "12/6/20", so the
# 'Cases' column exists whatever the last column label is.
latest_column = y1.columns[-1]
total_recovered_cases_per_country = (
    y1[["Country/Region", latest_column]]
    .rename(columns={"Country/Region": "Country", latest_column: "Cases"})
    # Series.replace returns a new Series; the original discarded that result,
    # so the relabelling never took effect. assign() stores it.
    .assign(Country=lambda frame: frame["Country"].replace('Mainland China', 'China'))
)
total_recovered_cases_per_country

In [None]:
# World map coloured by the 'Cases' column of the per-country recovered table.
ts_recovered_options = {
    'locations': total_recovered_cases_per_country['Country'],
    'locationmode': 'country names',
    'z': total_recovered_cases_per_country['Cases'],
    'colorscale': 'Reds',
    'marker_line_color': 'black',
    'marker_line_width': 0.5,
}
fig = go.Figure(data=go.Choropleth(**ts_recovered_options))
fig.update_layout(
    title_text='Recovered Cases',
    title_x=0.5,
    geo={
        'showframe': False,
        'showcoastlines': False,
        'projection_type': 'equirectangular',
    },
)

In [None]:
# total_deaths_cases_per_country
# Sum the numeric columns per country and keep the country label together
# with the last column of the grouped table.
y2 = df_deaths.groupby("Country/Region").sum(numeric_only=True).reset_index()

# Look the last column up by position instead of hard-coding "12/6/20", so the
# 'Cases' column exists whatever the last column label is.
latest_column = y2.columns[-1]
total_deaths_cases_per_country = (
    y2[["Country/Region", latest_column]]
    .rename(columns={"Country/Region": "Country", latest_column: "Cases"})
    # Series.replace returns a new Series; the original discarded that result,
    # so the relabelling never took effect. assign() stores it.
    .assign(Country=lambda frame: frame["Country"].replace('Mainland China', 'China'))
)
total_deaths_cases_per_country

In [None]:
# World map coloured by the 'Cases' column of the per-country deaths table.
ts_deaths_trace = go.Choropleth(
    z=total_deaths_cases_per_country['Cases'],
    locations=total_deaths_cases_per_country['Country'],
    locationmode='country names',
    colorscale='Reds',
    marker_line_color='black',
    marker_line_width=0.5,
)
ts_deaths_layout = dict(
    title_text='Deaths Cases',
    title_x=0.5,
    geo=dict(showframe=False, showcoastlines=False, projection_type='equirectangular'),
)
fig = go.Figure(data=ts_deaths_trace)
fig.update_layout(**ts_deaths_layout)

# 3. Modeling

In [None]:
# splitting the data
from sklearn.model_selection import train_test_split

# The inputs are day indices in ascending order, so hold out the LAST 20% of
# days instead of a random sample. With the default shuffle=True, later days
# end up in the training set and the test arrays come back in scrambled order,
# which also garbles the "test vs prediction" line plots further down.
# random_state has no effect when shuffle=False, so it is omitted.
X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(
    dates_list_np, total_confirmed_cases_np, test_size=0.20, shuffle=False
)
X_train_recovered, X_test_recovered, y_train_recovered, y_test_recovered = train_test_split(
    dates_list_np, total_recovered_np, test_size=0.20, shuffle=False
)

## 3.1 Linear Regression

In [None]:
# build and train the model for confirmed cases
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

model = LinearRegression(fit_intercept=True).fit(X_train_confirmed, y_train_confirmed)

# Predictions for the forecast range and for the held-out days.
pred = model.predict(forcast_of_next_month_np)
test_pred = model.predict(X_test_confirmed)

# Both metrics are symmetric in their two arguments, so passing
# (y_true, y_pred) in the documented order yields the same numbers.
confirmed_mae = mean_absolute_error(y_test_confirmed, test_pred)
confirmed_mse = mean_squared_error(y_test_confirmed, test_pred)
print('Mean Absolute Error: ', confirmed_mae)
print('Mean Squared Error: ', confirmed_mse)

In [None]:
# Held-out confirmed totals and the linear model's predictions for them,
# both drawn against their position in the test arrays.
lin_test_fig, lin_test_ax = plt.subplots(figsize=(6, 4))
lin_test_ax.plot(y_test_confirmed)
lin_test_ax.plot(test_pred)

In [None]:
# Observed confirmed totals (solid) against the linear forecast (dashed).
lin_fc_fig, lin_fc_ax = plt.subplots(figsize=(6, 4))
lin_fc_ax.plot(dates_list_np, total_confirmed_cases)
lin_fc_ax.plot(forcast_of_next_month_np, pred, linestyle='dashed', color='purple')
lin_fc_ax.set(
    title='Number of Confirmed cases vs predicted',
    xlabel='Days',
    ylabel='Number of cases',
)
lin_fc_ax.legend(['Confirmed cases', 'Predicted cases'])
plt.show()

In [None]:
# building model for recovered cases
model_1 = LinearRegression(fit_intercept=True).fit(X_train_recovered, y_train_recovered)

# Predictions for the forecast range and for the held-out days.
pred_recovery = model_1.predict(forcast_of_next_month_np)
test_pred_recovery = model_1.predict(X_test_recovered)

# Both metrics are symmetric in their two arguments, so passing
# (y_true, y_pred) in the documented order yields the same numbers.
recovered_mae = mean_absolute_error(y_test_recovered, test_pred_recovery)
recovered_mse = mean_squared_error(y_test_recovered, test_pred_recovery)
print('Mean Absolute Error: ', recovered_mae)
print('Mean Squared Error: ', recovered_mse)

In [None]:
# Held-out recovered totals and the linear model's predictions for them,
# on a default-sized figure.
rec_test_fig, rec_test_ax = plt.subplots()
rec_test_ax.plot(y_test_recovered)
rec_test_ax.plot(test_pred_recovery)
rec_test_ax.legend(['Recovered cases', 'Predicted cases'])

In [None]:
# Observed recovered totals (solid) against the linear forecast (dashed).
rec_fc_fig, rec_fc_ax = plt.subplots(figsize=(6, 4))
rec_fc_ax.plot(dates_list_np, total_recovered)
rec_fc_ax.plot(forcast_of_next_month_np, pred_recovery, linestyle='dashed', color='purple')
rec_fc_ax.set_title('Number of Recovered cases vs Prediction')
rec_fc_ax.set_xlabel('Days')
rec_fc_ax.set_ylabel('Number of cases')
rec_fc_ax.legend(['Recovered cases', 'Predicted cases'])
plt.show()

## 3.2 Support Vector Regression (SVR)

In [None]:
from sklearn.svm import SVR

# Polynomial-kernel SVR fitted on the confirmed-case training split.
svr_settings = dict(
    kernel='poly',
    degree=3,
    gamma=0.01,
    C=0.1,
    epsilon=1,
    shrinking=True,
)
svm_confirmed = SVR(**svr_settings)
# The target is flattened to 1-D before fitting.
svm_confirmed.fit(X_train_confirmed, y_train_confirmed.ravel())

svm_test_pred = svm_confirmed.predict(X_test_confirmed)
svm_pred = svm_confirmed.predict(forcast_of_next_month_np)

# Both metrics are symmetric in their two arguments, so passing
# (y_true, y_pred) in the documented order yields the same numbers.
svm_mae = mean_absolute_error(y_test_confirmed, svm_test_pred)
svm_mse = mean_squared_error(y_test_confirmed, svm_test_pred)
print('MAE:', svm_mae)
print('MSE:', svm_mse)

In [None]:
# Held-out confirmed totals against the SVR predictions for the same rows.
svm_fig, svm_ax = plt.subplots(figsize=(6, 4))
svm_ax.plot(y_test_confirmed)
svm_ax.plot(svm_test_pred)
svm_ax.set_title("SVM prediction vs y_test")
svm_ax.legend(['Confirmed cases', 'SVM Predictions'])

## 3.3 Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-5 polynomial expansion: fitted on the training day indices, then
# applied unchanged to the test and forecast day indices.
poly = PolynomialFeatures(degree=5)
poly_x_train = poly.fit_transform(X_train_confirmed)
poly_x_test, poly_future_forcasting = (
    poly.transform(day_indices)
    for day_indices in (X_test_confirmed, forcast_of_next_month_np)
)

In [None]:
# Linear regression on the polynomial features. Despite its name,
# `classifier` is a regressor. No intercept is fitted here.
# NOTE(review): presumably the expanded features already include a constant
# column -- confirm against the PolynomialFeatures settings.
classifier = LinearRegression(fit_intercept=False).fit(poly_x_train, y_train_confirmed)

classifier_pred = classifier.predict(poly_future_forcasting)
test_classifier_pred = classifier.predict(poly_x_test)

# Both metrics are symmetric in their two arguments, so passing
# (y_true, y_pred) in the documented order yields the same numbers.
poly_mae = mean_absolute_error(y_test_confirmed, test_classifier_pred)
poly_mse = mean_squared_error(y_test_confirmed, test_classifier_pred)
print('MAE:', poly_mae)
print('MSE:', poly_mse)
print(classifier.coef_)

In [None]:
# Held-out confirmed totals against the polynomial-regression predictions.
poly_fig, poly_ax = plt.subplots(figsize=(6, 4))
poly_ax.plot(y_test_confirmed)
poly_ax.plot(test_classifier_pred)
poly_ax.set_title('Test Data vs Polynomial Regression Predictions')
poly_ax.legend(['Test Data', 'Polynomial Regression Predictions'])

# 4. Moving average visualization

In [None]:
# calculating moving increase
def moving_average(data, window_size):
    """Return the forward-looking moving average of ``data``.

    For every position ``i`` the result holds the mean of
    ``data[i:i + window_size]``. Near the end of the sequence, where fewer
    than ``window_size`` items remain, the mean is taken over the remaining
    items only.

    Args:
        data: Sliceable sequence of numbers.
        window_size: Number of items per window, starting at each position.

    Returns:
        A list with one mean per element of ``data``.
    """
    total = len(data)
    return [
        np.mean(data[start:min(start + window_size, total)])
        for start in range(total)
    ]

In [None]:
# visualizing rate of confirmed cases worldwide vs moving average
# Window passed to moving_average(); also used in the legend so the label can
# no longer disagree with the window actually computed (it used to say 30).
moving_avg_window = 16

adjusted_dates = forcast_of_next_month_np[:-10].ravel()
# Take as many day indices as there are totals instead of a hard-coded 319,
# so the x and y series always have equal length.
adjusted_dates_1 = adjusted_dates[:len(total_confirmed_cases)]
world_confirmed_avg = moving_average(total_confirmed_cases, moving_avg_window)

plt.figure(figsize=(6, 4))
plt.plot(adjusted_dates_1, total_confirmed_cases)
plt.plot(adjusted_dates_1, world_confirmed_avg, linestyle='dashed', color='blue')
plt.title('Rate of Confirmed Coronavirus world Cases Over Time')
plt.xlabel('Days Since 1/22/2020')
plt.ylabel('Rate of Cases')
plt.legend(['Worldwide Confirmed Coronavirus Cases', f'Moving Average {moving_avg_window} Days'], prop={'size': 10})
plt.show()

In [None]:
# visualizing rate of recovered cases worldwide vs moving average
# Window passed to moving_average(); also used in the legend so the label can
# no longer disagree with the window actually computed (it used to say 30).
moving_avg_window = 16

adjusted_dates = forcast_of_next_month_np[:-10].ravel()
# Take as many day indices as there are totals instead of a hard-coded 319,
# so the x and y series always have equal length.
adjusted_dates_1 = adjusted_dates[:len(total_recovered)]
world_recovered_avg = moving_average(total_recovered, moving_avg_window)

plt.figure(figsize=(6, 4))
plt.plot(adjusted_dates_1, total_recovered)
plt.plot(adjusted_dates_1, world_recovered_avg, linestyle='dashed', color='blue')
plt.title('Rate of Recovered Coronavirus world Cases Over Time')
plt.xlabel('Days Since 1/22/2020')
plt.ylabel('Rate of Cases')
plt.legend(['Worldwide Recovered Coronavirus Cases', f'Moving Average {moving_avg_window} Days'], prop={'size': 10})
plt.show()

In [None]:
# visualizing rate of death cases worldwide vs moving average
# Window passed to moving_average(); also used in the legend so the label can
# no longer disagree with the window actually computed (it used to say 30).
moving_avg_window = 16

adjusted_dates = forcast_of_next_month_np[:-10].ravel()
# Take as many day indices as there are totals instead of a hard-coded 319,
# so the x and y series always have equal length.
adjusted_dates_1 = adjusted_dates[:len(total_deaths)]
world_deaths_avg = moving_average(total_deaths, moving_avg_window)

plt.figure(figsize=(6, 4))
plt.plot(adjusted_dates_1, total_deaths)
plt.plot(adjusted_dates_1, world_deaths_avg, linestyle='dashed', color='blue')
plt.title('Rate of Deaths Coronavirus world Cases Over Time')
plt.xlabel('Days Since 1/22/2020')
plt.ylabel('Rate of Cases')
plt.legend(['Worldwide Deaths Coronavirus Cases', f'Moving Average {moving_avg_window} Days'], prop={'size': 10})
plt.show()