In [1]:
import prophet

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import iplot, init_notebook_mode
import plotly.figure_factory as ff

import datetime

# LOADING THE DATA

In [6]:
covid_data = pd.read_csv('covid_19_india.csv')
world_data = pd.read_csv('covid_19_clean_complete.csv')

# DATA OF INDIA (COVID)

In [6]:
covid_data.head()

Unnamed: 0,Date,State/UnionTerritory,Cured,Deaths,Confirmed
0,2020-01-30,Kerala,0,0,1
1,2020-01-31,Kerala,0,0,1
2,2020-02-01,Kerala,0,0,2
3,2020-02-02,Kerala,0,0,3
4,2020-02-03,Kerala,0,0,3


In [None]:
# Checking number of rows and columns
covid_data.shape

(18110, 9)

In [11]:
# Checking the datafreame information
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17155 entries, 0 to 18109
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       17155 non-null  datetime64[ns]
 1   State      17155 non-null  object        
 2   Cured      17155 non-null  int64         
 3   Deaths     17155 non-null  int64         
 4   Confirmed  17155 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 804.1+ KB


In [7]:
# Renaming the columns for easier access
covid_data = covid_data.rename(columns={
    'State/UnionTerritory': 'State'
})

In [None]:
# Checking the State names
covid_data['State'].unique()

array(['Kerala', 'Telengana', 'Delhi', 'Rajasthan', 'Uttar Pradesh',
       'Haryana', 'Ladakh', 'Tamil Nadu', 'Karnataka', 'Maharashtra',
       'Punjab', 'Jammu and Kashmir', 'Andhra Pradesh', 'Uttarakhand',
       'Odisha', 'Puducherry', 'West Bengal', 'Chhattisgarh',
       'Chandigarh', 'Gujarat', 'Himachal Pradesh', 'Madhya Pradesh',
       'Bihar', 'Manipur', 'Mizoram', 'Andaman and Nicobar Islands',
       'Goa', 'Unassigned', 'Assam', 'Jharkhand', 'Arunachal Pradesh',
       'Tripura', 'Nagaland', 'Meghalaya',
       'Dadra and Nagar Haveli and Daman and Diu',
       'Cases being reassigned to states', 'Sikkim', 'Daman & Diu',
       'Lakshadweep', 'Telangana', 'Dadra and Nagar Haveli', 'Bihar****',
       'Madhya Pradesh***', 'Himanchal Pradesh', 'Karanataka',
       'Maharashtra***'], dtype=object)

In [8]:
# Dropping redundant data

covid_data.drop(covid_data[covid_data['State'] == 'Cases being reassigned to states'].index,inplace = True)
covid_data.drop(covid_data[covid_data['State'] == 'Bihar****'].index,inplace = True)
covid_data.drop(covid_data[covid_data['State'] == 'Telengana'].index,inplace = True)
covid_data.drop(covid_data[covid_data['State'] == 'Madhya Pradesh***'].index,inplace = True)
covid_data.drop(covid_data[covid_data['State'] == 'Karanataka'].index,inplace = True)
covid_data.drop(covid_data[covid_data['State'] == 'Maharashtra***'].index,inplace = True)
covid_data.drop(covid_data[covid_data['State'] == 'Unassigned'].index,inplace = True)
covid_data.drop(covid_data[covid_data['State'] == 'Dadra and Nagar Haveli and Daman and Diu'].index,inplace = True)

In [9]:
# Dropping unnecessary columns if needed
covid_data = covid_data.drop(['ConfirmedIndianNational','ConfirmedForeignNational','Sno', 'Time'], axis=1)

In [10]:
# Converting 'Date' column to datetime format
covid_data['Date'] = pd.to_datetime(covid_data['Date'], format='%Y-%m-%d')

In [None]:
# Checking number of rows and columns
covid_data.shape

(17155, 5)

In [None]:
covid_data.tail(5) #Recent Cases

Unnamed: 0,Date,State,Cured,Deaths,Confirmed
18105,2021-08-11,Telangana,638410,3831,650353
18106,2021-08-11,Tripura,77811,773,80660
18107,2021-08-11,Uttarakhand,334650,7368,342462
18108,2021-08-11,Uttar Pradesh,1685492,22775,1708812
18109,2021-08-11,West Bengal,1506532,18252,1534999


In [None]:
 #checking for null values
 covid_data.isnull().sum()

Date         0
State        0
Cured        0
Deaths       0
Confirmed    0
dtype: int64

In [11]:
# Generating statewise details
state_cases = covid_data.groupby('State')['Confirmed','Deaths','Cured'].max().reset_index()

state_cases['Active'] = state_cases['Confirmed'] - (state_cases['Deaths']+state_cases['Cured'])
state_cases["Death Rate (per 100)"] = np.round(100*state_cases["Deaths"]/state_cases["Confirmed"],2)
state_cases["Cure Rate (per 100)"] = np.round(100*state_cases["Cured"]/state_cases["Confirmed"],2)

state_cases.style.background_gradient(cmap='Blues',subset=["Confirmed"])\
                        .background_gradient(cmap='Blues',subset=["Deaths"])\
                        .background_gradient(cmap='Blues',subset=["Cured"])\
                        .background_gradient(cmap='Blues',subset=["Active"])\
                        .background_gradient(cmap='Blues',subset=["Death Rate (per 100)"])\
                        .background_gradient(cmap='Blues',subset=["Cure Rate (per 100)"])


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,State,Confirmed,Deaths,Cured,Active,Death Rate (per 100),Cure Rate (per 100)
0,Andaman and Nicobar Islands,7548,129,7412,7,1.71,98.2
1,Andhra Pradesh,1985182,13564,1952736,18882,0.68,98.37
2,Arunachal Pradesh,50605,248,47821,2536,0.49,94.5
3,Assam,576149,5420,559684,11045,0.94,97.14
4,Bihar,725279,9646,715352,281,1.33,98.63
5,Chandigarh,61992,811,61150,31,1.31,98.64
6,Chhattisgarh,1003356,13544,988189,1623,1.35,98.49
7,Dadra and Nagar Haveli,10377,4,10261,112,0.04,98.88
8,Daman & Diu,2,0,0,2,0.0,0.0
9,Delhi,1436852,25068,1411280,504,1.74,98.22


In [None]:
# Grouping data by Date to get overall statistics for each day
datewise_data = covid_data.groupby('Date').agg({
    'Confirmed': 'sum',
    'Cured': 'sum',
    'Deaths': 'sum'
}).reset_index()

# Visualizing the trends over time using Plotly
fig = px.line(datewise_data, x='Date', y=['Confirmed', 'Cured', 'Deaths'],
              labels={'value': 'Count', 'variable': 'Cases', 'Date': 'Date'},
              title='Trends of COVID-19 Cases, Recoveries, and Deaths Over Time')
fig.show()

In [13]:
# Grouping data by State/UnionTerritory to get overall statistics for each state
statewise_data = covid_data.groupby('State')['Confirmed', 'Cured', 'Deaths'].max().reset_index()

# Creating new columns 'Active', 'Death rate per 100', and 'Cure rate per 100'
statewise_data['Active'] = statewise_data['Confirmed'] - (statewise_data['Cured'] + statewise_data['Deaths'])
statewise_data['Death rate per 100'] = (statewise_data['Deaths'] / statewise_data['Confirmed']) * 100
statewise_data['Cure rate per 100'] = (statewise_data['Cured'] / statewise_data['Confirmed']) * 100

# Sorting by maximum Confirmed Cases
statewise_data = statewise_data.sort_values(by='Confirmed', ascending=False)

# Visualizing the aggregated data using Plotly
fig = px.bar(statewise_data, x='State', y=['Confirmed', 'Deaths', 'Cured', 'Active'],
             title='COVID-19 Cases by State/Union Territory',
             labels={'value': 'Count', 'variable': 'Cases', 'State': 'State/Union Territory'},
             barmode='group')
fig.show()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [14]:
statewise_data['Confirmed percentage'] = (statewise_data['Confirmed']*100/statewise_data.sum()['Confirmed'])

fig1 = px.pie(statewise_data, values='Confirmed',
              hover_name='State',
              labels={'State': 'State', 'Confirmed': 'Confirmed Cases'},
              hole=0.65, color='Confirmed',
              template='ggplot2',
              color_discrete_sequence=px.colors.sequential.Mint_r,
              title="State-wise Distribution of Confirmed Covid Cases (as of " + str(covid_data['Date'].max()) + ")")

fig1.update_traces(textposition='inside', textinfo='percent')

fig1.show()

In [None]:
# Comparing the trends of COVID-19 cases between different states
fig_trend_comparison = px.line(covid_data, x='Date', y='Confirmed', color='State',
                               title='Comparison of COVID-19 Cases between States',
                               labels={'Confirmed': 'Cases', 'Date': 'Date', 'State': 'State/Region'})
fig_trend_comparison.show()

In [None]:
# State latitudes and longitudes found using geopy
from geopy.geocoders import Nominatim

# Create a geolocator object
geolocator = Nominatim(user_agent="my_app")

# List of states from your dataset
states = [
    'Kerala', 'Delhi', 'Rajasthan', 'Uttar Pradesh',
    'Haryana', 'Ladakh', 'Tamil Nadu', 'Karnataka', 'Maharashtra',
    'Punjab', 'Jammu and Kashmir', 'Andhra Pradesh', 'Uttarakhand',
    'Odisha', 'Puducherry', 'West Bengal', 'Chhattisgarh',
    'Chandigarh', 'Gujarat', 'Himachal Pradesh', 'Madhya Pradesh',
    'Bihar', 'Manipur', 'Mizoram', 'Andaman and Nicobar Islands',
    'Goa', 'Assam', 'Jharkhand', 'Arunachal Pradesh',
    'Tripura', 'Nagaland', 'Meghalaya', 'Sikkim', 'Daman & Diu',
    'Lakshadweep', 'Telangana', 'Dadra and Nagar Haveli', 'Himanchal Pradesh'
]

# Dictionary to store state names and their corresponding coordinates
state_coordinates = {}

# Get coordinates for each state
for state in states:
    location = geolocator.geocode(state)
    if location:
        state_coordinates[state] = (location.latitude, location.longitude)
    else:
        state_coordinates[state] = None

# Display the state names and their coordinates
for state, coordinates in state_coordinates.items():
    print(f"{state}: {coordinates}")

Kerala: (10.3528744, 76.5120396)
Delhi: (28.6273928, 77.1716954)
Rajasthan: (26.8105777, 73.7684549)
Uttar Pradesh: (27.1303344, 80.859666)
Haryana: (29.0, 76.0)
Ladakh: (33.9456407, 77.6568576)
Tamil Nadu: (10.9094334, 78.3665347)
Karnataka: (14.5203896, 75.7223521)
Maharashtra: (18.9068356, 75.6741579)
Punjab: (30.9293211, 75.5004841)
Jammu and Kashmir: (33.6649297, 75.1629584)
Andhra Pradesh: (15.9240905, 80.1863809)
Uttarakhand: (30.0417376, 79.089691)
Odisha: (20.5431241, 84.6897321)
Puducherry: (10.91564885, 79.80694879844232)
West Bengal: (22.9964948, 87.6855882)
Chhattisgarh: (21.6637359, 81.8406351)
Chandigarh: (30.7334421, 76.7797143)
Gujarat: (22.3850051, 71.745261)
Himachal Pradesh: (31.81676015, 77.34932051968858)
Madhya Pradesh: (23.8143419, 77.5340719)
Bihar: (25.6440845, 85.906508)
Manipur: (24.7208818, 93.9229386)
Mizoram: (23.2146169, 92.8687612)
Andaman and Nicobar Islands: (12.61123865, 92.83165406414926)
Goa: (15.3004543, 74.0855134)
Assam: (26.4073841, 93.2551303)

In [None]:
Location_dict = {'Andaman and Nicobar Islands': (10.2188344, 92.5771329),
 'Andhra Pradesh': (15.9240905, 80.1863809),
 'Arunachal Pradesh': (27.6891712, 96.4597226),
 'Assam': (26.4073841, 93.2551303),
 'Bihar': (25.6440845, 85.906508),
 'Chandigarh': (30.7334421, 76.7797143),
 'Chhattisgarh': (21.6637359, 81.8406351),
 'Daman & Diu': (20.42000485, 72.86376290300566),
 'Delhi': (28.6517178, 77.2219388),
 'Goa': (15.3004543, 74.0855134),
 'Gujarat': (22.41540825, 72.03149703699282),
 'Haryana': (29.0, 76.0),
 'Himachal Pradesh': (31.81676015, 77.34932051968858),
 'Jammu and Kashmir': (33.5574473, 75.06152),
 'Jharkhand': (23.4559809, 85.2557301),
 'Karnataka': (14.5203896, 75.7223521),
 'Kerala': (10.3528744, 76.5120396),
 'Ladakh': (33.9456407, 77.6568576),
 'Madhya Pradesh': (23.9699282, 79.39486954625225),
 'Maharashtra': (19.531932, 76.0554568),
 'Manipur': (24.7208818, 93.9229386),
 'Meghalaya': (25.5379432, 91.2999102),
 'Mizoram': (23.2146169, 92.8687612),
 'Nagaland': (26.1630556, 94.5884911),
 'Odisha': (20.5431241, 84.6897321),
 'Puducherry': (11.9340568, 79.8306447),
 'Punjab': (30.9293211, 75.5004841),
 'Rajasthan': (26.8105777, 73.7684549),
 'Sikkim': (27.601029, 88.45413638680145),
 'Tamil Nadu': (10.9094334, 78.3665347),
 'Telangana': (17.8495919, 79.1151663),
 'Telengana': (17.329125, 78.5822228),
 'Tripura': (23.7750823, 91.7025091),
 'Uttar Pradesh': (27.1303344, 80.859666),
 'Uttarakhand': (30.091993549999998, 79.32176659343018),
 'West Bengal': (22.9964948, 87.6855882)}

In [None]:

state_report = covid_data.groupby('State')['Confirmed', 'Cured', 'Deaths'].max().reset_index()

# Filter data and reset index
map_states = state_report.drop(index=[5, 8, 9, 36], errors='ignore')
map_states.reset_index(inplace=True, drop=True)

# Assign state coordinates from Location_dict to DataFrame
map_states['Latitude'] = map_states['State'].map(lambda x: Location_dict.get(x, [None, None])[0])
map_states['Longitude'] = map_states['State'].map(lambda x: Location_dict.get(x, [None, None])[1])

# Create a Choropleth map using Plotly Express
fig = px.scatter_geo(map_states.dropna(),  # Remove rows with missing coordinates
                     lat='Latitude',
                     lon='Longitude',
                     hover_name='State',
                     size='Confirmed',  # Size represents the 'Confirmed' cases
                     color='Confirmed',  # Color also represents the 'Confirmed' cases
                     projection='natural earth',
                     title='COVID-19 Statistics by State/Union Territory in India',
                     labels={'Confirmed': 'Confirmed Cases'}
                     )

# Add 'Deaths' and 'Cured' as additional variables in hover information
fig.update_traces(hovertemplate='<b>%{hovertext}</b><br>' +
                                'Confirmed: %{marker.size}<br>' +
                                'Deaths: %{customdata[0]}<br>' +
                                'Cured: %{customdata[1]}')

# Set 'Deaths' and 'Cured' as custom data for each point
fig.data[0].update(customdata=map_states[['Deaths', 'Cured']].values.tolist())

fig.show()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



We can deduce that the worst-struck states of India were the ones with densely packed urban areas, like Mumbai in Maharashtra, Chennai in Tamil Nadu and New Delhi.



In [None]:
# Hierarchy and proportion of total deaths in various states using a treemap
fig_treemap_deaths = px.treemap(total_deaths_statewise, path=['State'], values='Deaths',
                                 title='Total Deaths Treemap in India by State')
fig_treemap_deaths.show()


In [None]:

state_deaths = state_cases.sort_values(by="Deaths", ascending=False).head(20)

fig = px.bar(state_deaths, x='State', y='Deaths', orientation='v',
             labels={'Deaths': 'Deaths', 'State': 'State', 'Death Rate (per 100)': 'Death Rate (per 100)'},
             color='Death Rate (per 100)', color_continuous_scale=px.colors.sequential.Reds,
             hover_name='State',
             hover_data={'Deaths': True, 'Death Rate (per 100)': True, 'State': False},
             text='Death Rate (per 100)',
             title="India's Deadliest States with Deaths per 100 Cases")

fig.update_layout(title={'x': 0.5})
fig.update_traces(textposition='outside')
fig.show()

Punjab comes in as the deadliest state with 3 people dead for every 100 infected, followed by Uttarakhand, Maharashtra and Delhi. Karnataka and Tamil Nadu has quite a low ratio considering its large number of infected.


<div style="color:white;
           padding:8px 10px 0 10px;
           display:inline-block;
           border-radius:5px;
           background-color:#5642C5;
           font-size:110%;
           font-family:Verdana">
    <h1 style='color:white;'> World Data</h1>
</div>

In [None]:
# Displaying the first few rows of the dataset
world_data.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,WHO Region
0,,Afghanistan,33.93911,67.709953,2020-01-22,0,0,0,0.0,Eastern Mediterranean
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0,0.0,Europe
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0,0.0,Africa
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0,0.0,Europe
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0,0.0,Africa


In [52]:
# Remove the 'State' column if it's deemed unimportant or irrelevant
world_data = world_data.drop(columns='State')

In [53]:
# Summary statistics of numerical columns
world_data.describe()

Unnamed: 0,Lat,Long,Confirmed,Deaths,Recovered,Active
count,49068.0,49068.0,49068.0,49068.0,49068.0,49068.0
mean,21.43373,23.528236,16884.9,884.17916,7915.713,8085.012
std,24.95032,70.44274,127300.2,6313.584411,54800.92,76258.9
min,-51.7963,-135.0,0.0,0.0,0.0,-14.0
25%,7.873054,-15.3101,4.0,0.0,0.0,0.0
50%,23.6345,21.7453,168.0,2.0,29.0,26.0
75%,41.20438,80.771797,1518.25,30.0,666.0,606.0
max,71.7069,178.065,4290259.0,148011.0,1846641.0,2816444.0


Confirmed Cases:
The mean confirmed cases across regions are around 5,276 with a standard deviation of approximately 46,228. The maximum confirmed cases observed were about 1.61 million, while the minimum was 0.

Deaths:
The average number of deaths reported is approximately 351, with a standard deviation of around 3,205. The highest number of deaths recorded is 97,025, and the lowest is 0.

Recovered:
The mean recovered cases are around 1,608, with a standard deviation of approximately 11,396. The maximum recovered cases observed were about 350,135, while the minimum was 0.

Active Cases:
The mean number of active cases is approximately 3,317, with a standard deviation of around 34,864. The highest number of active cases recorded was about 1.19 million, and the lowest went below zero (which might need to be investigated further as active cases cannot be negative).

In [54]:
# Checking information about the DataFrame's structure
world_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49068 entries, 0 to 49067
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     49068 non-null  object 
 1   Lat         49068 non-null  float64
 2   Long        49068 non-null  float64
 3   Date        49068 non-null  object 
 4   Confirmed   49068 non-null  int64  
 5   Deaths      49068 non-null  int64  
 6   Recovered   49068 non-null  int64  
 7   Active      49068 non-null  int64  
 8   WHO Region  49068 non-null  object 
dtypes: float64(2), int64(4), object(3)
memory usage: 3.4+ MB


In [55]:
# Checking for missing values
print(world_data.isnull().sum())

Country       0
Lat           0
Long          0
Date          0
Confirmed     0
Deaths        0
Recovered     0
Active        0
WHO Region    0
dtype: int64


In [56]:
# Convert 'Date' column to datetime format
world_data['Date'] = pd.to_datetime(world_data['Date'], dayfirst=True)

In [57]:
# Convert 'Date' column back to string format 'YYYY/MM/DD'
world_data['Date'] = world_data['Date'].dt.strftime('%Y/%m/%d')

In [59]:
# Renaming columns for clarity
world_data.rename(columns={
    'Country/Region': 'Country'
}, inplace=True)


In [60]:
# Checking the recent data
world_data.tail()

Unnamed: 0,Country,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,WHO Region
49063,Sao Tome and Principe,0.1864,6.6131,2020/07/27,865,14,734,117,Africa
49064,Yemen,15.552727,48.516388,2020/07/27,1691,483,833,375,Eastern Mediterranean
49065,Comoros,-11.6455,43.3333,2020/07/27,354,7,328,19,Africa
49066,Tajikistan,38.861,71.2761,2020/07/27,7235,60,6028,1147,Europe
49067,Lesotho,-29.61,28.2336,2020/07/27,505,12,128,365,Africa


The dataframe contains data of over 5 months from 2020-01-22 till 2020/05/23

In [61]:
# Plotting for growth in total confirmed cases over time
confirmed = world_data.groupby('Date').sum()['Confirmed'].reset_index()

fig = px.bar(confirmed, x='Date', y='Confirmed',
             labels={'Date': 'Date', 'Confirmed': 'Confirmed Cases'},
             title='Total Confirmed Cases Over Time')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Total Confirmed Cases')
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [62]:
# trend of total deaths over time
deaths = world_data.groupby('Date').sum()['Deaths'].reset_index()

fig = px.bar(deaths, x='Date', y='Deaths',
             labels={'Date': 'Date', 'Deaths': 'Total Deaths'},
             title='Total Deaths Over Time')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Total Deaths')
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [63]:
# trend of total recoveries over time
recoveries = world_data.groupby('Date').sum()['Recovered'].reset_index()

fig = px.bar(recoveries, x='Date', y='Recovered',
             labels={'Date': 'Date', 'Recovered': 'Total Recovered'},
             title='Total Recoveries Over Time')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Total Recovered')
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



This trend signifies a continuous increase in COVID-19 cases, fatalities, and recoveries as time progresses.

In [21]:
# Geospatial Visualization - World Map

# Create a choropleth map to visualize total confirmed cases across different countries
fig = px.choropleth(world_data,
                    locations="Country",
                    locationmode='country names',
                    color="Confirmed",
                    hover_name="Country",
                    projection="natural earth",
                    title="COVID-19 Confirmed Cases Worldwide",
                    animation_frame="Date",
                    animation_group="Country"
                   )
fig.update_layout(coloraxis_colorbar=dict(title='Confirmed Cases'))
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 100
fig.show()

In [22]:
# Visualising the deaths worldwide
# Create a scatter plot on a world map
fig = px.scatter_geo(world_data, lat='Lat', lon='Long', color='Confirmed',
                     hover_name='Country', size='Deaths',
                     projection='natural earth',
                     title='COVID-19 Deaths Worldwide')
fig.show()


In [23]:
# Comparative Analysis - Line Plot for Top Countries
# Find top 5 countries with the highest total confirmed cases
top_countries_total = world_data.groupby('Country').agg({'Confirmed': 'max'}).reset_index()
top_countries_total = top_countries_total.sort_values(by='Confirmed', ascending=False).head(5)
top_countries_total = top_countries_total['Country'].tolist()

# Filter data for these top countries
top_countries_filtered = world_data[world_data['Country'].isin(top_countries_total)]

fig = px.line(top_countries_filtered, x='Date', y='Confirmed', color='Country',
              labels={'Confirmed': 'Confirmed Cases', 'Date': 'Date'},
              title='Top 5 Countries with Highest Total Confirmed Cases Over Time')
fig.show()

In [24]:
# Stacked Area Chart - Distribution of Cases
stacked_data = world_data.groupby('Date').sum().reset_index()

fig = px.area(stacked_data, x='Date', y=['Confirmed', 'Deaths', 'Recovered'],
              labels={'variable': 'Status', 'value': 'Count', 'Date': 'Date'},
              title='Stacked Area Chart for COVID-19 Cases')
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [64]:
# Mortality and Recovery Rates - Line Plot
mortality_recovery = world_data.groupby('Date').sum().reset_index()
mortality_recovery['Mortality Rate'] = (mortality_recovery['Deaths'] / mortality_recovery['Confirmed']) * 100
mortality_recovery['Recovery Rate'] = (mortality_recovery['Recovered'] / mortality_recovery['Confirmed']) * 100

fig = px.line(mortality_recovery, x='Date', y=['Mortality Rate', 'Recovery Rate'],
              labels={'value': 'Rate', 'Date': 'Date'},
              title='Mortality and Recovery Rates Over Time')
fig.show()


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



# Create Prophet Model and Make Predictions:

In [26]:
from prophet import Prophet

In [27]:
# Load and Prepare COVID-19 Time Series Data

# Preprocess the data
covid_data_prophet = covid_data[['Date', 'Confirmed']]

# Convert the 'Date' column to datetime format
covid_data_prophet['Date'] = pd.to_datetime(covid_data_prophet['Date'])

# Prepare the data for the Prophet model
covid_data_prophet.rename(columns={'Confirmed': 'y', 'Date': 'ds'}, inplace=True)

# Display the first few rows of the prepared data
covid_data_prophet.head()

Unnamed: 0,ds,y
0,2020-01-30,1
1,2020-01-31,1
2,2020-02-01,2
3,2020-02-02,3
4,2020-02-03,3


In [28]:
# Create Prophet Model and Make Predictions
import plotly.graph_objs as go

# Create a Prophet model
model = Prophet()

# Fit the model to the COVID-19 data
model.fit(covid_data_prophet)

# Make future predictions
max_date = covid_data_prophet['ds'].max()  # Get the maximum date from the 'ds' column
future_dates = pd.DataFrame({'ds': pd.date_range(start=max_date, periods=30, freq='D')})

# Forecast future confirmed cases
forecasted_data = model.predict(future_dates)

# Display the forecasted data
forecasted_data

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/xuoribre.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/4c7qb4vc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=35733', 'data', 'file=/tmp/tmpwz_5wbvs/xuoribre.json', 'init=/tmp/tmpwz_5wbvs/4c7qb4vc.json', 'output', 'file=/tmp/tmpwz_5wbvs/prophet_modelp__r0cgd/prophet_model-20231206105144.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
10:51:44 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
10:51:47 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2021-08-11,999808.9,177091.659729,1760910.0,999808.9,999808.9,-2341.766197,-2341.766197,-2341.766197,-2341.766197,-2341.766197,-2341.766197,0.0,0.0,0.0,997467.1
1,2021-08-12,1003782.0,265274.10846,1772166.0,1003782.0,1003782.0,1018.626705,1018.626705,1018.626705,1018.626705,1018.626705,1018.626705,0.0,0.0,0.0,1004801.0
2,2021-08-13,1007755.0,264070.66925,1763398.0,1007755.0,1007755.0,551.978697,551.978697,551.978697,551.978697,551.978697,551.978697,0.0,0.0,0.0,1008307.0
3,2021-08-14,1011728.0,211697.155425,1812970.0,1011728.0,1011728.0,481.076183,481.076183,481.076183,481.076183,481.076183,481.076183,0.0,0.0,0.0,1012209.0
4,2021-08-15,1015701.0,281583.439036,1736181.0,1015666.0,1015701.0,603.366983,603.366983,603.366983,603.366983,603.366983,603.366983,0.0,0.0,0.0,1016304.0
5,2021-08-16,1019674.0,286957.557006,1788933.0,1019503.0,1019674.0,427.60608,427.60608,427.60608,427.60608,427.60608,427.60608,0.0,0.0,0.0,1020102.0
6,2021-08-17,1023647.0,303887.705912,1790220.0,1023329.0,1023840.0,-740.888452,-740.888452,-740.888452,-740.888452,-740.888452,-740.888452,0.0,0.0,0.0,1022906.0
7,2021-08-18,1027620.0,202649.530198,1806405.0,1027084.0,1028025.0,-2341.766197,-2341.766197,-2341.766197,-2341.766197,-2341.766197,-2341.766197,0.0,0.0,0.0,1025278.0
8,2021-08-19,1031593.0,262549.03804,1857195.0,1030819.0,1032280.0,1018.626705,1018.626705,1018.626705,1018.626705,1018.626705,1018.626705,0.0,0.0,0.0,1032612.0
9,2021-08-20,1035566.0,239398.19179,1811036.0,1034461.0,1036604.0,551.978697,551.978697,551.978697,551.978697,551.978697,551.978697,0.0,0.0,0.0,1036118.0


In [29]:
# Combine the actual and forecasted data
combined_data = pd.concat([covid_data_prophet, forecasted_data[['ds', 'yhat']]], sort=False)

# Create traces for actual and forecasted data
trace_actual = go.Scatter(x=combined_data['ds'], y=combined_data['y'], mode='lines', name='Actual')
trace_forecast = go.Scatter(x=combined_data['ds'], y=combined_data['yhat'], mode='lines', name='Forecast')

# Create layout for the plot
layout = go.Layout(
    title='Actual vs Forecasted Confirmed Cases',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Confirmed Cases')
)

# Create figure and add traces to it
fig = go.Figure(data=[trace_actual, trace_forecast], layout=layout)

# Display the figure
fig.show()

Upon analysis, our model forecasted a projection indicating an estimated total of 1.11M confirmed cases in India by the date of September 9, 2021, under the assumption of a continued trajectory in the increase of confirmed cases. This prediction is based on historical data trends and the model's forecasting capabilities.


In [30]:
# Forecasting deaths

# Preprocess the data
covid_deaths_prophet = covid_data[['Date', 'Deaths']]

# Convert the 'Date' column to datetime format
covid_deaths_prophet['Date'] = pd.to_datetime(covid_deaths_prophet['Date'])

# Prepare the data for the Prophet model
covid_deaths_prophet.rename(columns={'Deaths': 'y', 'Date': 'ds'}, inplace=True)

In [31]:
# Create a Prophet model for deaths
model_deaths = Prophet()

# Fit the model to the COVID-19 deaths data
model_deaths.fit(covid_deaths_prophet)

# Make future predictions for deaths
future_dates_deaths = model_deaths.make_future_dataframe(periods=30)

# Forecast future deaths
forecasted_deaths = model_deaths.predict(future_dates_deaths)

# Combine the actual and forecasted deaths data
combined_deaths = pd.concat([covid_deaths_prophet, forecasted_deaths[['ds', 'yhat']]], sort=False)

# Create traces for actual and forecasted deaths
trace_actual_deaths = go.Scatter(x=combined_deaths['ds'], y=combined_deaths['y'], mode='lines', name='Actual Deaths')
trace_forecasted_deaths = go.Scatter(x=combined_deaths['ds'], y=combined_deaths['yhat'], mode='lines', name='Forecasted Deaths')

# Create layout for the plot
layout_deaths = go.Layout(
    title='Actual vs Forecasted Deaths',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Deaths')
)

# Create figure and add traces to it
fig_deaths = go.Figure(data=[trace_actual_deaths, trace_forecasted_deaths], layout=layout_deaths)

# Display the figure
fig_deaths.show()

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/lywewn80.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/x3wdvcsj.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57857', 'data', 'file=/tmp/tmpwz_5wbvs/lywewn80.json', 'init=/tmp/tmpwz_5wbvs/x3wdvcsj.json', 'output', 'file=/tmp/tmpwz_5wbvs/prophet_model2hm3r1ry/prophet_model-20231206105230.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
10:52:30 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
10:52:32 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


Upon analysis, our model forecasted a projection indicating an estimated total of 15.4K death cases in India by the date of September 9, 2021, under the assumption of a continued trajectory in the increase of deaths. This prediction is based on historical data trends and the model's forecasting capabilities.

#FORECASTING WORLD COVID DATA

In [65]:
# Grouping data by 'Date' and aggregating 'Confirmed' cases
world_grouped = world_data.groupby('Date')['Confirmed'].sum().reset_index()

# Renaming columns for Prophet compatibility
world_grouped.columns = ['ds', 'y']
world_grouped['ds'] = pd.to_datetime(world_grouped['ds'])

In [66]:
# Create a Prophet model for confirmed cases
model_confirmed = Prophet()
model_confirmed.fit(world_grouped)

# Generate future dates for prediction (adjust periods as needed)
future_dates_confirmed = model_confirmed.make_future_dataframe(periods=30)

# Forecast future confirmed cases
forecast_confirmed = model_confirmed.predict(future_dates_confirmed)

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/cprctvg3.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/kbtrvqi8.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=4397', 'data', 'file=/tmp/tmpwz_5wbvs/cprctvg3.json', 'init=/tmp/tmpwz_5wbvs/kbtrvqi8.json', 'output', 'file=/tmp/tmpwz_5wbvs/prophet_model6iuf7z_8/prophet_model-20231206112216.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
11:22:16 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:22:16 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


In [67]:
# Create a Plotly figure for the forecasted data
fig = go.Figure()

# Add actual confirmed cases data
fig.add_trace(go.Scatter(x=world_grouped['ds'], y=world_grouped['y'], mode='lines', name='Actual Cases'))

# Add forecasted confirmed cases data
fig.add_trace(go.Scatter(x=forecast_confirmed['ds'], y=forecast_confirmed['yhat'], mode='lines', name='Forecast'))

# Update layout and show plot
fig.update_layout(title='Forecasted Confirmed Cases',
                  xaxis_title='Date',
                  yaxis_title='Confirmed Cases')
fig.show()

Upon analysis, our model forecasted a projection indicating an estimated total of 22.124M confirmed cases in the world by the date of Aug 26, 2021, under the assumption of a continued trajectory in the increase of confirmed cases. This prediction is based on historical data trends and the model's forecasting capabilities.

In [68]:
# Grouping data by 'Date' and aggregating 'Deaths' cases
world_deaths = world_data.groupby('Date')['Deaths'].sum().reset_index()

# Renaming columns for Prophet compatibility
world_deaths.columns = ['ds', 'y']
world_deaths['ds'] = pd.to_datetime(world_deaths['ds'])  # Convert 'ds' to datetime if not already in datetime format

# Create a Prophet model for deaths
model_deaths = Prophet()
model_deaths.fit(world_deaths)

# Generate future dates for death predictions (adjust periods as needed)
future_dates_deaths = model_deaths.make_future_dataframe(periods=30)

# Forecast future deaths
forecast_deaths = model_deaths.predict(future_dates_deaths)

# Plotting the forecasted deaths
fig_deaths = go.Figure()
fig_deaths.add_trace(go.Scatter(x=world_deaths['ds'], y=world_deaths['y'], mode='lines', name='Actual Deaths'))
fig_deaths.add_trace(go.Scatter(x=forecast_deaths['ds'], y=forecast_deaths['yhat'], mode='lines', name='Forecast'))
fig_deaths.update_layout(title='Forecasted Deaths', xaxis_title='Date', yaxis_title='Deaths')
fig_deaths.show()

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/tfk5640c.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/ntjz5iij.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=34287', 'data', 'file=/tmp/tmpwz_5wbvs/tfk5640c.json', 'init=/tmp/tmpwz_5wbvs/ntjz5iij.json', 'output', 'file=/tmp/tmpwz_5wbvs/prophet_modelwzswlzga/prophet_model-20231206113117.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
11:31:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:31:17 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


Upon analysis, our model forecasted a projection indicating an estimated total of 798.7k death cases in the world by the date of Aug 26, 2021, under the assumption of a continued trajectory in the increase of death cases. This prediction is based on historical data trends and the model's forecasting capabilities.

In [69]:
# Grouping data by 'Date' and aggregating 'Recovered' cases
world_recovered = world_data.groupby('Date')['Recovered'].sum().reset_index()

# Renaming columns for Prophet compatibility
world_recovered.columns = ['ds', 'y']
world_recovered['ds'] = pd.to_datetime(world_recovered['ds'])  # Convert 'ds' to datetime if not already in datetime format

# Create a Prophet model for recoveries
model_recovered = Prophet()
model_recovered.fit(world_recovered)

# Generate future dates for recovery predictions (adjust periods as needed)
future_dates_recovered = model_recovered.make_future_dataframe(periods=30)

# Forecast future recoveries
forecast_recovered = model_recovered.predict(future_dates_recovered)

# Plotting the forecasted recoveries
fig_recovered = go.Figure()
fig_recovered.add_trace(go.Scatter(x=world_recovered['ds'], y=world_recovered['y'], mode='lines', name='Actual Recoveries'))
fig_recovered.add_trace(go.Scatter(x=forecast_recovered['ds'], y=forecast_recovered['yhat'], mode='lines', name='Forecast'))
fig_recovered.update_layout(title='Forecasted Recoveries', xaxis_title='Date', yaxis_title='Recoveries')
fig_recovered.show()


INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/_z6h9_ql.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpwz_5wbvs/z1ihnjfo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=94669', 'data', 'file=/tmp/tmpwz_5wbvs/_z6h9_ql.json', 'init=/tmp/tmpwz_5wbvs/z1ihnjfo.json', 'output', 'file=/tmp/tmpwz_5wbvs/prophet_model9e6tj7lm/prophet_model-20231206113144.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
11:31:44 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:31:44 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


Upon analysis, our model forecasted a projection indicating an estimated total of 13.28M recovered cases in the world by the date of Aug 26, 2021, under the assumption of a continued trajectory in the increase of recovered cases. This prediction is based on historical data trends and the model's forecasting capabilities.