In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [12]:
df = pd.read_csv("covid_data.csv")
df.date = pd.to_datetime(df.date)

In [13]:
df.head()

Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Afghanistan,2019-12-31,0,0,38928341.0,2.581,1803.987,0.5
1,Afghanistan,2020-01-01,0,0,38928341.0,2.581,1803.987,0.5
2,Afghanistan,2020-01-02,0,0,38928341.0,2.581,1803.987,0.5
3,Afghanistan,2020-01-03,0,0,38928341.0,2.581,1803.987,0.5
4,Afghanistan,2020-01-04,0,0,38928341.0,2.581,1803.987,0.5


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23082 entries, 0 to 23081
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   location                    23082 non-null  object        
 1   date                        23082 non-null  datetime64[ns]
 2   new_cases                   23082 non-null  int64         
 3   new_deaths                  23082 non-null  int64         
 4   population                  23018 non-null  float64       
 5   aged_65_older_percent       20643 non-null  float64       
 6   gdp_per_capita              20711 non-null  float64       
 7   hospital_beds_per_thousand  19144 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 1.4+ MB


In [15]:
df.describe(include="all")

Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
count,23082,23082,23082.0,23082.0,23018.0,20643.0,20711.0,19144.0
unique,212,,,,,,,
top,Germany,,,,,,,
freq,163,,,,,,,
mean,,2020-04-10 02:09:57.036652032,624.769257,35.629062,103700300.0,9.75857,22708.384791,3.204687
min,,2019-12-31 00:00:00,-2461.0,-1918.0,809.0,1.144,661.24,0.1
25%,,2020-03-18 00:00:00,0.0,0.0,2083380.0,3.853,6426.674,1.4
50%,,2020-04-16 00:00:00,3.0,0.0,9449321.0,7.646,15524.995,2.6
75%,,2020-05-14 00:00:00,55.0,1.0,33469200.0,15.322,35220.084,4.21
max,,2020-06-10 00:00:00,133510.0,10520.0,7794799000.0,27.049,116935.6,13.8


In [16]:
fig = px.line(df, 
              x='date', 
              y='new_cases', 
              color='location', 
              title='Covid')

# Show the figure
fig.show()

In [17]:
# Ensure the date column is in datetime format
df['date'] = pd.to_datetime(df['date'])

# Calculate the cumulative sum of new cases for each country
df['cumulative_cases'] = df.groupby('location')['new_cases'].cumsum()

# Pivot the DataFrame to have dates as rows and locations as columns for the cumulative cases
pivot_df = df.pivot(index='date', columns='location', values='cumulative_cases')

# Calculate the absolute difference in cumulative cases between Italy and Germany
pivot_df['cases_diff'] = (pivot_df['Italy'] - pivot_df['Germany']).abs()

# Find the first date where the difference is greater than 10,000
first_date_diff_over_10000 = pivot_df[pivot_df['cases_diff'] > 10000].index.min()

print(first_date_diff_over_10000)


2020-03-12 00:00:00


In [18]:
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit


# Filter for Italy and the specific date range
italy_df = df[(df['location'] == 'Italy') & 
              (df['date'] >= '2020-02-28') & 
              (df['date'] <= '2020-03-20')]

# Calculate cumulative cases for Italy within the date range
italy_df['cumulative_cases'] = italy_df['new_cases'].cumsum()

# Define the exponential model function
def exp_model(x, a, b):
    return a * np.exp(b * x)

# Prepare the data for curve fitting
x_data = np.arange(len(italy_df))
y_data = italy_df['cumulative_cases'].to_numpy()

# Fit the exponential model to the data
params, _ = curve_fit(exp_model, x_data, y_data, p0=(1, 0.1), maxfev=5000)

# Predict the cumulative cases on March 20, 2020
predicted_cases = exp_model(len(italy_df)-1, *params)

# Actual cumulative cases on March 20, 2020
actual_cases = y_data[-1]

# Calculate the difference between the predicted and actual cases
difference = predicted_cases - actual_cases

print(f"Predicted cumulative cases on 2020-03-20: {predicted_cases}")
print(f"Actual cumulative cases on 2020-03-20: {actual_cases}")
print(f"Difference: {difference}")


Predicted cumulative cases on 2020-03-20: 42346.66334162913
Actual cumulative cases on 2020-03-20: 40635
Difference: 1711.6633416291297




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [19]:
fig = px.line(df, 
              x='date', 
              y='new_deaths', 
              color='location', 
              title='Covid')

# Show the figure
fig.show()

In [20]:

# Group by country to sum up the total deaths and get the mean population for each country
country_deaths_population = df.groupby('location').agg(
    total_deaths=pd.NamedAgg(column='new_deaths', aggfunc='sum'),
    population=pd.NamedAgg(column='population', aggfunc='mean')  # Assuming population doesn't change
).reset_index()

# Calculate death rate per million inhabitants
country_deaths_population['death_rate_per_million'] = (country_deaths_population['total_deaths'] / country_deaths_population['population']) * 1e6

# Sort by death rate in descending order
country_deaths_population_sorted = country_deaths_population.sort_values(by='death_rate_per_million', ascending=False)

# Get the country with the 3rd highest death rate
third_highest_death_rate_country = country_deaths_population_sorted.iloc[2]

print(third_highest_death_rate_country)



location                     Andorra
total_deaths                      51
population                   77265.0
death_rate_per_million    660.066007
Name: 3, dtype: object


In [22]:
import pandas as pd

# Ensure necessary columns have no missing values
df_filtered = df.dropna(subset=['aged_65_older_percent', 'population', 'new_deaths'])

# Group by location and calculate necessary statistics
country_stats = df_filtered.groupby('location').agg(
    total_deaths=pd.NamedAgg(column='new_deaths', aggfunc='sum'),
    population=pd.NamedAgg(column='population', aggfunc='mean'),
    aged_65_older_percent=pd.NamedAgg(column='aged_65_older_percent', aggfunc='mean')
).reset_index()

# Calculate death rate per million
country_stats['death_rate_per_million'] = (country_stats['total_deaths'] / country_stats['population']) * 1e6

# Define actual and predicted conditions based on the statement
predicted_positive = country_stats['aged_65_older_percent'] > 20
actual_positive = country_stats['death_rate_per_million'] > 50
predicted_negative = ~predicted_positive
actual_negative = ~actual_positive

# Calculate True Positives, False Positives, True Negatives, False Negatives
TP = (predicted_positive & actual_positive).sum()
FP = (predicted_positive & actual_negative).sum()
TN = (predicted_negative & actual_negative).sum()
FN = (predicted_negative & actual_positive).sum()

# Calculate Precision, Recall, and F1 Score
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")


Precision: 0.5714285714285714
Recall: 0.10810810810810811
F1 Score: 0.18181818181818182


In [23]:
# Proportion of countries with GDP over $10,000
total_countries = len(df)
countries_with_high_gdp = len(df[df['gdp_per_capita'] > 10000])
P_A = countries_with_high_gdp / total_countries

# Proportion of countries with at least 5 hospital beds per 1000 inhabitants among countries with GDP over $10,000
countries_with_high_gdp_and_beds = len(df[(df['gdp_per_capita'] > 10000) & (df['hospital_beds_per_thousand'] >= 5)])
P_B_given_A = countries_with_high_gdp_and_beds / countries_with_high_gdp

# Proportion of countries with at least 5 hospital beds per 1000 inhabitants
total_countries_with_beds = len(df[df['hospital_beds_per_thousand'] >= 5])
P_B = total_countries_with_beds / total_countries

# Calculate P(A|B) using Bayes' Theorem
P_A_given_B = (P_B_given_A * P_A) / P_B

print("Probability that a country has GDP over $10,000 given they have at least 5 hospital beds per 1000 inhabitants:", P_A_given_B)


Probability that a country has GDP over $10,000 given they have at least 5 hospital beds per 1000 inhabitants: 0.8565304372677908
