In [2]:
import pandas as pd
import altair as alt

# First, a bit of data cleaning/processing : 

In [3]:
import country_converter as coco

# Create a dictionary that maps countries to continents
country_to_continent = {}
for country in coco.CountryConverter().data['name_short']:
    try:
        continent = coco.CountryConverter().convert(country, to='Continent')
        country_to_continent[country] = continent
    except:
        pass

# Some countries are missing in the dictnnary : here we add the missing countries
missing_region_data = {'Brunei': 'Asia',
'Burma': 'Asia',
'Congo': 'Africa',
'Democratic Republic of Congo': 'Africa',
'Czechia': 'Europe',
'Holy See': 'Europe',
'Kyrgyzstan': 'Asia',
'Saint Kitts and Nevis': 'North America',
'Saint Lucia': 'North America',
'Saint Vincent and the Grenadines': 'North America',
'Taiwan*': 'Asia',
'Turkey': 'Asia',
'US': 'America',
'West Bank and Gaza': 'Asia',
'Cape Verde' : 'Africa'}

for key in missing_region_data.keys() :
  country_to_continent[key] = missing_region_data[key]

In [4]:
all_countries = country_to_continent.keys()

# Data Preprocessing : deleting useless columns, deleting rows withmissing values (0.01%) and merging tables

def process_data(path, column_name) :

  table = pd.read_csv(path, parse_dates=['date'])

  table.drop(['World'], axis = 1, inplace = True)

  for country in table.columns[1:] :
    if country not in all_countries :
      table.drop(country, axis = 1, inplace = True)
  # Melt the table
  melted = pd.melt(table, id_vars=['date'], value_vars=table.columns[1:])

  # Rename columns
  melted = melted.rename(columns={'variable': 'Country'})

  # Sort by date
  melted = melted.sort_values(by='date').reset_index()

  melted.drop(melted.columns[0], axis = 1, inplace = True)
  melted = melted.dropna()
  melted = melted.rename(columns = {'date': 'Date', 'value': column_name})
  melted[column_name] = melted[column_name].apply(lambda x : int(x)) 
  return melted

new_cases = process_data('/content/new_cases.txt', 'New cases')
new_deaths = process_data('/content/new_deaths.txt', 'New deaths')

def merge_tables(new_cases, new_deaths):
  merged = pd.merge(new_cases, new_deaths, on=['Country', 'Date'], how='inner')
  return merged

full_clean_data = merge_tables(new_cases, new_deaths)

In [5]:
# Adding a 'Continent' column
full_clean_data['Continent'] = full_clean_data['Country'].map(country_to_continent)

In [6]:
# We group together North America and Sount America to America

full_clean_data['Continent'] = full_clean_data['Continent'].apply(lambda x : 'America' if x == 'North America' else x)
full_clean_data['Continent'] = full_clean_data['Continent'].apply(lambda x : 'America' if x == 'South America' else x)

In [7]:
# We only keep data from 2020 :
start_date = pd.to_datetime('2020-01-03')
end_date = pd.to_datetime('2020-12-31')
full_clean_data = full_clean_data[(full_clean_data['Date'] >= start_date) & (full_clean_data['Date'] <= end_date)]


In [8]:
grouped_data = full_clean_data.groupby(['Continent', 'Date'], as_index=False)['New cases'].sum()

In [9]:
# We filter on certain countries (2 to 3 per continent) - to make data exploitable by Altair (5000 rows max)

countries = ['Nigeria',  'Ethiopia', 'Egypt', 
             'India', 'China',
             'France', 'Italy',
             'United States', 'Brazil', 
             'Australia', 'New Zealand']


# We map selected countries to their number of inhabitants in 2020

country_population = {'Nigeria': 206139587,
                      'Ethiopia': 114963588,
                      'Egypt': 102334404,
                      'Indonesia': 273523615,
                      'India': 1380004385,
                      'China': 1439323776,
                      'France': 65273511,
                      'Italy': 60461828,
                      'United States': 331002651,
                      'Mexico': 128932753,
                      'Brazil': 212559417,
                      'Australia': 25499884,
                      'New Zealand': 4822233}

# We map continent to their number of inhabitants in 2020

population_per_continent = {'Africa': 1.34e9,
                            'America' : 1.01e9,
                            'Asia': 4.64e9,
                            'Europe': 746e6,
                            'Oceania' : 43e6}

# We filter data on the listed counrties above
selected_data = full_clean_data[full_clean_data['Country'].isin(countries)]

# On the filtered data, we create a Ratio : Number of new cases and number of deaths for 100 000 inhabitants
selected_data['Cases_Ratio'] = selected_data['New cases'] / selected_data['Country'].map(country_population)*1e5
selected_data['Deaths_Ratio'] = selected_data['New deaths'] / selected_data['Country'].map(country_population)*1e5


selected_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['Cases_Ratio'] = selected_data['New cases'] / selected_data['Country'].map(country_population)*1e5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['Deaths_Ratio'] = selected_data['New deaths'] / selected_data['Country'].map(country_population)*1e5


Unnamed: 0,Date,Country,New cases,New deaths,Continent,Cases_Ratio,Deaths_Ratio
47,2020-01-03,India,0,0,Asia,0.000000,0.000000
52,2020-01-03,Ethiopia,0,0,Africa,0.000000,0.000000
75,2020-01-03,Italy,0,0,Europe,0.000000,0.000000
80,2020-01-03,Australia,0,0,Oceania,0.000000,0.000000
81,2020-01-03,United States,0,0,America,0.000000,0.000000
...,...,...,...,...,...,...,...
81703,2020-12-31,New Zealand,11,0,Oceania,0.228110,0.000000
81761,2020-12-31,United States,208904,3351,America,63.112485,1.012379
81764,2020-12-31,India,21822,299,Asia,1.581299,0.021667
81806,2020-12-31,Australia,31,0,Oceania,0.121569,0.000000


# First plot

In [10]:
import altair as alt
import pandas as pd

# Load data
country_data = selected_data

# Aggregate data by date and region on the non-filtered data
agg_data = full_clean_data.groupby(['Date', 'Continent'], as_index=False)['New cases'].agg(['sum']).reset_index()

# Calculate the same ratio than above for the total continents 
agg_data['Cases_Ratio'] = agg_data['sum'] / agg_data['Continent'].map(population_per_continent)*1e5

# Caculate a moving average, and the associated standard deviation (to get trends for the whole continent)
agg_data['MA'] = agg_data.groupby('Continent')['Cases_Ratio'].transform(lambda x: x.rolling(window=15, min_periods=1).mean())

agg_data['SD'] = agg_data.groupby('Continent')['Cases_Ratio'].transform(lambda x: x.rolling(window=15, min_periods=1).std())

# Calculate upper and lower bounds : We want  to plot a trend curve with a shaded area
agg_data['upper'] = agg_data['MA'] + 2*agg_data['SD']
agg_data['lower'] = agg_data['MA'] - 2*agg_data['SD']


# Create a line chart for the mean values (trend for the whole continent)
trend_avg = alt.Chart(agg_data).mark_line(color='#FEEEBD', opacity=1).encode(
    x=alt.X('Date:T', title='Date'),
    y=alt.Y('MA:Q', title='Number of cases for 100 000 inhab', axis=alt.Axis(tickMinStep=10)),
    color=alt.Color('Continent:N', legend=alt.Legend(title='Continent')),
)

# Create a shaded area for the upper and lower bounds (trend for the whole continent)
trend_std_dev = alt.Chart(agg_data).mark_area(opacity=0.2).encode(
    x=alt.X('Date:T'),
    y=alt.Y('lower:Q', title=''),
    y2='upper:Q',
    color=alt.Color('Continent:N', legend=None),
)


"""
# Create a smoothed filling around trend_avg with a width of the standard deviation of number of new cases per continent
trend_std_dev = trend_avg.mark_area(
    color='#FEEEBD', opacity=0.2
).transform_window(
    rolling_mean='mean(Cases_Ratio)',
    frame=[-100, 100] # Use a rolling average over 7 days
).encode(
    x='Date:T',
    y=alt.Y('rolling_mean:Q', title='New cases ratio', axis=alt.Axis(format='~s')),
)

"""
# Create a scatter plot for countries in the selected continent

continent_dropdown = alt.binding_select(options=country_data['Continent'].unique().tolist(), name='Continent: ')
continent_select = alt.selection_single(fields=['Continent'], bind=continent_dropdown, init={'Continent': 'Europe'})

scatter = alt.Chart(country_data).mark_circle(opacity=0.7, size=50).encode(
    x=alt.X('Date:T', title='Date'),
    y=alt.Y('Cases_Ratio:Q', title=''),
    color=alt.Color('Country:N', legend=alt.Legend(title='Country')),
    tooltip=[
        alt.Tooltip(field='Country', title='Country'),
        alt.Tooltip(field='New cases', title='New cases'),
        alt.Tooltip(field='New deaths', title='New deaths'),
        alt.Tooltip(field='Date', title='Date', type='temporal', format='%Y-%m-%d')
    ]
).properties(
    width=600,
    height=400,
    title='Number of new cases for 100 000 inhab, by country by day, for the selected continent'
).add_selection(
    continent_select
).transform_filter(
    continent_select
).interactive()

"""
lines = alt.Chart(country_data).mark_line(opacity=0.5).encode(
    x=alt.X('Date:T', title='Date'),
    y=alt.Y('New cases:Q', title='New cases'),
    color=alt.Color('Country:N', legend=alt.Legend(title='Country'))
).properties(
    width=600,
    height=400
)


lines_filtered = lines.transform_filter(
    continent_select
)

"""





# Filter aggregated data by selected continent
trend_avg_filtered = trend_avg.transform_filter(
    continent_select
)

trend_std_dev_filtered = trend_std_dev.transform_filter(
    continent_select
)

# Create a chart for the dropdown menu
dropdown = alt.Chart(country_data).mark_text(fontSize=14, font='Helvetica').encode(
    text=alt.Text('Continent:N')
).transform_filter(
    continent_select
).properties(
    width=100,
    height=400
)

# Combine the scatter plot, trend line, and dropdown selection button
chart = (alt.layer(scatter, trend_avg_filtered, trend_std_dev_filtered).resolve_axis(x='shared') | dropdown).configure_axis(
    grid=True
)


chart

# Second plot

In [11]:
selected_day_wise = full_clean_data.groupby(['Date', 'Continent']).agg({'New cases': 'sum'}).reset_index()

In [12]:
selected_day_wise = full_clean_data.groupby(['Date', 'Continent']).agg({'New cases': 'sum'}).reset_index()

population_per_continent = {'Africa': 1.34e9,
                            'America' : 1.01e9,
                            'Asia': 4.64e9,
                            'Europe': 746e6,
                            'Oceania' : 43e6}



selected_day_wise['Cases_Ratio'] = selected_day_wise['New cases'] / selected_day_wise['Continent'].map(population_per_continent)

In [13]:
# Objective : Comparing continent on a selected period
# We plot the number of new_cases per continent, we add a selection tool and we link the plot to a bar chart

interval = alt.selection_interval()
circle = alt.Chart(selected_day_wise).mark_circle().encode(
    x='monthdate(Date):O',
    y='Continent',
    color=alt.condition(interval, 'Continent', alt.value('lightgray')),
    size=alt.Size('New cases:Q',
        scale=alt.Scale(range=[0, 3000]),
        legend=alt.Legend(title='Daily new cases')
    )
).properties(
    width=700,
    height=300,
    selection=interval,
    title='Daily New Cases' # added title
)
bars = alt.Chart(selected_day_wise).mark_bar().encode(
    y='Continent',
    color='Continent',
    x='sum(New cases):Q'
).properties(
    width=700,
    title='Sum of daily new detected cases during the selected period' # added title
).transform_filter(
    interval
)
circle & bars



# Third plot

In [14]:
df = pd.read_csv('/content/owid-covid-data.csv')

In [15]:
# Featuring on data from 2020 
df['year'] = pd.DatetimeIndex(df['date']).year
df = df[df['year'] == 2020]

In [16]:
# Dropping NA values
df = df.dropna(subset=['total_deaths_per_million', 'new_cases_per_million'])

# Aggregating all the data from 2020 by countries
df = df.sort_values('date').groupby('location').agg({'new_cases' : 'sum', 'new_deaths':'sum', 'population': 'mean'})


df['new_cases'] = df['new_cases'].apply(int)
df['new_deaths'] = df['new_deaths'].apply(int)

# Creating the rations (number of affected people per 100 000 inhab)
df['cases_ratio'] = df['new_cases']/df['population']*1e5
df['deaths_ratio'] = (df['new_deaths']/df['population'])*1e5

df['cases_ratio'] = df['cases_ratio'].apply(int)
df['deaths_ratio'] = df['deaths_ratio'].apply(int)

df = df.reset_index()
df['location'] = df['location'].replace('United States', 'United States of America')

In [18]:
import pandas as pd
import altair as alt

alt.data_transformers.disable_max_rows()

url = "https://raw.githubusercontent.com/deldersveld/topojson/master/world-countries-sans-antarctica.json"

# data_map = alt.topo_feature(url, "continent_Europe_subunits")
data_map = alt.topo_feature(url, "countries1")


map_chart = alt.Chart(data_map).mark_geoshape().encode(
    tooltip=[alt.Tooltip('properties.name:N', title = "Country "), alt.Tooltip('deaths_ratio:Q', title = "Total deaths per 100 000 inhab in 2020"), alt.Tooltip('cases_ratio:Q', title = "Total cases per 100 000 inhab in 2020")],
    color=alt.Color('deaths_ratio:Q', scale=alt.Scale(scheme='yelloworangebrown'), legend=alt.Legend(title = 'Number of deaths per' +  '\n'  + '100 000 inhab.'))
).project('mercator').properties(
    width=800,
    height=500
).transform_lookup(
    lookup='properties.name',
    from_=alt.LookupData(df, 'location', ['location', 'cases_ratio','deaths_ratio' ])#. # , 'period_str'
).properties(
    width=800,
    height=500, 
    title = 'Total deaths due to Covid-19 per 100 000 inhabitants in 2020'
)

map_chart

# Rendering the plot in a html file: 

In [19]:
charts_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>TITRE</title>
    <link rel="stylesheet" href="mystyle.css">
      <script src="https://cdn.jsdelivr.net/npm/vega@{vega_version}"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@{vegalite_version}"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@{vegaembed_version}"></script>

   
</head>
<body>
    <header>
        <h1>Understanding COVID-19 spread in the world in 2020</h1>
        <h3> 
        
          Explore the journey of COVID-19's impact across the continents with our set of three informative plots. These visualizations present patterns and trends of the pandemic in a neutral manner, allowing for a clear understanding of the virus's progression and consequences. The first plot displays the number of new cases per 100,000 inhabitants by country on a day-to-day basis. The second plot demonstrates the cumulative effect of daily infections during the selected period. Lastly, a map illustrates the total number of deaths country by country, highlighting the human cost of the pandemic. By examining this trio of complementary visualizations, gain valuable insight into the pandemic's multifaceted impact on the continent, fostering a deeper analysis of the virus's reach and repercussions.

        </h3>
    </header>
    <div class="container">

    <div class="plot">
      <h1> About the data we used </h1>
      <p>We used the data produced by the famous platform Our World In Data : https://ourworldindata.org/covid-vaccinations?. 
      
      The figures gathered in the dataset all come from official sources. Covid-related figures come from the governments of the countries themselves. Keep this in mind, as some countries may not communicate their true figures, for instance for geopolitical reasons. Also, some countries can have more difficulties to gather true figures about the evolution of the covid in their country, due to lack of medical infrastructure for instance. 
      </p>

    </div>
        <div class="plot">
            <h2>Spread of the virus in each continent</h2>
            <p>Here is the number of new COVID-19 cases per 100,000 inhabitants by country by day for a selected continent. </p>
            <p>It allows us to analyze the infection rate across previously selected countries within the continent and identify patterns or trends.</p>
            
            <p>By observing the fluctuations, we can identify the effectiveness of healthcare systems in different countries</p>
                <div id="vis1"></div>
           
        </div>

        <div class="plot">
            <h2>Comparison of the spread in each continent</h2>
            <p>To get a more precise idea about the comparison of the evolution throughout all of the continents. You can select a specific time range by clicking and hovering to select an area on the chart.</p>

             <p>With this tool you can clearly see what was the impact of the covid on a given date. You can see that Europe was stroke really hard in October, while the spread in Asia was more smooth and did not happen specifically in one month.</p>
                <div id="vis2"></div>
            
        </div>

        <div class="plot">
            <h2>The death toll of covid in Europe</h2>
              <p>The third plot is a map illustrating the total number of deaths by country. This visualization is compelling as it demonstrates the human toll of the pandemic and allows us to compare the mortality rates across countries. It helps us to identify countries that have been more severely impacted and may need additional support.</p>
              
              <p>An interesting point to see is that rich countries seem to have been more more impacted than the rest of the world. A important point though is that it may be explained by the fact that rich countries can more easily count their covid death toll, due to good medical infrastructures.</p>

              <p>Another interesting point is the case of China, which seems to not have been impacted by covid. It may be explained by a geopolitical will to keep their figures secret as it was relayed on the press. </p>
              
              <p>The map complements the first two plots by adding another dimension to our understanding of the pandemic: while the first plot focuses on the infection rate and the second plot shows the total number of cases, the third plot brings attention to the ultimate consequence of the virus – the loss of lives. By analyzing all three plots together, we can gain a comprehensive understanding of the pandemic's impact on the selected continent, both in terms of infections and deaths.</p>
                <div id="vis3"></div>

            
        </div>
    </div>

    <script type="text/javascript">
        vegaEmbed('#vis1', {spec1}).catch(console.error);
        vegaEmbed('#vis2', {spec2}).catch(console.error);
        vegaEmbed('#vis3', {spec3}).catch(console.error);  


      </script>

</body>
</html>
"""


combined_chart = circle & bars


with open('sample_data/charts.html', 'w') as f:
    f.write(charts_template.format(
        vega_version=alt.VEGA_VERSION,
        vegalite_version=alt.VEGALITE_VERSION,
        vegaembed_version=alt.VEGAEMBED_VERSION,
        spec1=chart.to_json(indent=None),
        spec2=combined_chart.to_json(indent=None),
        spec3=map_chart.to_json(indent=None),
    ))