In [9]:
# Challenge 1 - prepare dataset for encoding
import pandas as pd

# Read Covid vaccination data from the specified sheet in the Excel file
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"
df = pd.read_excel(url, sheet_name='by_country')

# Display initial dataframe info
print("Dataframe is saved in a variable")
print("\nInitial Dataframe Info:")
print(df.info())

# Task 1: Find out which columns have missing values
def check_missing_values(dataframe):
    missing_values = dataframe.isnull().sum()
    missing_columns = missing_values[missing_values > 0]
    print("\nColumns with Missing Values:")
    print(missing_columns)

check_missing_values(df)

# Task 2: Remove all rows with missing data in the total_vaccinations column
df_cleaned_total_vaccinations = df.dropna(subset=['total_vaccinations'])
print(f"\nRows after removing missing data in 'total_vaccinations': {df_cleaned_total_vaccinations.shape[0]}")

# Task 3: Remove all rows with missing data in the daily_vaccinations_per_million column
df_cleaned_daily_vaccinations_per_million = df_cleaned_total_vaccinations.dropna(subset=['daily_vaccinations_per_million'])
print(f"Rows after removing missing data in 'daily_vaccinations_per_million': {df_cleaned_daily_vaccinations_per_million.shape[0]}")

# Task 4: Find the median daily_vaccinations_per_million
median_daily_vaccinations_per_million = df_cleaned_daily_vaccinations_per_million['daily_vaccinations_per_million'].median()
print(f"\nMedian of 'daily_vaccinations_per_million': {median_daily_vaccinations_per_million}")

# Display the resulting dataframe info after cleaning
print("\nFinal Dataframe Info:")
print(df_cleaned_daily_vaccinations_per_million.info())




Dataframe is saved in a variable

Initial Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14994 entries, 0 to 14993
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   country                              14994 non-null  object        
 1   iso_code                             14994 non-null  object        
 2   date                                 14994 non-null  datetime64[ns]
 3   total_vaccinations                   9011 non-null   float64       
 4   people_vaccinated                    8370 non-null   float64       
 5   people_fully_vaccinated              6158 non-null   float64       
 6   daily_vaccinations_raw               7575 non-null   float64       
 7   daily_vaccinations                   14796 non-null  float64       
 8   total_vaccinations_per_hundred       9011 non-null   float64       
 9   people_vaccinated_per_hun

In [15]:
# Challenge 2 - encode daily vaccinations

import pandas as pd

def encode_daily(df):
    # Ensure the 'daily_vaccinations_per_million' column exists
    if 'daily_vaccinations_per_million' not in df.columns:
        raise ValueError("DataFrame must contain a 'daily_vaccinations_per_million' column")
    
    # Compute the median of the 'daily_vaccinations_per_million' column
    median_value = df['daily_vaccinations_per_million'].median()
    
    # Create a new column in a copy of the dataframe to avoid SettingWithCopyWarning
    df = df.copy()  # Ensure we're working with a copy, not a view
    df['daily_vaccinations_encoded'] = df['daily_vaccinations_per_million'].apply(lambda x: 1 if x >= median_value else 0)
    
    return df

# Example usage
# Read Covid vaccination data from the specified sheet in the Excel file
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"
df = pd.read_excel(url, sheet_name='by_country')

# Clean the dataframe as per previous tasks
df_cleaned_total_vaccinations = df.dropna(subset=['total_vaccinations'])
df_cleaned_daily_vaccinations_per_million = df_cleaned_total_vaccinations.dropna(subset=['daily_vaccinations_per_million'])

# Apply encoding function
df_encoded = encode_daily(df_cleaned_daily_vaccinations_per_million)

# Output the describe() for the encoded column
print("\nDescription of Encoded 'daily_vaccinations_per_million':")
print(df_encoded['daily_vaccinations_encoded'].describe())




Description of Encoded 'daily_vaccinations_per_million':
count    8816.000000
mean        0.500000
std         0.500028
min         0.000000
25%         0.000000
50%         0.500000
75%         1.000000
max         1.000000
Name: daily_vaccinations_encoded, dtype: float64


In [23]:
# Challenge 3 - Encoding total vaccinations
import pandas as pd

# Read Covid vaccination data from the specified sheet in the Excel file
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"
df = pd.read_excel(url, sheet_name='by_country')

# Clean the dataframe to remove rows with NaN values in 'total_vaccinations'
df_cleaned = df.dropna(subset=['total_vaccinations'])

# Task 1: Find the minimum total vaccinations for the United Kingdom
uk_min_vaccinations = df_cleaned[df_cleaned['country'] == 'United Kingdom']['total_vaccinations'].min()
print(f"Minimum total vaccinations for the United Kingdom: {uk_min_vaccinations}")

# Task 2: Write a function to encode total_vaccinations column
def encode_total_vaccinations(df, min_vaccinations):
    # Create a copy of the dataframe to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Use .loc to avoid SettingWithCopyWarning
    df.loc[:, 'total_vaccinations_encoded'] = df['total_vaccinations'].apply(lambda x: 1 if x >= min_vaccinations else 0)
    
    return df

# Apply the encoding function
df_encoded = encode_total_vaccinations(df_cleaned, uk_min_vaccinations)

# Task 3: Display unique countries where total vaccinations are at the same rate or more than the UK
countries_with_high_vaccinations = df_encoded[df_encoded['total_vaccinations_encoded'] == 1]['country'].unique()
print("\nCountries with total vaccinations at the same rate or more than the UK:")
print(countries_with_high_vaccinations)

# Display describe() for the 'total_vaccinations_encoded' column
print("\nDescribe of 'total_vaccinations_encoded':")
print(df_encoded['total_vaccinations_encoded'].describe())


Minimum total vaccinations for the United Kingdom: 1402432.0

Countries with total vaccinations at the same rate or more than the UK:
['Argentina' 'Australia' 'Austria' 'Azerbaijan' 'Bangladesh' 'Belgium'
 'Brazil' 'Cambodia' 'Canada' 'Chile' 'China' 'Colombia' 'Czechia'
 'Denmark' 'Dominican Republic' 'England' 'Finland' 'France' 'Germany'
 'Greece' 'Hong Kong' 'Hungary' 'India' 'Indonesia' 'Ireland' 'Israel'
 'Italy' 'Japan' 'Kazakhstan' 'Malaysia' 'Mexico' 'Morocco' 'Nepal'
 'Netherlands' 'Norway' 'Pakistan' 'Peru' 'Philippines' 'Poland'
 'Portugal' 'Qatar' 'Romania' 'Russia' 'Saudi Arabia' 'Scotland' 'Serbia'
 'Singapore' 'Slovakia' 'South Korea' 'Spain' 'Sweden' 'Switzerland'
 'Thailand' 'Turkey' 'United Arab Emirates' 'United Kingdom'
 'United States' 'Uruguay' 'Wales']

Describe of 'total_vaccinations_encoded':
count    9011.00000
mean        0.29808
std         0.45744
min         0.00000
25%         0.00000
50%         0.00000
75%         1.00000
max         1.00000
Name: tota

In [27]:

import pandas as pd

# URL of the Excel file
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"

# Read the 'by_manufacturer' sheet
df = pd.read_excel(url, sheet_name='by_manufacturer')

# Task 1: Find the sum of total vaccinations for each manufacturer
total_vaccinations_by_vaccine = df.groupby('vaccine')['total_vaccinations'].sum()
print(total_vaccinations_by_vaccine)

# Task 2: Calculate the overall sum of total vaccinations
overall_total_vaccinations = df['total_vaccinations'].sum()

# Task 3: Create a new column for percentage of total vaccinations
df['percentage_of_total'] = (df['total_vaccinations'] / overall_total_vaccinations) * 100

# Task 4: Find the median percentage
median_percentage = df['percentage_of_total'].median()
print(f"Median Percentage: {median_percentage}")

# Task 5: Create a new column 'encoded_percentages' which duplicates 'percentage_of_total'
df['encoded_percentages'] = df['percentage_of_total']

# Encode the 'encoded_percentages' column
df['encoded_percentages'] = df['encoded_percentages'].apply(lambda x: 1 if x >= median_percentage else 0)

# Display the updated DataFrame
print(df.head())


vaccine
Johnson&Johnson        264839828
Moderna               5548036383
Oxford/AstraZeneca     539433203
Pfizer/BioNTech       8690461304
Sinovac                604660293
Name: total_vaccinations, dtype: int64
Median Percentage: 0.0011110194374896931
  location       date          vaccine  total_vaccinations  \
0    Chile 2020-12-24  Pfizer/BioNTech                 420   
1    Chile 2020-12-25  Pfizer/BioNTech                5198   
2    Chile 2020-12-26  Pfizer/BioNTech                8338   
3    Chile 2020-12-27  Pfizer/BioNTech                8649   
4    Chile 2020-12-28  Pfizer/BioNTech                8649   

   percentage_of_total  encoded_percentages  
0             0.000003                    0  
1             0.000033                    0  
2             0.000053                    0  
3             0.000055                    0  
4             0.000055                    0  


In [45]:
# Challenge 4 - create new series of total vaccinations for each manufacturer

import pandas as pd

# URL of the Excel file
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"

# Read the 'by_manufacturer' sheet
df = pd.read_excel(url, sheet_name='by_manufacturer')

# Task 1: Find the sum of total vaccinations for each manufacturer
total_vaccinations_by_vaccine = df.groupby('vaccine')['total_vaccinations'].sum()
print(total_vaccinations_by_vaccine)

# Task 2: Calculate the overall sum of total vaccinations
overall_total_vaccinations = df['total_vaccinations'].sum()

# Task 3: Create a new column for percentage of total vaccinations
df['percentage_of_total'] = (df['total_vaccinations'] / overall_total_vaccinations)

# Task 4: Find the median percentage
median_percentage = df['percentage_of_total'].median()
print(f"Median Percentage: {median_percentage}")

# Task 5: Create a new column 'encoded_percentages' which duplicates 'percentage_of_total'
df['encoded_percentages'] = df['percentage_of_total']

# Encode the 'encoded_percentages' column
df['encoded_percentages'] = df['encoded_percentages'].apply(lambda x: 1 if x >= median_percentage else 0)

# Display the updated DataFrame with first 10 rows for brevity
print(df.head(10))  # Displaying first 10 rows for brevity


vaccine
Johnson&Johnson        264839828
Moderna               5548036383
Oxford/AstraZeneca     539433203
Pfizer/BioNTech       8690461304
Sinovac                604660293
Name: total_vaccinations, dtype: int64
Median Percentage: 1.1110194374896931e-05
  location       date          vaccine  total_vaccinations  \
0    Chile 2020-12-24  Pfizer/BioNTech                 420   
1    Chile 2020-12-25  Pfizer/BioNTech                5198   
2    Chile 2020-12-26  Pfizer/BioNTech                8338   
3    Chile 2020-12-27  Pfizer/BioNTech                8649   
4    Chile 2020-12-28  Pfizer/BioNTech                8649   
5    Chile 2020-12-29  Pfizer/BioNTech                8649   
6    Chile 2020-12-30  Pfizer/BioNTech                8649   
7    Chile 2020-12-31  Pfizer/BioNTech                8649   
8    Chile 2021-01-01  Pfizer/BioNTech                8649   
9    Chile 2021-01-02  Pfizer/BioNTech                8649   

   percentage_of_total  encoded_percentages  
0         2.68414

In [51]:
# Exercise 8 - encode daily vaccinations

import pandas as pd

# Step 1: Load the dataset from the provided URL
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"
xls = pd.ExcelFile(url)

# Step 2: Check the available sheet names and use the correct one
print("Available sheet names:", xls.sheet_names)

# Assuming 'Sheet1' contains the data we need
df = pd.read_excel(xls, sheet_name=xls.sheet_names[0])  # Use the first sheet

# Step 3: Calculate the median of the 'daily_vaccinations_per_million' column
median_daily_vaccinations_per_million = df['daily_vaccinations_per_million'].median()
print("Median daily vaccinations per million:", median_daily_vaccinations_per_million)

# Step 4: Define a function to encode daily vaccinations per 1 million
def encode_vaccinations(value, median):
    return 1 if value >= median else 0

# Step 5: Apply the function to the 'daily_vaccinations_per_million' column
encoded_vaccinations = df['daily_vaccinations_per_million'].apply(encode_vaccinations, median=median_daily_vaccinations_per_million)

# Step 6: Display the encoded results
print(encoded_vaccinations)




Available sheet names: ['by_country', 'by_manufacturer', 'Sheet1']
Median daily vaccinations per million: 1475.0
0        0
1        0
2        0
3        0
4        0
        ..
14989    0
14990    0
14991    0
14992    0
14993    0
Name: daily_vaccinations_per_million, Length: 14994, dtype: int64


In [61]:
# Exercise 9 - Encoding vaccinations per hundred
import pandas as pd
import numpy as np

# Step 1: Load the dataset from the provided URL
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"
xls = pd.ExcelFile(url)

# Load the data from the first sheet
df = pd.read_excel(xls, sheet_name=xls.sheet_names[0])

# Step 2: Print the column names to identify the correct ones
print("Column Names:", df.columns)

# Step 3: Adjust these variables based on the actual column names from the print output
location_column = 'country'  # Replace with the actual name of the location column
vaccinations_column = 'total_vaccinations'  # Replace with the actual name of the total vaccinations column

# Step 4: Find the minimum total vaccinations for the United Kingdom
uk_min_vaccinations = df[df[location_column] == 'United Kingdom'][vaccinations_column].min()
uk_min_vaccinations_rounded = int(uk_min_vaccinations)  # Round down to an integer
print("Minimum total vaccinations for the UK:", uk_min_vaccinations)
print("Rounded minimum vaccinations:", uk_min_vaccinations_rounded)

# Step 5: Write a function to encode the total_vaccinations column
def encode_vaccinations(total_vaccinations, uk_min):
    return np.where(total_vaccinations >= uk_min, 1, 0)

df['encoded_vaccinations'] = encode_vaccinations(df[vaccinations_column], uk_min_vaccinations_rounded)

# Step 6: Display the encoded column
print(df['encoded_vaccinations'])

# Step 7: Display the countries where total vaccinations are at the same rate or more than the UK
countries_with_high_vaccinations = df[df['encoded_vaccinations'] == 1][location_column].unique()
print("Countries with vaccinations at the same rate or higher than the UK:")
print(countries_with_high_vaccinations)



Column Names: Index(['country', 'iso_code', 'date', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred', 'daily_vaccinations_per_million',
       'vaccines', 'source_name', 'source_website'],
      dtype='object')
Minimum total vaccinations for the UK: 1402432.0
Rounded minimum vaccinations: 1402432
0        0
1        0
2        0
3        0
4        0
        ..
14989    0
14990    0
14991    0
14992    0
14993    0
Name: encoded_vaccinations, Length: 14994, dtype: int32
Countries with vaccinations at the same rate or higher than the UK:
['Argentina' 'Australia' 'Austria' 'Azerbaijan' 'Bangladesh' 'Belgium'
 'Brazil' 'Cambodia' 'Canada' 'Chile' 'China' 'Colombia' 'Czechia'
 'Denmark' 'Dominican Republic' 'England' 'Finland' 'France' 'Germany'
 'Greece' 'Hong Kong' 'Hungary' 'India' 'Indon

In [63]:
# Exercise 10 - create new series of total vaccinations percentages

import pandas as pd
import numpy as np

# Step 1: Load the dataset from the provided URL
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"
xls = pd.ExcelFile(url)

# Load the data from the 'by_manufacturer' sheet
df_by_manufacturer = pd.read_excel(xls, sheet_name='by_manufacturer')

# Display the first few rows and columns to understand its structure
print("Columns in by_manufacturer sheet:", df_by_manufacturer.columns)
print(df_by_manufacturer.head())

# Step 2: Find the sum of total vaccinations for each manufacturer
total_vaccinations_by_manufacturer = df_by_manufacturer.groupby('vaccine')['total_vaccinations'].sum()

# Display the sum of total vaccinations by manufacturer
print("Total vaccinations by manufacturer:")
print(total_vaccinations_by_manufacturer)

# Step 3: Calculate the percentage of total vaccinations
overall_total_vaccinations = total_vaccinations_by_manufacturer.sum()
df_by_manufacturer['percentages'] = (df_by_manufacturer['total_vaccinations'] / overall_total_vaccinations) * 100

# Display the first few rows to check percentages
print("Data with percentages column:")
print(df_by_manufacturer.head())

# Step 4: Find the median of the percentages column
median_percentage = df_by_manufacturer['percentages'].median()
print("Median percentage:", median_percentage)

# Step 5: Create 'encoded_percentages' column
df_by_manufacturer['encoded_percentages'] = df_by_manufacturer['percentages']

# Step 6: Encode 'encoded_percentages' column
df_by_manufacturer['encoded'] = np.where(df_by_manufacturer['encoded_percentages'] >= median_percentage, 1, 0)

# Display the updated DataFrame
print("Data with encoded percentages:")
print(df_by_manufacturer.head())


Columns in by_manufacturer sheet: Index(['location', 'date', 'vaccine', 'total_vaccinations'], dtype='object')
  location       date          vaccine  total_vaccinations
0    Chile 2020-12-24  Pfizer/BioNTech                 420
1    Chile 2020-12-25  Pfizer/BioNTech                5198
2    Chile 2020-12-26  Pfizer/BioNTech                8338
3    Chile 2020-12-27  Pfizer/BioNTech                8649
4    Chile 2020-12-28  Pfizer/BioNTech                8649
Total vaccinations by manufacturer:
vaccine
Johnson&Johnson        264839828
Moderna               5548036383
Oxford/AstraZeneca     539433203
Pfizer/BioNTech       8690461304
Sinovac                604660293
Name: total_vaccinations, dtype: int64
Data with percentages column:
  location       date          vaccine  total_vaccinations  percentages
0    Chile 2020-12-24  Pfizer/BioNTech                 420     0.000003
1    Chile 2020-12-25  Pfizer/BioNTech                5198     0.000033
2    Chile 2020-12-26  Pfizer/BioNTech   