In [2]:
# Challenge 1 - prepare dataset for normalisation

import pandas as pd

# Workbook location: the Covid vaccination spreadsheet at a fixed commit on GitHub
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"

# Only the per-country worksheet is needed here
migration_df = pd.read_excel(io=url, sheet_name="by_country")

# .info() prints a non-null count per column, which exposes the columns with gaps
migration_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14994 entries, 0 to 14993
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   country                              14994 non-null  object        
 1   iso_code                             14994 non-null  object        
 2   date                                 14994 non-null  datetime64[ns]
 3   total_vaccinations                   9011 non-null   float64       
 4   people_vaccinated                    8370 non-null   float64       
 5   people_fully_vaccinated              6158 non-null   float64       
 6   daily_vaccinations_raw               7575 non-null   float64       
 7   daily_vaccinations                   14796 non-null  float64       
 8   total_vaccinations_per_hundred       9011 non-null   float64       
 9   people_vaccinated_per_hundred        8370 non-null   float64       
 10  people_ful

In [4]:
# Challenge 1 - Clean the data
import pandas as pd

# Summarise the vaccination data after discarding rows that have no total
def clean_data(df):
    """Drop rows lacking ``total_vaccinations`` and compute summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``country``, ``total_vaccinations``,
        ``total_vaccinations_per_hundred`` and ``people_vaccinated_per_hundred``.

    Returns
    -------
    tuple
        ``(shape, median, per_country_mean, value_range)`` where ``shape`` is the
        shape of the filtered frame, ``median`` is the median of
        ``total_vaccinations_per_hundred``, ``per_country_mean`` is a Series of the
        mean ``people_vaccinated_per_hundred`` per country sorted from highest to
        lowest, and ``value_range`` is max minus min of ``total_vaccinations``.
    """
    # Step 1: keep only the rows in which total_vaccinations is present
    with_totals = df.loc[df['total_vaccinations'].notna()]

    # Step 2: median of the per-hundred totals over the remaining rows
    per_hundred_median = with_totals['total_vaccinations_per_hundred'].median()

    # Step 3: average people_vaccinated_per_hundred per country, largest first
    per_country = with_totals.groupby('country')['people_vaccinated_per_hundred']
    country_means = per_country.mean().sort_values(ascending=False)

    # Step 4: spread between the largest and smallest total_vaccinations
    totals = with_totals['total_vaccinations']
    spread = totals.max() - totals.min()

    return with_totals.shape, per_hundred_median, country_means, spread

# Load the 'by_country' sheet again so this cell does not depend on an earlier one
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"
migration_df = pd.read_excel(io=url, sheet_name="by_country")

# Run the summary and unpack its four results
(
    cleaned_shape,
    median_total_vaccinations_per_hundred,
    mean_people_vaccinated_per_hundred,
    range_total_vaccinations,
) = clean_data(migration_df)

# Print one result per line; only the first five countries of the ranking are shown
print(
    cleaned_shape,
    median_total_vaccinations_per_hundred,
    mean_people_vaccinated_per_hundred.head(),
    range_total_vaccinations,
    sep="\n",
)


(9011, 15)
6.3
country
Gibraltar           64.975699
Bhutan              55.961892
Falkland Islands    51.063333
Saint Helena        44.880000
Seychelles          44.005686
Name: people_vaccinated_per_hundred, dtype: float64
275338000.0


In [8]:
# Challenge 1 - Creating new columns in a dataframe

import pandas as pd

# Workbook location: the Covid vaccination spreadsheet at a fixed commit on GitHub
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"

# Read just the per-country worksheet into a DataFrame
migration_df = pd.read_excel(io=url, sheet_name="by_country")

# Function to clean the data and perform the required operations
def clean_data(df):
    """Drop rows lacking ``total_vaccinations`` and compute summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``country``, ``total_vaccinations``,
        ``total_vaccinations_per_hundred`` and ``people_vaccinated_per_hundred``.
        It is not modified.

    Returns
    -------
    tuple
        ``(df_cleaned, cleaned_shape, median, per_country_mean, value_range)``:
        the filtered frame (an independent copy), its shape, the median of
        ``total_vaccinations_per_hundred``, a Series of the mean
        ``people_vaccinated_per_hundred`` per country sorted from highest to
        lowest, and max minus min of ``total_vaccinations``.
    """
    # 1. Remove all rows with missing data in the total_vaccinations column.
    #    .copy() detaches the result from `df`: the caller adds columns to the
    #    returned frame, and without the copy pandas may flag those assignments
    #    with SettingWithCopyWarning.
    df_cleaned = df.dropna(subset=['total_vaccinations']).copy()
    cleaned_shape = df_cleaned.shape

    # 2. Find the median total vaccinations per hundred
    median_total_vaccinations_per_hundred = df_cleaned['total_vaccinations_per_hundred'].median()

    # 3. Mean people vaccinated per hundred for each country, in descending order
    mean_people_vaccinated_per_hundred = (
        df_cleaned.groupby('country')['people_vaccinated_per_hundred']
        .mean()
        .sort_values(ascending=False)
    )

    # 4. Find the range of total_vaccinations across the dataframe
    total_vaccinations = df_cleaned['total_vaccinations']
    range_total_vaccinations = total_vaccinations.max() - total_vaccinations.min()

    return (
        df_cleaned,
        cleaned_shape,
        median_total_vaccinations_per_hundred,
        mean_people_vaccinated_per_hundred,
        range_total_vaccinations,
    )

# Swap the raw frame for its cleaned version and keep the summary statistics
(
    migration_df,
    cleaned_shape,
    median_total_vaccinations_per_hundred,
    mean_people_vaccinated_per_hundred,
    range_total_vaccinations,
) = clean_data(migration_df)

# 5. A second column holding the same values as 'total_vaccinations'
migration_df['total_vaccinations_copy'] = migration_df['total_vaccinations'].copy()

# 6. Row-wise sum of 'total_vaccinations' and 'people_vaccinated'
migration_df['vaccinations_plus_people_vaccinated'] = migration_df['total_vaccinations'].add(
    migration_df['people_vaccinated']
)

# 7. Each row's 'people_vaccinated' as a percentage of the column total
migration_df['people_vaccinated_percentage'] = (
    migration_df['people_vaccinated']
    .div(migration_df['people_vaccinated'].sum())
    .mul(100)
)

# 8. Create a normalized (min-max scaled) column for 'total_vaccinations'
def normalize_total_vaccinations(row):
    """Min-max scale ``total_vaccinations`` against the whole of ``migration_df``.

    Parameters
    ----------
    row : pandas.Series or pandas.DataFrame
        Either a single row (as supplied by ``DataFrame.apply(..., axis=1)``) or an
        entire frame; only its ``'total_vaccinations'`` entry is read.

    Returns
    -------
    float or pandas.Series
        ``(value - min) / (max - min)``, where min and max are taken from the
        module-level ``migration_df['total_vaccinations']``. A scalar is returned
        for a single row and a Series for a whole frame.

    Notes
    -----
    If every value in the column is identical, the denominator is zero and the
    result is NaN/inf rather than a number in [0, 1].
    """
    totals = migration_df['total_vaccinations']
    # Look the extremes up once per call instead of once per operand
    lowest, highest = totals.min(), totals.max()
    return (row['total_vaccinations'] - lowest) / (highest - lowest)

# One vectorised call on the whole frame: the column min and max are computed a
# single time, rather than being recomputed for every row via apply(axis=1).
migration_df['total_vaccinations_normalized'] = normalize_total_vaccinations(migration_df)

# 9. Keep only the rows that have a value in every column
migration_df = migration_df.dropna(how='any')

# Report the summary statistics returned by clean_data ...
print("Cleaned Data Shape:", cleaned_shape)
print("Median Total Vaccinations per Hundred:", median_total_vaccinations_per_hundred)
print("Mean People Vaccinated per Hundred (Top 5):")
print(mean_people_vaccinated_per_hundred.head())
print("Range of Total Vaccinations:", range_total_vaccinations)
# ... followed by a preview of the frame with the derived columns
print("\nDataFrame with New Columns and No NaN Rows:\n", migration_df.head())



Cleaned Data Shape: (9011, 15)
Median Total Vaccinations per Hundred: 6.3
Mean People Vaccinated per Hundred (Top 5):
country
Gibraltar           64.975699
Bhutan              55.961892
Falkland Islands    51.063333
Saint Helena        44.880000
Seychelles          44.005686
Name: people_vaccinated_per_hundred, dtype: float64
Range of Total Vaccinations: 275338000.0

DataFrame with New Columns and No NaN Rows:
        country iso_code       date  total_vaccinations  people_vaccinated  \
99     Albania      ALB 2021-02-18              3049.0             2438.0   
264    Andorra      AND 2021-04-05             15269.0             9781.0   
500  Argentina      ARG 2021-01-21            265724.0           249372.0   
501  Argentina      ARG 2021-01-22            279602.0           254456.0   
502  Argentina      ARG 2021-01-23            288064.0           258876.0   

     people_fully_vaccinated  daily_vaccinations_raw  daily_vaccinations  \
99                     611.0                  

In [14]:
# Challenge 2 - normalise daily vaccinations

import pandas as pd

# Workbook location: the Covid vaccination spreadsheet at a fixed commit on GitHub
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"

# Read just the per-country worksheet
migration_df = pd.read_excel(io=url, sheet_name="by_country")

# Keep the rows that have a total_vaccinations value, as an independent copy of the data
migration_df_cleaned = migration_df.loc[migration_df['total_vaccinations'].notna()].copy()

# Median of 'daily_vaccinations_per_million' over the cleaned rows
median_daily_vaccinations_per_million = migration_df_cleaned['daily_vaccinations_per_million'].median()

# Binary flag for a single 'daily_vaccinations_per_million' value
def normalize_daily_vaccinations_per_million(value, median):
    """Return 1 when ``value`` is at or above ``median``, otherwise 0.

    Missing values (anything ``pd.isna`` reports as missing) are handed back
    unchanged, so they stay missing in the result.
    """
    # Guard clause: leave missing entries untouched
    if pd.isna(value):
        return value
    return 1 if value >= median else 0

# Flag every row against the median; .loc[:, name] creates the new column on the cleaned frame.
# The median is forwarded to the helper through `args` instead of a wrapping lambda.
migration_df_cleaned.loc[:, 'daily_vaccinations_per_million_normalized'] = (
    migration_df_cleaned['daily_vaccinations_per_million'].apply(
        normalize_daily_vaccinations_per_million,
        args=(median_daily_vaccinations_per_million,),
    )
)

# Show the threshold that was used
print("Median Daily Vaccinations per Million:", median_daily_vaccinations_per_million)

# Summary statistics of the new 0/1 column
print(migration_df_cleaned['daily_vaccinations_per_million_normalized'].describe())



Median Daily Vaccinations per Million: 1915.5
count    8816.000000
mean        0.500000
std         0.500028
min         0.000000
25%         0.000000
50%         0.500000
75%         1.000000
max         1.000000
Name: daily_vaccinations_per_million_normalized, dtype: float64


In [20]:


import pandas as pd

# URL to the Excel file
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"

# Read the per-country sheet by name, as the other cells do, rather than relying on
# it happening to be the first sheet in the workbook
df = pd.read_excel(url, sheet_name='by_country')

# Remove rows with any NaN values
df = df.dropna()

# Verify the column names and first few rows to understand the structure
print(df.head())
print(df.columns)

# Find the minimum total vaccinations for the United Kingdom
uk_min_vaccinations = df.loc[df['country'] == 'United Kingdom', 'total_vaccinations'].min()

# .min() of an empty selection is NaN, which int() rejects with an opaque message;
# raise the same exception type with an explanation instead
if pd.isna(uk_min_vaccinations):
    raise ValueError(
        "No complete rows found for 'United Kingdom' after dropping NaN values; "
        "cannot determine its minimum total vaccinations"
    )

# Convert to an integer (int() truncates any fractional part)
uk_min_vaccinations = int(uk_min_vaccinations)

# Print the UK's minimum total vaccinations
print(uk_min_vaccinations)

# Binary flag for a single total_vaccinations value
def normalize_vaccinations(total_vaccinations, min_vaccinations):
    """Return 1 when ``total_vaccinations`` is at least ``min_vaccinations``, else 0."""
    if total_vaccinations >= min_vaccinations:
        return 1
    return 0

# Flag each row: 1 when its total_vaccinations reaches the UK minimum, otherwise 0.
# The threshold is forwarded to the helper through `args` instead of a wrapping lambda.
df['normalised_tv'] = df['total_vaccinations'].apply(
    normalize_vaccinations,
    args=(uk_min_vaccinations,),
)

# Preview the new 0/1 column
print(df['normalised_tv'].head())

# Distinct countries having at least one row at or above the UK's minimum total
countries_with_more_or_equal_vaccinations = df.loc[
    df['total_vaccinations'] >= uk_min_vaccinations, 'country'
].unique()

# Show those countries
print("Countries with total vaccinations >= UK's minimum:")
print(countries_with_more_or_equal_vaccinations)



       country iso_code       date  total_vaccinations  people_vaccinated  \
99     Albania      ALB 2021-02-18              3049.0             2438.0   
264    Andorra      AND 2021-04-05             15269.0             9781.0   
500  Argentina      ARG 2021-01-21            265724.0           249372.0   
501  Argentina      ARG 2021-01-22            279602.0           254456.0   
502  Argentina      ARG 2021-01-23            288064.0           258876.0   

     people_fully_vaccinated  daily_vaccinations_raw  daily_vaccinations  \
99                     611.0                  1348.0               254.0   
264                   4484.0                  2829.0               573.0   
500                  16352.0                 17791.0             11704.0   
501                  25146.0                 13878.0             11263.0   
502                  29188.0                  8462.0             11124.0   

     total_vaccinations_per_hundred  people_vaccinated_per_hundred  \
99        

In [24]:
# Challenge 4 - create new series of total vaccinations for each manufacturer
import pandas as pd

# URL to the Excel file
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"

# Read the specific sheet 'by_manufacturer' from the Excel file
df_by_manufacturer = pd.read_excel(url, sheet_name='by_manufacturer')

# Display the first few rows of the DataFrame
print(df_by_manufacturer.head())

# Display information about the DataFrame.
# .info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
df_by_manufacturer.info()

# Display the column names to understand the structure
print(df_by_manufacturer.columns)


  location       date          vaccine  total_vaccinations
0    Chile 2020-12-24  Pfizer/BioNTech                 420
1    Chile 2020-12-25  Pfizer/BioNTech                5198
2    Chile 2020-12-26  Pfizer/BioNTech                8338
3    Chile 2020-12-27  Pfizer/BioNTech                8649
4    Chile 2020-12-28  Pfizer/BioNTech                8649
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3296 entries, 0 to 3295
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   location            3296 non-null   object        
 1   date                3296 non-null   datetime64[ns]
 2   vaccine             3296 non-null   object        
 3   total_vaccinations  3296 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 103.1+ KB
None
Index(['location', 'date', 'vaccine', 'total_vaccinations'], dtype='object')


In [28]:
# Challenge 4 - Clean the data
import pandas as pd

# URL to the Excel file
url = "https://github.com/lilaceri/Working-with-data-/blob/342abab10d93c4bf23b5c55a50f189f12a137c5f/Data%20Sets%20for%20code%20divisio/Covid%20Vaccination%20Data.xlsx?raw=true"

# Read the 'by_manufacturer' sheet
df_by_manufacturer = pd.read_excel(url, sheet_name='by_manufacturer')

# Display the first few rows, the column summary and the column names.
# .info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
print(df_by_manufacturer.head())
df_by_manufacturer.info()
print(df_by_manufacturer.columns)

# The sheet names the manufacturer column 'vaccine'; relabel it as 'manufacturer'
# and discard any rows that contain NaN values
df_by_manufacturer = (
    df_by_manufacturer
    .rename(columns={'vaccine': 'manufacturer'})
    .dropna()
)

# Sum of the total_vaccinations values recorded for each manufacturer
manufacturer_totals = df_by_manufacturer.groupby('manufacturer')['total_vaccinations'].sum()

# Show the per-manufacturer sums
print(manufacturer_totals)

# Grand total across all manufacturers
overall_sum = manufacturer_totals.sum()

# One row per manufacturer, with its share of the grand total.
# 'percentages' holds a fraction between 0 and 1 (it is not multiplied by 100).
df_percentages = manufacturer_totals.reset_index()
df_percentages['percentages'] = df_percentages['total_vaccinations'].div(overall_sum)

# Median share, used as the threshold below
median_percentage = df_percentages['percentages'].median()

# 1 where the share is at or above the median share, otherwise 0
df_percentages['normalised_percentages'] = (
    df_percentages['percentages'].ge(median_percentage).astype('int64')
)

# Show the resulting table
print(df_percentages)

# Small hand-built example frame (ten daily Pfizer/BioNTech rows for Chile) with the
# columns location, date, vaccine and total_vaccinations
df_vaccination_data = pd.DataFrame({
    'location': ['Chile'] * 10,
    'date': pd.date_range(start='2020-12-24', periods=10),
    'vaccine': ['Pfizer/BioNTech'] * 10,
    'total_vaccinations': [420, 5198, 8338] + [8649] * 7,
})

# Attach each vaccine's share of the grand total via a left join on the manufacturer
# name, then drop the join key that the merge brought along
df_vaccination_data = (
    df_vaccination_data
    .merge(
        df_percentages[['manufacturer', 'percentages']],
        left_on='vaccine',
        right_on='manufacturer',
        how='left',
    )
    .drop(columns=['manufacturer'])
)

# 1 where the share is at or above the median share, otherwise 0
# (rows without a matching manufacturer have a NaN share and therefore get 0)
df_vaccination_data['normalised_percentages'] = (
    df_vaccination_data['percentages'].ge(median_percentage).astype('int64')
)

# Show the final example frame
print(df_vaccination_data)


  location       date          vaccine  total_vaccinations
0    Chile 2020-12-24  Pfizer/BioNTech                 420
1    Chile 2020-12-25  Pfizer/BioNTech                5198
2    Chile 2020-12-26  Pfizer/BioNTech                8338
3    Chile 2020-12-27  Pfizer/BioNTech                8649
4    Chile 2020-12-28  Pfizer/BioNTech                8649
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3296 entries, 0 to 3295
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   location            3296 non-null   object        
 1   date                3296 non-null   datetime64[ns]
 2   vaccine             3296 non-null   object        
 3   total_vaccinations  3296 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 103.1+ KB
None
Index(['location', 'date', 'vaccine', 'total_vaccinations'], dtype='object')
manufacturer
Johnson&Johnson        264839828
Moder