## PART A: ETL Operations

In [1]:
# Loading the libraries
import pandas as pd

In [2]:
# Load the datasets
gdp_df = pd.read_csv('gdp_1960_2020.csv')
covid_df = pd.read_csv('Country-wise-COVID-cases.csv')

# Display the first few rows of each dataset
print("GDP Dataset - First 5 Rows:")
print(gdp_df.head())

print("\nCOVID Dataset - First 5 Rows:")
print(covid_df.head())

GDP Dataset - First 5 Rows:
   year  rank            country    state           gdp  gdp_percent
0  1960     1  the United States  America  543300000000     0.468483
1  1960     2     United Kingdom   Europe   73233967692     0.063149
2  1960     3             France   Europe   62225478000     0.053656
3  1960     4              China     Asia   59716467625     0.051493
4  1960     5              Japan     Asia   44307342950     0.038206

COVID Dataset - First 5 Rows:
  Country Name  Total Infected  Total Deaths  Total Recovered  Death %  \
0          USA        47916190        783565       37918301.0      1.6   
1        India        34447536        463655       33849785.0      1.3   
2       Brazil        21957967        611318       21151342.0      2.8   
3           UK         9561099        142898        7825200.0      1.5   
4       Russia         9109094        256597        7812557.0      2.8   

   Recovered %  
0         79.1  
1         98.3  
2         96.3  
3         81.8

In [3]:
# Print the column names to diagnose the issue
print("GDP DataFrame Columns:", gdp_df.columns)
print("COVID DataFrame Columns:", covid_df.columns)


GDP DataFrame Columns: Index(['year', 'rank', 'country', 'state', 'gdp', 'gdp_percent'], dtype='object')
COVID DataFrame Columns: Index(['Country Name', 'Total Infected', 'Total Deaths', 'Total Recovered',
       'Death %', 'Recovered %'],
      dtype='object')


In [4]:
# Rename columns for consistency
gdp_df.rename(columns={'country': 'Country'}, inplace=True)
covid_df.rename(columns={'Country Name': 'Country'}, inplace=True)

In [5]:
gdp_df.isnull().sum()

year           0
rank           0
Country        0
state          0
gdp            0
gdp_percent    0
dtype: int64

covid_df.isnull().sum()

In [7]:
# Merge the datasets based on the 'Country' column
merged_df = pd.merge(covid_df, gdp_df[['Country', 'gdp']], on='Country', how='inner')

# Display the merged DataFrame
print("\nMerged Dataset - First 5 Rows:")
print(merged_df.head())


Merged Dataset - First 5 Rows:
  Country  Total Infected  Total Deaths  Total Recovered  Death %  \
0   India        34447536        463655       33849785.0      1.3   
1   India        34447536        463655       33849785.0      1.3   
2   India        34447536        463655       33849785.0      1.3   
3   India        34447536        463655       33849785.0      1.3   
4   India        34447536        463655       33849785.0      1.3   

   Recovered %          gdp  
0         98.3  37029883875  
1         98.3  39232435784  
2         98.3  42161481858  
3         98.3  48421923458  
4         98.3  56480289940  


## PART B: Big Data Analysis and Application of Engineering Techniques

In [8]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [9]:
# Import necessary PySpark libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.types import StringType