In [None]:
import pandas as pd
# Location of the COVID dataset CSV; it is read over the network into `df`.
url = "https://raw.githubusercontent.com/SR1608/Datasets/main/coviddata.csv"
df = pd.read_csv(filepath_or_buffer=url)


In [None]:
# Dataset dimensions: df.shape is a (rows, columns) tuple.
num_rows = df.shape[0]
num_columns = df.shape[1]
print(
    f"Number of rows: {num_rows}",
    f"Number of columns: {num_columns}",
    sep="\n",
)


In [None]:
# dtype of every column in df, printed under a heading line.
column_data_types = df.dtypes
print("Data types of columns:", column_data_types, sep="\n")


In [None]:
# Information about the DataFrame
# DataFrame.info() prints its summary itself and returns None, so there is
# nothing worth binding to a name; it is simply called for its output.
df.info()

# describe() only builds the summary-statistics table. It has to be shown
# explicitly, otherwise the cell computes it and silently discards it.
df_description = df.describe()
print(df_description)


In [None]:
# Number of distinct values in the 'location' column (nunique skips missing
# entries by default).
location_column = df['location']
unique_location_count = location_column.nunique()
print(f"Count of unique values in 'location' column: {unique_location_count}")


In [None]:
# Tally the rows per continent once, then read off the most frequent
# continent and how many rows it has.
continent_counts = df['continent'].value_counts()
continent_with_max_frequency = continent_counts.idxmax()
max_frequency = continent_counts.max()
print(f"Continent with maximum frequency: {continent_with_max_frequency} ({max_frequency} occurrences)")


In [None]:
# Largest and average value found in the 'total_cases' column.
total_cases = df['total_cases']
max_total_cases, mean_total_cases = total_cases.max(), total_cases.mean()
print(
    f"Maximum total cases: {max_total_cases}",
    f"Mean total cases: {mean_total_cases}",
    sep="\n",
)


In [None]:
# 25th, 50th and 75th percentiles of the 'total_deaths' column.
quartile_levels = [0.25, 0.5, 0.75]
quartiles = df['total_deaths'].quantile(quartile_levels)
print("Quartile values for 'total_deaths':", quartiles, sep="\n")


In [None]:
# Highest human development index in the data, and the continent of the
# first row that attains it.
max_hdi = df['human_development_index'].max()
rows_at_max_hdi = df['human_development_index'] == max_hdi
continent_with_max_hdi = df.loc[rows_at_max_hdi, 'continent'].iloc[0]
print(f"Continent with maximum human development index: {continent_with_max_hdi} ({max_hdi})")


In [None]:
# Lowest GDP per capita in the data, and the continent of the first row
# that attains it.
min_gdp = df['gdp_per_capita'].min()
rows_at_min_gdp = df['gdp_per_capita'] == min_gdp
continent_with_min_gdp = df.loc[rows_at_min_gdp, 'continent'].iloc[0]
print(f"Continent with minimum GDP per capita: {continent_with_min_gdp} ({min_gdp})")


In [None]:
# Columns retained for the rest of the analysis.
columns_to_keep = [
    'continent',
    'location',
    'date',
    'total_cases',
    'total_deaths',
    'gdp_per_capita',
    'human_development_index',
]

# Narrow df down to just those columns, in the order listed above.
df = df.loc[:, columns_to_keep]


In [None]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence.
df = df.drop_duplicates(keep='first')


In [None]:
# Count the missing entries in every column and print the tally.
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")


In [None]:
# Keep only the rows whose 'continent' value is present.
has_continent = df['continent'].notna()
df = df[has_continent]


In [None]:
# Fill missing values with 0 for numeric columns
numeric_columns = ['total_cases', 'total_deaths', 'gdp_per_capita', 'human_development_index']

# df at this point is the result of the column/row filtering done in the
# cells above. Assigning into such a frame (df[cols] = ...) can raise
# SettingWithCopyWarning, so build a new frame instead: fillna with a
# {column: value} mapping fills only the listed columns and leaves the
# others untouched.
df = df.fillna(dict.fromkeys(numeric_columns, 0))


In [None]:
# Parse the 'date' column into pandas datetime values.
# NOTE(review): no explicit format/dayfirst is passed, so parsing relies on
# pandas' inference — confirm it matches the date layout used in the CSV.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates


In [None]:
# Add a 'month' column holding the month number taken from each 'date'.
month_of_date = df['date'].dt.month
df['month'] = month_of_date


In [None]:
# Per-continent maximum of every other column; reset_index() turns
# 'continent' back into a regular column.
per_continent = df.groupby('continent')
df_groupby = per_continent.max().reset_index()


In [None]:
# Create the new feature 'total_deaths_to_total_cases'
# Missing counts were filled with 0 in an earlier cell, so 'total_cases' can
# be 0. Dividing by it would yield inf (deaths > 0) or NaN (0 / 0). Masking
# zero denominators to NaN first makes the ratio consistently NaN wherever
# it is undefined, instead of leaking inf into later statistics and plots.
nonzero_total_cases = df['total_cases'].where(df['total_cases'] != 0)
df['total_deaths_to_total_cases'] = df['total_deaths'] / nonzero_total_cases
# NOTE(review): df_groupby was built before this column existed, so it does
# not carry the ratio — confirm that is intended.


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram of GDP per capita.
# sns.distplot is deprecated in seaborn; histplot is its replacement for a
# plain histogram and plots counts by default, which matches the old
# distplot(..., kde=False) output.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='gdp_per_capita', bins=30, ax=ax)
ax.set_title('Histogram of GDP per Capita')
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Scatter plot of total cases against GDP per capita, drawn on an
# explicitly created 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='gdp_per_capita', y='total_cases', ax=ax)
ax.set_title('Scatter Plot of Total Cases vs GDP per Capita')
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Total Cases')
plt.show()


In [None]:
# Bar plot of the per-continent 'total_cases' values in df_groupby.
# catplot is a figure-level function: it always creates its own figure,
# sized here by height=6 and aspect=2 (12 x 6 inches). A preceding
# plt.figure() call would therefore only emit an extra, empty figure, so
# none is created. The pyplot calls below act on the axes catplot drew.
sns.catplot(x='continent', y='total_cases', data=df_groupby, kind='bar', height=6, aspect=2)
plt.title('Bar Plot of Total Cases by Continent')
plt.xlabel('Continent')
plt.ylabel('Total Cases')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Write df_groupby to 'df_groupby.csv' in the working directory, leaving
# out the index column.
df_groupby.to_csv(path_or_buf='df_groupby.csv', index=False)
