In [8]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# GitHub raw URL base
github_url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
raw_base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"

# Create a directory to store downloaded files
os.makedirs("covid_data", exist_ok=True)

# Get the list of CSV files from the GitHub directory
response = requests.get(github_url)
soup = BeautifulSoup(response.text, 'html.parser')

csv_files = [
    link.text for link in soup.find_all('a', href=True)
    if link.text.endswith(".csv")
]

# Download each CSV file
for csv_file in csv_files:
    file_url = raw_base_url + csv_file
    file_path = os.path.join("covid_data", csv_file)
    
    print(f"Downloading {csv_file}...")
    
    response = requests.get(file_url)
    if response.status_code == 200:
        with open(file_path, "wb") as f:
            f.write(response.content)
    else:
        print(f"Failed to download {csv_file}")

print("All files downloaded.")

# Load all CSV files into a single DataFrame
all_cases = []
for csv_file in csv_files:
    file_path = os.path.join("covid_data", csv_file)
    df = pd.read_csv(file_path, sep=",")
    df["Date"] = csv_file.replace(".csv", "")  # Extract date from filename
    all_cases.append(df)

# Concatenate all DataFrames
cases = pd.concat(all_cases, ignore_index=True)

# Save merged dataset
cases.to_csv("covid_data/merged_cases.csv", index=False)
print("Merged dataset saved as covid_data/merged_cases.csv")


Downloading 01-01-2021.csv...
Downloading 01-01-2021.csv...
Downloading 01-01-2022.csv...
Downloading 01-01-2022.csv...
Downloading 01-01-2023.csv...
Downloading 01-01-2023.csv...
Downloading 01-02-2021.csv...
Downloading 01-02-2021.csv...
Downloading 01-02-2022.csv...
Downloading 01-02-2022.csv...
Downloading 01-02-2023.csv...
Downloading 01-02-2023.csv...
Downloading 01-03-2021.csv...
Downloading 01-03-2021.csv...
Downloading 01-03-2022.csv...
Downloading 01-03-2022.csv...
Downloading 01-03-2023.csv...
Downloading 01-03-2023.csv...
Downloading 01-04-2021.csv...
Downloading 01-04-2021.csv...
Downloading 01-04-2022.csv...
Downloading 01-04-2022.csv...
Downloading 01-04-2023.csv...
Downloading 01-04-2023.csv...
Downloading 01-05-2021.csv...
Downloading 01-05-2021.csv...
Downloading 01-05-2022.csv...
Downloading 01-05-2022.csv...
Downloading 01-05-2023.csv...
Downloading 01-05-2023.csv...
Downloading 01-06-2021.csv...
Downloading 01-06-2021.csv...
Downloading 01-06-2022.csv...
Downloadin