<a href="https://colab.research.google.com/github/CooperJB710/DS2002-Data-Project-1/blob/main/Data_Project_ETL_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell command: installs the MySQL driver package that the import
# cell below loads as `mysql.connector`.
!pip install mysql-connector-python

In [None]:
import sys
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mysql.connector

In [None]:
# Endpoint requested by extract_cdc_vaccination_data(). It is blank here and
# has to be filled in with the CDC API URL before the pipeline is run
# (NOTE(review): an empty string presumably makes the request fail — confirm).
url = ""

In [None]:
def extract_cdc_vaccination_data():
    """Fetch CDC COVID-19 vaccination data as JSON and return a DataFrame.

    Sends a GET request to the module-level ``url`` with a 30-second timeout
    and decodes the response body as JSON.

    Returns:
        pandas.DataFrame: built directly from the decoded JSON payload.

    Raises:
        SystemExit: with status 1 when the request fails, the server answers
            with an HTTP error status, or the body is not valid JSON.
    """
    # Both are empty for now; presumably placeholders for API filters and
    # auth headers -- confirm before relying on them.
    querystring = {}
    headers = {}
    try:
        response = requests.request(
            "GET", url, headers=headers, params=querystring, timeout=30
        )
        response.raise_for_status()
        data_json = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers requests versions whose Response.json() raises a
        # plain JSON decode error instead of a RequestException subclass.
        print("ERROR: Could not retrieve CDC data. Detail:", e)
        sys.exit(1)
    return pd.DataFrame(data_json)


In [None]:
def load_census_population_data(csv_path):
    """Read the U.S. Census population CSV at ``csv_path`` into a DataFrame.

    Args:
        csv_path: location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: the parsed file contents.

    Raises:
        SystemExit: with status 1 after printing a message when the file is
            missing, cannot be parsed, or any other error occurs while
            reading it.
    """
    try:
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"ERROR: File not found -> {csv_path}")
    except pd.errors.ParserError as parse_err:
        print("ERROR: Parse error in CSV. Detail:", parse_err)
    except Exception as unexpected_err:
        print("ERROR: Unknown error reading CSV. Detail:", unexpected_err)
    # Only reachable after one of the handlers above has reported a failure.
    sys.exit(1)

In [None]:
def _require_columns(df, required, label):
    """Raise a descriptive KeyError if ``df`` lacks any of ``required``."""
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(
            f"{label} data is missing required column(s): {missing}; "
            f"available columns: {list(df.columns)}"
        )


def transform_data(df_cdc, df_census, *, census_state_col="STATE_ABBR_COL",
                   census_pop_col="POP_COL"):
    """Clean, aggregate, and merge CDC and Census data at state level.

    The CDC frame is summed per state, the Census frame is reduced to rows
    with a usable population and state code, and the two are inner-joined on
    the state abbreviation. Neither input frame is modified.

    Args:
        df_cdc: CDC records with ``recip_state`` and ``series_complete_yes``
            columns.
        df_census: Census records holding a state abbreviation column and a
            population column.
        census_state_col: name of the state abbreviation column in
            ``df_census``.
        census_pop_col: name of the population column in ``df_census``.

    Returns:
        pandas.DataFrame: one row per state present in both inputs, with
        ``StateAbbrev``, ``Total_Vaccinated``, the Census columns (including
        ``Population``) and ``Vaccination_Rate`` as a percentage.

    Raises:
        KeyError: if either input lacks the columns named above.
    """
    df_cdc = df_cdc.rename(columns={
        "recip_state": "StateAbbrev",
        "series_complete_yes": "Series_Complete",
    })
    # rename() silently ignores absent columns, so check explicitly instead
    # of failing later with an unexplained KeyError.
    _require_columns(df_cdc, ("StateAbbrev", "Series_Complete"), "CDC")
    df_cdc["Series_Complete"] = pd.to_numeric(
        df_cdc["Series_Complete"], errors="coerce"
    )
    df_cdc_agg = df_cdc.groupby("StateAbbrev", as_index=False).agg(
        {"Series_Complete": "sum"}
    )
    df_cdc_agg = df_cdc_agg.rename(
        columns={"Series_Complete": "Total_Vaccinated"}
    )

    df_census = df_census.rename(columns={
        census_state_col: "StateAbbrev",
        census_pop_col: "Population",
    })
    _require_columns(df_census, ("StateAbbrev", "Population"), "Census")
    df_census["Population"] = pd.to_numeric(
        df_census["Population"], errors="coerce"
    )
    df_census = df_census.dropna(subset=["Population", "StateAbbrev"])

    df_merged = pd.merge(df_cdc_agg, df_census, on="StateAbbrev", how="inner")
    # A zero population would yield an infinite rate; mask it to NaN so the
    # rate is reported as missing rather than as +/-inf.
    population = df_merged["Population"]
    safe_population = population.where(population != 0)
    df_merged["Vaccination_Rate"] = (
        df_merged["Total_Vaccinated"] / safe_population * 100.0
    )
    return df_merged

In [None]:
def analyze_data(df_merged):
    """Print summary statistics and save a bar chart of the top ten rates.

    Prints the row and column counts, the first rows, and descriptive
    statistics for the three numeric columns, then plots the ten rows with
    the highest ``Vaccination_Rate`` and writes the figure to
    ``top_10_states_vax_rate.png`` in the working directory.

    Args:
        df_merged: frame with ``StateAbbrev``, ``Total_Vaccinated``,
            ``Population`` and ``Vaccination_Rate`` columns.
    """
    row_count, column_count = df_merged.shape
    print("Records:", row_count, "| Columns:", column_count)
    print(df_merged.head())

    numeric_columns = ["Total_Vaccinated", "Population", "Vaccination_Rate"]
    print(df_merged[numeric_columns].describe())

    # Highest rates first, keeping only the ten leading rows for the chart.
    ranked = df_merged.sort_values("Vaccination_Rate", ascending=False)
    top_ten = ranked.head(10)

    plt.figure(figsize=(10, 6))
    plt.bar(top_ten["StateAbbrev"], top_ten["Vaccination_Rate"])
    plt.title("Top 10 States by Vaccination Rate")
    plt.xlabel("State")
    plt.ylabel("Vaccination Rate (%)")
    plt.tight_layout()
    plt.savefig("top_10_states_vax_rate.png")
    print("Saved chart: top_10_states_vax_rate.png")

In [None]:
def _nan_to_none(value):
    """Return None for missing values so MySQL stores them as NULL."""
    return None if pd.isna(value) else value


def store_data_in_mysql(final_df):
    """Recreate the ``covid_vaccination_data`` table and insert ``final_df``.

    Any existing ``covid_vaccination_data`` table is dropped and recreated,
    then one row is inserted per DataFrame row. Missing numeric values are
    written as NULL. The cursor and connection are closed on every exit
    path, including failures.

    Args:
        final_df: frame with ``StateAbbrev``, ``Total_Vaccinated``,
            ``Population`` and ``Vaccination_Rate`` columns.

    Raises:
        SystemExit: with status 1 after printing a message when connecting,
            creating the table, or inserting the rows fails.
    """
    # NOTE(review): the connection settings below are placeholders and must
    # be replaced with real values; avoid committing a real password.
    connection = None
    try:
        connection = mysql.connector.connect(
            host="localhost",
            user="root",
            password="YOUR_PASSWORD_HERE",
            database="YOUR_DATABASE_HERE"
        )
        cursor = connection.cursor()
    except mysql.connector.Error as err:
        print("ERROR: MySQL connection failed. Detail:", err)
        # The connection may have opened before cursor() failed.
        if connection is not None:
            connection.close()
        sys.exit(1)

    create_tbl = """
        CREATE TABLE IF NOT EXISTS covid_vaccination_data (
            id INT AUTO_INCREMENT PRIMARY KEY,
            state_abbrev VARCHAR(10),
            total_vaccinated DOUBLE,
            population DOUBLE,
            vaccination_rate DOUBLE
        );
    """
    insert_q = """
        INSERT INTO covid_vaccination_data
        (state_abbrev, total_vaccinated, population, vaccination_rate)
        VALUES (%s, %s, %s, %s);
    """

    try:
        try:
            # Drop first so each run starts from an empty table.
            cursor.execute("DROP TABLE IF EXISTS covid_vaccination_data;")
            cursor.execute(create_tbl)
        except mysql.connector.Error as err:
            print("ERROR: Table creation failed. Detail:", err)
            sys.exit(1)

        records = [
            (
                row["StateAbbrev"],
                _nan_to_none(row["Total_Vaccinated"]),
                _nan_to_none(row["Population"]),
                _nan_to_none(row["Vaccination_Rate"]),
            )
            for _, row in final_df.iterrows()
        ]
        try:
            cursor.executemany(insert_q, records)
            connection.commit()
            print("Inserted:", cursor.rowcount, "rows.")
        except mysql.connector.Error as err:
            print("ERROR: Insertion failed. Detail:", err)
            sys.exit(1)
    finally:
        # Runs for normal completion, sys.exit(), and unexpected exceptions,
        # so neither handle is leaked.
        try:
            cursor.close()
        finally:
            connection.close()

In [None]:
def main():
    """Run the ETL pipeline: extract, load, transform, analyze, then store."""
    print("Extracting from CDC API...")
    vaccinations = extract_cdc_vaccination_data()

    print("Loading Census CSV...")
    # Replace with real path
    census = load_census_population_data("path/to/us_census_population_data.csv")

    print("Transforming & merging data...")
    combined = transform_data(vaccinations, census)

    # The merged frame feeds both the summary/chart and the database load.
    analyze_data(combined)

    print("Storing in MySQL...")
    store_data_in_mysql(combined)

    print("ETL complete.")


# Run the pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()