# COVID-19 Data Analysis Project

This notebook loads the COVID-19 confirmed cases dataset, prepares a clean time series, explores global and country trends, and generates visualizations.  
> Note: The assignment also refers to a Worldwide Happiness dataset. Add and merge it later if available (join on country names).

**Inputs:**
- `covid19_Confirmed_dataset.csv`

**Outputs:**
- `global_timeseries.csv`
- `top_countries_latest.csv`
- `india_timeseries.csv` (if India exists in the dataset)
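- Charts saved in `plots/`: `global_cumulative.png`, `global_daily_new_cases.png`, `top10_cumulative.png`, and (if India exists in the dataset) `india_cumulative.png` and `india_daily_new_cases.png`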


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Folder that receives every chart saved by this notebook; it is created up
# front (and left alone if it already exists) so the savefig calls can write to it.
plots_dir = "plots"
os.makedirs(name=plots_dir, exist_ok=True)

In [None]:
# Read the confirmed-cases CSV from the working directory into `df_raw`,
# then preview the first rows as the cell output.
csv_path = "covid19_Confirmed_dataset.csv"
df_raw = pd.read_csv(filepath_or_buffer=csv_path)
df_raw.head()

In [None]:
# Detect columns and reshape
#
# Works on a copy of `df_raw`: finds the country column, the optional province
# column and the per-date columns, melts the table to one row per
# (country[, province], date), and finally sums it to one row per
# (country, date) in `df_country`.
df = df_raw.copy()

# Country column: the first known header that is present in the data.
country_cols_candidates = ["Country/Region", "Country_Region", "Country", "location", "Location"]
country_col = next((c for c in country_cols_candidates if c in df.columns), None)
if country_col is None:
    # No known header: fall back to the first object-dtype column that is not a
    # coordinate column; if there is none, add a constant "Unknown" country.
    non_date = [c for c in df.columns if c.lower() not in ["lat", "long", "longitude", "latitude"]]
    candidates = [c for c in non_date if df[c].dtype == object]
    if candidates:
        country_col = candidates[0]
    else:
        country_col = "Country"
        df[country_col] = "Unknown"

# Province column is optional; `prov_col` stays None when no known header exists.
prov_candidates = ["Province/State", "Province_State", "State/Province", "State"]
prov_col = next((c for c in prov_candidates if c in df.columns), None)

# Columns that can never be date columns (identifier and coordinate columns).
skip_cols = {country_col, prov_col, "Lat", "Long", "Latitude", "Longitude", None}

# A column counts as a date column when its label parses as a datetime.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, has no effect there, and only produces a warning per label.
date_cols = []
for c in df.columns:
    if c in skip_cols:
        continue
    try:
        pd.to_datetime(c, errors="raise")
    except (ValueError, TypeError, OverflowError):
        # Label is not a date; this is the expected outcome of the probe.
        continue
    date_cols.append(c)

if not date_cols:
    # No label parsed as a date: fall back to columns whose values are more
    # than 95% numeric.
    # NOTE(review): rows from these columns are dropped below unless their
    # labels can still be parsed as dates by the column-wise conversion.
    date_cols = [
        c
        for c in df.columns
        if c not in skip_cols
        and pd.to_numeric(df[c], errors="coerce").notna().mean() > 0.95
    ]

id_vars = [country_col] + ([prov_col] if prov_col else [])
df_long = df.melt(id_vars=id_vars, value_vars=date_cols, var_name="Date", value_name="Confirmed")
# Labels that do not parse become NaT and their rows are removed;
# counts that are not numeric are treated as 0.
df_long["Date"] = pd.to_datetime(df_long["Date"], errors="coerce")
df_long = df_long.dropna(subset=["Date"])
df_long["Confirmed"] = pd.to_numeric(df_long["Confirmed"], errors="coerce").fillna(0)

# Collapse provinces: one summed row per (country, date), ordered by date then country.
df_country = (
    df_long.groupby([country_col, "Date"], as_index=False)["Confirmed"]
    .sum()
    .sort_values(["Date", country_col])
)

df_country.head()

In [None]:
# Global totals and daily new cases
# Sum all countries per date to get the worldwide cumulative series.
global_ts = df_country.groupby("Date", as_index=False)["Confirmed"].sum()
global_ts = global_ts.rename(columns={"Confirmed": "GlobalConfirmed"}).sort_values("Date")

# Day-over-day difference of the cumulative total (the first row, which has
# no predecessor, is set to 0) and its 7-row rolling mean.
global_ts["NewCases"] = global_ts["GlobalConfirmed"].diff().fillna(0)
global_ts["MA7_NewCases"] = global_ts["NewCases"].rolling(window=7, min_periods=1).mean()
global_ts.to_csv("global_timeseries.csv", index=False)

# Per-country totals on the most recent date, largest first.
latest_date = df_country["Date"].max()
country_key = df_country.columns[0]
snapshot = df_country.loc[df_country["Date"] == latest_date]
snapshot = snapshot.sort_values("Confirmed", ascending=False).reset_index(drop=True)
latest_by_country = snapshot.rename(columns={country_key: "Country", "Confirmed": "TotalConfirmed"})
latest_by_country.to_csv("top_countries_latest.csv", index=False)

# Cell output: the latest date together with the ten largest totals.
latest_date, latest_by_country.head(10)

In [None]:
# Plots (one per figure, default colors)
# Each chart is drawn on its own figure, written to `plots_dir`, then closed.

# 1) Worldwide cumulative total.
fig = plt.figure()
ax = fig.gca()
ax.plot(global_ts["Date"], global_ts["GlobalConfirmed"])
ax.set_title("Global Cumulative Confirmed Cases")
ax.set_xlabel("Date")
ax.set_ylabel("Total Confirmed")
fig.savefig(os.path.join(plots_dir, "global_cumulative.png"), bbox_inches="tight")
plt.close(fig)

# 2) Worldwide daily new cases with the 7-day moving average on top.
fig = plt.figure()
ax = fig.gca()
ax.plot(global_ts["Date"], global_ts["NewCases"], label="Daily New Cases")
ax.plot(global_ts["Date"], global_ts["MA7_NewCases"], label="7-day MA")
ax.set_title("Global Daily New Cases")
ax.set_xlabel("Date")
ax.set_ylabel("New Cases")
ax.legend()
fig.savefig(os.path.join(plots_dir, "global_daily_new_cases.png"), bbox_inches="tight")
plt.close(fig)

# 3) Cumulative curves of the ten countries with the largest latest totals.
country_key = df_country.columns[0]
top10 = latest_by_country.head(10)["Country"].tolist()
df_top10 = df_country[df_country[country_key].isin(top10)].copy()
# Wide layout: one row per date, one column per country.
pivot_top10 = df_top10.pivot(index="Date", columns=country_key, values="Confirmed").sort_index()

fig = plt.figure()
ax = fig.gca()
for name, series in pivot_top10.items():
    ax.plot(pivot_top10.index, series, label=name)
ax.set_title("Top 10 Countries: Cumulative Confirmed")
ax.set_xlabel("Date")
ax.set_ylabel("Total Confirmed")
ax.legend()
fig.savefig(os.path.join(plots_dir, "top10_cumulative.png"), bbox_inches="tight")
plt.close(fig)

In [None]:
# India focus (if available)
# Nothing is written when the country is absent from `df_country`.
country_focus = "India"
country_names = df_country[df_country.columns[0]]
if country_focus in country_names.unique():
    # Date-ordered cumulative series for the selected country only.
    india_rows = df_country.loc[country_names == country_focus].sort_values("Date")
    india_ts = india_rows[["Date", "Confirmed"]].rename(columns={"Confirmed": "IndiaConfirmed"})
    india_ts = india_ts.reset_index(drop=True)

    # Day-over-day difference (first row set to 0) and its 7-row rolling mean.
    india_ts["NewCases"] = india_ts["IndiaConfirmed"].diff().fillna(0)
    india_ts["MA7_NewCases"] = india_ts["NewCases"].rolling(window=7, min_periods=1).mean()
    india_ts.to_csv("india_timeseries.csv", index=False)

    # Chart 1: cumulative confirmed cases.
    fig = plt.figure()
    ax = fig.gca()
    ax.plot(india_ts["Date"], india_ts["IndiaConfirmed"])
    ax.set_title("India: Cumulative Confirmed Cases")
    ax.set_xlabel("Date")
    ax.set_ylabel("Total Confirmed")
    fig.savefig(os.path.join(plots_dir, "india_cumulative.png"), bbox_inches="tight")
    plt.close(fig)

    # Chart 2: daily new cases with the 7-day moving average on top.
    fig = plt.figure()
    ax = fig.gca()
    ax.plot(india_ts["Date"], india_ts["NewCases"], label="Daily New Cases")
    ax.plot(india_ts["Date"], india_ts["MA7_NewCases"], label="7-day MA")
    ax.set_title("India: Daily New Cases")
    ax.set_xlabel("Date")
    ax.set_ylabel("New Cases")
    ax.legend()
    fig.savefig(os.path.join(plots_dir, "india_daily_new_cases.png"), bbox_inches="tight")
    plt.close(fig)