In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA

In [None]:
class Analysis:
    """Exploratory helpers and ARIMA forecasts for a single DataFrame.

    The frame is stored by reference (it is not copied), so columns added to
    it after construction are visible to every method.
    """

    def __init__(self, df):
        """Keep a reference to `df`, the frame every method operates on."""
        self.df = df

    def eda(self):
        """Print null counts, summary statistics, dtype info and the first rows."""
        print(self.df.isnull().sum())
        print(self.df.describe())
        self.df.info()
        print(self.df.head())

    def outlier(self, feature, k=1.5):
        """Print and return the rows whose `feature` falls outside the IQR fences.

        Args:
            feature: Name of the column to test.
            k: Multiplier applied to the inter-quartile range; a row is an
                outlier when its value is below ``q1 - k*iqr`` or above
                ``q3 + k*iqr``.

        Returns:
            DataFrame holding the outlier rows (empty when there are none).
        """
        values = self.df[feature]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        mask = (values < q1 - iqr * k) | (values > q3 + iqr * k)
        outliers = self.df[mask]
        print(outliers)
        # Returned as well as printed so callers can inspect or reuse the rows.
        return outliers

    def correlation(self):
        """Return the pairwise correlation matrix of the numeric columns."""
        return self.df.corr(numeric_only=True)

    def heatmap(self):
        """Draw the correlation matrix as a heatmap centred on zero."""
        plt.figure(figsize=(10, 8))
        sns.heatmap(self.correlation(), cmap="coolwarm", center=0)
        plt.show()

    @staticmethod
    def _fit_forecast(series, steps, order):
        """Fit an ARIMA model of the given order and forecast `steps` periods ahead."""
        fit = ARIMA(series, order=order).fit()
        return fit.forecast(steps=steps)

    def forecast_daily(self, date_col, feature, steps=30, order=(1, 1, 1)):
        """Forecast `feature` on a daily grid.

        Args:
            date_col: Column used as the time index.
            feature: Column to forecast.
            steps: Number of days to forecast.
            order: ARIMA ``(p, d, q)`` order.

        Returns:
            The forecast produced by the fitted model.
        """
        series = self.df.set_index(date_col)[feature].sort_index()
        # Conform to a daily frequency; dates absent from the data become NaN rows.
        series = series.asfreq("D")
        return self._fit_forecast(series, steps, order)

    def forecast_annual(self, year_col, feature, steps=10, order=(1, 1, 1)):
        """Forecast `feature` on a yearly grid.

        Args:
            year_col: Column holding the year of each row.
            feature: Column to forecast.
            steps: Number of years to forecast.
            order: ARIMA ``(p, d, q)`` order.

        Returns:
            The forecast produced by the fitted model.
        """
        series = self.df.set_index(year_col)[feature].sort_index()
        # Re-express the year values as yearly periods so the model has a frequency.
        series.index = pd.PeriodIndex(series.index.astype(int), freq="Y")
        return self._fit_forecast(series, steps, order)

    def forecast_monthly(self, year_col, month_col, feature, steps=12, order=(1, 1, 1)):
        """Forecast `feature` on a month-start grid.

        Args:
            year_col: Column holding the year of each row.
            month_col: Column holding the month of each row.
            feature: Column to forecast.
            steps: Number of months to forecast.
            order: ARIMA ``(p, d, q)`` order.

        Returns:
            The forecast produced by the fitted model.
        """
        # Every row is stamped with the first day of its month.
        dates = pd.to_datetime(
            dict(year=self.df[year_col], month=self.df[month_col], day=1)
        )
        # Copy only the forecast column rather than the whole frame.
        series = self.df[feature].copy()
        series.index = pd.DatetimeIndex(dates, name="date")
        series = series.sort_index().asfreq("MS")  # month start
        return self._fit_forecast(series, steps, order)

Notebook overview:
- Load CO2 datasets from Our World in Data and Mauna Loa.
- Preview each table to confirm column names.
- Plot monthly, annual, growth-rate, and daily trend series with labels.
- Add uncertainty bands where available.


In [None]:
# Emissions table from the "Our World In Data" folder.
df_owid = pd.read_csv(
    "../dataset/co2: Our World In Data/owid-co2-data.csv",
)

# Series stored in the "Mauna Loa" folder. `skiprows` drops the leading lines
# of each file so that parsing starts at the column-name row.
# NOTE(review): the counts are tied to the current downloads (presumably a
# comment preamble) - re-check them whenever the files are refreshed.
df_ml_monthly_mean = pd.read_csv(
    "../dataset/co2: Mauna Loa/co2_mm_gl.csv",
    skiprows=38,
)
df_ml_annual_mean = pd.read_csv(
    "../dataset/co2: Mauna Loa/co2_annmean_gl.csv",
    skiprows=37,
)
df_ml_annual_global_rate = pd.read_csv(
    "../dataset/co2: Mauna Loa/co2_gr_gl.csv",
    skiprows=43,
)
df_ml_daily_global_trend = pd.read_csv(
    "../dataset/co2: Mauna Loa/co2_trend_gl.csv",
    skiprows=36,
)

In [None]:
# Preview the first five rows to confirm the column names used further down.
df_owid.head(n=5)

In [None]:
# One figure per country: total CO2 plus every per-source series that has data.
# Each figure is saved as a PNG and shown inline.

# `savefig` does not create missing folders, so make sure the target exists.
country_plot_dir = Path("../output/countries_co2_emission_graph")
country_plot_dir.mkdir(parents=True, exist_ok=True)

# Rows without a year or a total cannot be plotted; sort so lines run left to right.
df_country = df_owid.dropna(subset=["year", "co2"]).sort_values("year")
source_cols = [
    "coal_co2",
    "oil_co2",
    "gas_co2",
    "cement_co2",
    "flaring_co2",
    "land_use_change_co2",
    "other_industry_co2",
]
luc_total_col = "co2_including_luc"

for country, g in df_country.groupby("country"):
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(g["year"], g["co2"], label="CO2 (Mt)")

    for col in source_cols:
        # Skip sources that are absent or entirely missing for this country.
        if col in g.columns and g[col].notna().any():
            label = col.replace("_co2", "").replace("_", " ").title() + " (Mt)"
            ax.plot(g["year"], g[col], label=label)

    # Same guard as the per-source series: an all-NaN column would only add
    # an empty legend entry.
    if luc_total_col in g.columns and g[luc_total_col].notna().any():
        ax.plot(g["year"], g[luc_total_col], label="co2 including land use change (Mt)")

    ax.set_title(f"CO2 Emissions - {country}")
    fig.suptitle("Gaps indicate missing data", y=0.98, fontsize=9)
    ax.set_xlabel("Year")
    ax.set_ylabel("CO2 (million tonnes)")
    ax.grid(alpha=0.3)
    ax.legend()
    fig.tight_layout()
    fig.savefig(country_plot_dir / f"{country}.png", dpi=150)
    plt.show()
    # Release the figure explicitly; this loop creates one per country.
    plt.close(fig)

Monthly mean CO2 (global marine surface):
- Use `average` as the main series.
- `average_unc` provides +/- uncertainty for a shaded band.


In [None]:
# `df1` is the Analysis helper wrapped around the monthly-mean table
# (it is not a DataFrame itself).
df1 = Analysis(df_ml_monthly_mean)
df1.eda()
# Only the last expression of a cell is rendered, so the correlation matrix
# has to be displayed explicitly or it is computed and thrown away.
# `display` is provided by the IPython kernel.
display(df1.correlation())
df1.outlier('average')
df1.heatmap()
# Last expression: the monthly forecast is rendered as the cell output.
df1.forecast_monthly('year', 'month', 'average')

In [None]:
# Monthly average and trend, each with its own +/- uncertainty band.
fig, ax = plt.subplots(figsize=(12, 8))
decimal_year = df_ml_monthly_mean["decimal"]

# (value column, uncertainty column, legend label, colour) for each series.
# A band shares its line's colour and carries a distinct label so the legend
# entries can be told apart.
monthly_series = [
    ("average", "average_unc", "Average", "tab:blue"),
    ("trend", "trend_unc", "Trend", "tab:orange"),
]
for value_col, unc_col, series_label, series_color in monthly_series:
    values = df_ml_monthly_mean[value_col]
    half_width = df_ml_monthly_mean[unc_col]
    ax.plot(decimal_year, values, color=series_color, label=series_label)
    ax.fill_between(
        decimal_year,
        values - half_width,
        values + half_width,
        color=series_color,
        alpha=0.25,
        label=f"{series_label} uncertainty",
    )

ax.set_title("Monthly Average")
ax.set_xlabel("Year")
ax.set_ylabel("CO2 (PPM)")
ax.grid(True)
ax.legend()
plt.show()

Annual mean CO2 (global marine surface):
- Use `year` and `mean` for the line plot.
- This is the yearly aggregation of the global series, loaded from `co2_annmean_gl.csv` (the file is stored in the `co2: Mauna Loa` dataset folder).


In [None]:
# `df2` is the Analysis helper wrapped around the annual-mean table
# (it is not a DataFrame itself).
df2 = Analysis(df_ml_annual_mean)
df2.eda()
# Only the last expression of a cell is rendered, so the correlation matrix
# has to be displayed explicitly or it is computed and thrown away.
# `display` is provided by the IPython kernel.
display(df2.correlation())
df2.outlier('mean')
df2.heatmap()
# Last expression: the annual forecast is rendered as the cell output.
df2.forecast_annual('year', 'mean')

In [None]:
# Annual mean CO2 as a single line, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(
    df_ml_annual_mean["year"],
    df_ml_annual_mean["mean"],
    label="Mean",
)
ax.set_title("Annual Average")
ax.set_xlabel("Year")
ax.set_ylabel("CO2 (PPM)")
ax.grid(True)
ax.legend()
plt.show()

Annual growth rate of CO2:
- `ann inc` is the year-over-year increase.
- Useful to compare acceleration or slowdowns over time.


In [None]:
# `df3` is the Analysis helper wrapped around the annual growth-rate table
# (it is not a DataFrame itself).
df3 = Analysis(df_ml_annual_global_rate)
df3.eda()
# Only the last expression of a cell is rendered, so the correlation matrix
# has to be displayed explicitly or it is computed and thrown away.
# `display` is provided by the IPython kernel.
display(df3.correlation())
df3.outlier('ann inc')
df3.heatmap()
# Last expression: the forecast of the annual increase is rendered as the cell output.
df3.forecast_annual('year', 'ann inc')

In [None]:
# Year-over-year CO2 increase. The plotted column is `ann inc`, so the legend
# names the increase (not an average) and the y-axis is a rate per year.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(
    df_ml_annual_global_rate["year"],
    df_ml_annual_global_rate["ann inc"],
    label="Annual Increase",
)
ax.set_title("Annual Increase")
ax.set_xlabel("Year")
ax.set_ylabel("CO2 increase (PPM per year)")
ax.grid(True)
ax.legend()
plt.show()

Daily global trend series:
- Plot both `smoothed` and `trend` for context.
- The x-axis needs a time value: either the row index (sample order) or a datetime built from the `year`, `month`, and `day` columns.


In [None]:
# `df4` is the Analysis helper wrapped around the daily trend table
# (it is not a DataFrame itself).
df4 = Analysis(df_ml_daily_global_trend)
df4.eda()
# Only the last expression of a cell is rendered, so the correlation matrix
# has to be displayed explicitly or it is computed and thrown away.
# `display` is provided by the IPython kernel.
display(df4.correlation())
df4.outlier('trend')
df4.heatmap()

# Analysis keeps a reference to the frame rather than a copy, so the `date`
# column added here is visible to `df4` even though it was created earlier.
df_ml_daily_global_trend["date"] = pd.to_datetime(df_ml_daily_global_trend[["year", "month", "day"]])
# Last expression: the daily forecast is rendered as the cell output.
df4.forecast_daily('date', 'trend')

In [None]:
# The frame's index is just the row position, which does not match the
# "Year" axis label. Build real dates from the calendar columns instead.
daily_dates = pd.to_datetime(df_ml_daily_global_trend[["year", "month", "day"]])

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(daily_dates, df_ml_daily_global_trend["smoothed"], label="Smoothed")
ax.plot(daily_dates, df_ml_daily_global_trend["trend"], label="Trend")
ax.set_title("Global Trend")
ax.set_xlabel("Year")
ax.set_ylabel("CO2 (PPM)")
ax.grid(True)
ax.legend()
plt.show()