In [None]:
import math

import matplotlib.pyplot as plt
import pandas as pd

from models import Paths

# Read the combined dataset CSV found under Paths.OHE exactly once. The other
# cells build their own frames from this constant instead of re-reading the file.
_COMBINED_CSV = Paths.OHE / "combined-data.csv"
UNMODIFIED_DATA: pd.DataFrame = pd.read_csv(_COMBINED_CSV)


In [None]:
# Collapse the raw table to one row per Year by averaging every column;
# as_index=False keeps Year as a regular column.
rows_by_year = UNMODIFIED_DATA.groupby("Year", as_index=False)
data = rows_by_year.mean()


In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Summary statistics of the row-to-row change in Temperature.
# The bare expression on the last line is what the cell displays.
temperature_first_diff = data["Temperature"].diff()
temperature_first_diff.describe()


In [None]:
# Autocorrelation of the first-differenced Temperature series (the leading NaN
# produced by diff() is dropped).
first_diff = data.Temperature.diff().dropna()

# plot_acf returns the Figure it drew on. Binding it instead of leaving the call
# as the cell's last expression stops the notebook from rendering the same
# figure twice (once as the cell result, once by the inline backend).
acf_figure = plot_acf(first_diff)
plt.show()


In [None]:
# Temperature differenced twice; the NaN rows introduced by diff() are dropped.
second_diff = data.Temperature.diff().diff().dropna()

# One figure, two side-by-side panels: the series itself and its ACF.
fig, (series_ax, acf_ax) = plt.subplots(1, 2)

series_ax.set_title("2nd order differencing")
series_ax.plot(second_diff)

plot_acf(second_diff, ax=acf_ax)

plt.show()


In [None]:
# Temperature differenced once; the leading NaN introduced by diff() is dropped.
first_diff = data.Temperature.diff().dropna()

# One figure, two side-by-side panels: the series itself and its ACF.
fig, (series_ax, acf_ax) = plt.subplots(1, 2)

series_ax.set_title("1st order differencing")
series_ax.plot(first_diff)

plot_acf(first_diff, ax=acf_ax)

plt.show()


In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the raw, once-differenced and twice-differenced
# Temperature series. Element 1 of each result is printed as the p-value.
temperature = data.Temperature
adf_inputs = (
    ("p-value: ", temperature),
    ("1st diff p-value: ", temperature.diff().dropna()),
    ("2nd diff p-value: ", temperature.diff().diff().dropna()),
)

for label, series in adf_inputs:
    result = adfuller(series)
    print(label.ljust(20), result[1])


In [None]:
# The ADF p-value of the 1st difference is below the 0.05 significance threshold,
# so that is our d value:
# d = 1

# Undifferenced Temperature next to its partial autocorrelation plot.
raw_temperature = data.Temperature

fig, (series_ax, pacf_ax) = plt.subplots(1, 2)

series_ax.set_title("0th order differencing")
series_ax.plot(raw_temperature)

plot_pacf(raw_temperature, ax=pacf_ax)

plt.show()


In [None]:
# Once-differenced Temperature next to its partial autocorrelation plot;
# the leading NaN introduced by diff() is dropped.
first_diff = data.Temperature.diff().dropna()

fig, (series_ax, pacf_ax) = plt.subplots(1, 2)

series_ax.set_title("1st order differencing")
series_ax.plot(first_diff)

plot_pacf(first_diff, ax=pacf_ax)

plt.show()


In [None]:
# The 1st bar is the most significant one in both partial autocorrelation (PACF) plots, so we set p = 1

# We use the model ARIMA(p=1, d=1, q=2)

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split

# Order-preserving split: shuffle=False keeps the rows in sequence, so the
# final 30% of rows become the test set.
train_data, test_data = train_test_split(data, test_size=0.3, shuffle=False)

# ARIMA(p=1, d=1, q=2) on Temperature, with Co2 supplied as an exogenous regressor.
arima_order = (1, 1, 2)
arima_model = ARIMA(
    endog=train_data.Temperature,
    exog=train_data.Co2,
    order=arima_order,
)
model = arima_model.fit()
print(model.summary())


In [None]:
# In-sample predictions of the fitted model. `p` keeps its name because the
# forecast-plot cell further down reads it.
p = model.predict()

# Plot observed and fitted values against the actual Year column. The previous
# version shifted Year by a hard-coded 1970 to line it up with the positional
# index of `p`, which only works if the data starts in 1970 with no gaps.
fig, ax = plt.subplots()
ax.scatter(train_data.Year, train_data.Temperature, color="blue", label="observed (train)")
ax.scatter(train_data.Year, p.tolist(), color="red", label="fitted")
ax.set_xlabel("Year")
ax.set_ylabel("Temperature")
ax.set_title("ARIMA in-sample fit")
ax.legend()

plt.show()


In [None]:
# Forecast one step per held-out row, feeding the test rows' Co2 values as the
# exogenous input. `forecast` is reused by the metrics cell.
forecast = model.forecast(steps=len(test_data), exog=test_data.Co2)

# Everything is drawn on the real Year axis. The previous version mixed the
# positional index of the predictions with `Year - 1970`, which only lines up
# if the data starts in 1970 with no gaps.
fig, ax = plt.subplots()

# Forecast for the held-out (test) rows.
ax.scatter(test_data.Year, forecast.tolist(), label="forecast (test)")

# In-sample predictions for the training rows.
ax.scatter(train_data.Year, p.tolist(), color="red", label="fitted (train)")

# Observed yearly temperatures for the full range.
ax.scatter(data.Year, data.Temperature, color="green", label="observed")

ax.set_xlabel("Year")
ax.set_ylabel("Temperature")
ax.set_title("ARIMA forecast vs. observed")
ax.legend()

plt.show()


In [None]:
# Measure the accuracy of the forecast against the actual test-period data that was held out from training

from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Score the forecast against the observed Temperature of the test rows.
actual_temperature = test_data.Temperature

mse = mean_squared_error(actual_temperature, forecast)
rmse = sqrt(mse)
r2 = r2_score(actual_temperature, forecast)

for label, value in (("RMSE of forecast: ", rmse), ("R2 of forecast: ", r2)):
    print(label.ljust(20), value)


In [None]:
# Find which Country_* indicator column owns the single highest and the single
# lowest Co2 value in the raw data.
data = UNMODIFIED_DATA  # rebinding kept: later cells read `data`
country_columns = [col for col in data.columns if col.startswith("Country_")]

maxdg = -math.inf  # highest Co2 value seen so far
mindg = +math.inf  # lowest Co2 value seen so far
maxdc = None       # column name that produced maxdg
mindc = None       # column name that produced mindg

for country_column in country_columns:
    # Co2 values of the rows flagged (== 1) for this country.
    country_co2 = data.loc[data[country_column] == 1, "Co2"]

    # Series.max()/min() skip NaN. The builtin max()/min() compare element by
    # element, so a NaN in the column makes their result depend on row order,
    # and they raise ValueError when the selection is empty.
    country_max = country_co2.max()
    country_min = country_co2.min()
    if pd.isna(country_max):
        # No rows for this country, or every Co2 value is missing.
        continue

    if country_max > maxdg:
        maxdg = country_max
        maxdc = country_column
    if country_min < mindg:
        mindg = country_min
        mindc = country_column

print(maxdc, mindc)
print(maxdg, mindg)


In [None]:
# Select the United States rows straight from UNMODIFIED_DATA rather than from
# whatever `data` happens to hold, so the cell gives the same result no matter
# which cell was run before it.
# NOTE: dropna() removes rows with a missing value in *any* column, not only Co2.
data = UNMODIFIED_DATA.query("`Country_United States` == 1").dropna()

fig, ax = plt.subplots()
ax.plot(data.Year, data.Co2)
ax.set_xlabel("Year")
ax.set_ylabel("Co2")
ax.set_title("United States Co2 by year")

plt.show()


In [None]:
# Rebuild the per-Year averages from the raw table and scatter the mean
# Temperature of each year.
yearly_means = UNMODIFIED_DATA.groupby("Year", as_index=False).mean()
data = yearly_means

fig, ax = plt.subplots()
ax.scatter(data.Year, data.Temperature)

plt.show()
