#### Import libraries and data

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import datetime

data = pd.read_csv("../data/historical_weather.csv")

#### Clean data

In [30]:
# rename columns to readable names
data.rename(columns={
  "temperature_2m_max (°C)": "max_temp",
  "temperature_2m_min (°C)": "min_temp",
  "temperature_2m_mean (°C)": "mean_temp",
  "precipitation_sum (mm)": "precipitation"
}, inplace=True)

# parse date into separate year, month, day
data["year"] = pd.to_datetime(data["time"]).dt.year
data["month"] = pd.to_datetime(data["time"]).dt.month
data["day"] = pd.to_datetime(data["time"]).dt.day

#### Prepare datasets for ML model

In [None]:
# X = input for ML prediction
# y = output for ML prediction
# example: predict temp for 2045-22-23 (X) = 25°C (y)
X = data[["year", "month", "day"]] # input: date
y = data[["mean_temp"]] # output: mean temp
# y = data[["max_temp", "min_temp", "mean_temp", "precipitation"]] # output: temps

# clean data: remove invalid dates
y = y.dropna()
X = X.iloc[y.index]

# generate datasets to train and test model (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Train and test ML model (Linear regression)

In [None]:
# train ML model using linear regression algorithm
lr = LinearRegression()
lr.fit(X_train, y_train)

# predict on train and test datasets
y_lr_train_pred = lr.predict(X_train)
y_lr_test_pred = lr.predict(X_test)

# prediction results
lr_train_mse = mean_squared_error(y_train, y_lr_train_pred)
lr_train_r2 = r2_score(y_train, y_lr_train_pred)
lr_test_mse = mean_squared_error(y_test, y_lr_test_pred)
lr_test_r2 = r2_score(y_test, y_lr_test_pred)

#### Prediction on real data (future dates)

In [None]:
# generate dates (X) to predict (dates after 2023)
start_date = datetime.date(2023, 1, 1)
end_date = datetime.date(2050, 12, 31)

dates = []
current_date = start_date
while current_date <= end_date:
  dates.append([current_date.year, current_date.month, current_date.day])
  current_date += datetime.timedelta(days=1)

# convert dates to dataframe
dates_to_predict = pd.DataFrame(dates)
dates_to_predict.rename(columns={
  0: "year",
  1: "month",
  2: "day"
}, inplace=True)

#### Predict temperature for future dates

In [None]:
# generate predictions using ML linear regression model
future_temps = lr.predict(dates_to_predict)

future_temps

array([[ 9.77421895],
       [ 9.7751808 ],
       [ 9.77614265],
       ...,
       [15.60939687],
       [15.61035872],
       [15.61132056]])

#### Visualizzare predizioni

In [None]:
# temperatura media per ogni anno
annual_average = data["mean_temp"].resample('Y').mean()
pd.DataFrame(annual_average)

# trend line
trend_line = annual_average.rolling(window=5, min_periods=1).mean()

# Plot the annual average using dots and the trendline
plt.figure(figsize=(10, 6))
plt.plot(
  annual_average.index,
  annual_average.values,
  marker='o',
  label='Annual Average Temperature',
  linestyle='none'
)
plt.plot(
  trend_line.index,
  trend_line.values,
  color='orange',
  label='Trend'
)
plt.xlabel('Year')
plt.ylabel('Average Temperature °C')
plt.title('Annual Average')
plt.legend()
plt.show()

NameError: name 'plt' is not defined