Google Colab link:
https://colab.research.google.com/drive/1_JERw_L3pMvDkq1M_5CTli3yJHy4p_eE?usp=sharing


In [2]:
!pip install catboost xgboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# Import models and cross-validation tools
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [4]:
# ----------------------------------------
# 1. Data Preparation
# ----------------------------------------
# Read the raw CSV with 'dt' parsed as datetimes, then keep only the date and
# land-average temperature columns, discarding rows with no temperature value.
raw_temperatures = pd.read_csv('/content/GlobalTemperatures.csv', parse_dates=['dt'])
global_temp = (
    raw_temperatures
    .loc[:, ['dt', 'LandAverageTemperature']]
    .dropna(subset=['LandAverageTemperature'])
)

In [5]:
# Training window: every observation dated on or before 2010-08-01.
# .copy() gives an independent frame, so adding columns later does not touch global_temp.
train_cutoff = pd.to_datetime("2010-08-01")
is_training_row = global_temp['dt'] <= train_cutoff
train_data = global_temp.loc[is_training_row].copy()

In [6]:
# Derive calendar features from the observation date:
#   year / month  - calendar components of 'dt'
#   date_ordinal  - proleptic Gregorian ordinal of 'dt' (datetime.toordinal)
observation_dates = train_data['dt']
train_data = train_data.assign(
    year=observation_dates.dt.year,
    month=observation_dates.dt.month,
    date_ordinal=observation_dates.map(datetime.toordinal),
)

# Feature matrix and regression target
feature_columns = ['year', 'month', 'date_ordinal']
X_train = train_data.loc[:, feature_columns]
y_train = train_data.loc[:, 'LandAverageTemperature']


In [7]:
# ----------------------------------------
# 2. Set Up Time Series Cross-Validation and Scorer
# ----------------------------------------
# Five splits from scikit-learn's time-series splitter.
tscv = TimeSeriesSplit(n_splits=5)


def rmse(y_true, y_pred):
    """Return the root of the mean squared error between y_true and y_pred."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)


# RMSE is an error measure (lower is better), hence greater_is_better=False.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [12]:
# ----------------------------------------
# 3. Define Models
# ----------------------------------------
# Base models, keyed by the name used in the performance report.
# Entries are listed in the order they are evaluated and printed.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(),
    DecisionTree=DecisionTreeRegressor(random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
    GradientBoosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
    # Currently disabled:
    # XGBoost=XGBRegressor(n_estimators=100, random_state=42, objective='reg:squarederror'),
    ExtraTrees=ExtraTreesRegressor(n_estimators=100, random_state=42),
    AdaBoost=AdaBoostRegressor(n_estimators=100, random_state=42),
    CatBoost=CatBoostRegressor(iterations=100, random_state=42, verbose=0),
)

In [15]:

# Ensemble models
# The voting ensemble and both stacking ensembles combine the same subset of
# base models; each ensemble gets its own freshly constructed instances.
def _base_learners():
    """Return new (name, estimator) pairs for the RF / GB / CatBoost trio."""
    return [
        ("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
        ("gb", GradientBoostingRegressor(n_estimators=100, random_state=42)),
        ("cat", CatBoostRegressor(iterations=100, random_state=42, verbose=0)),
    ]


voting_reg = VotingRegressor(estimators=_base_learners())

# The two stacking variants differ only in the final estimator.
stacking_reg = StackingRegressor(estimators=_base_learners(), final_estimator=Ridge())
stacking_reg2 = StackingRegressor(estimators=_base_learners(), final_estimator=LinearRegression())

# Register the ensembles after the base models, in this order.
models.update(
    VotingRegressor=voting_reg,
    StackingRegressor=stacking_reg,
    StackingRegressor2=stacking_reg2,
)

In [16]:
# ----------------------------------------
# 4. Evaluate All Models Using Cross-Validation
# ----------------------------------------
print("Model Performance (using TimeSeriesSplit and RMSE):")
model_performance = {}

for name, model in models.items():
    # The scorer yields negated MSE per fold; flip the sign and take the
    # square root to obtain one RMSE value per fold.
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train, cv=tscv, scoring="neg_mean_squared_error"
    )
    fold_rmse = np.sqrt(-neg_mse_per_fold)
    mean_rmse, std_rmse = fold_rmse.mean(), fold_rmse.std()

    # Only the mean across folds is kept for model selection.
    model_performance[name] = mean_rmse
    print(f"{name}: RMSE = {mean_rmse:.4f} ± {std_rmse:.4f}")

Model Performance (using TimeSeriesSplit and RMSE):
LinearRegression: RMSE = 4.0782 ± 0.4711
Ridge: RMSE = 4.1553 ± 0.1097
DecisionTree: RMSE = 0.7845 ± 0.3483
RandomForest: RMSE = 0.7043 ± 0.3079
GradientBoosting: RMSE = 0.7260 ± 0.3548
ExtraTrees: RMSE = 0.7845 ± 0.3482
AdaBoost: RMSE = 1.1717 ± 0.1722
CatBoost: RMSE = 0.7533 ± 0.4107
VotingRegressor: RMSE = 0.7130 ± 0.3501
StackingRegressor: RMSE = 0.7878 ± 0.4323
StackingRegressor2: RMSE = 0.7878 ± 0.4320


In [17]:
# ----------------------------------------
# 5. Identify Best Performing Models
# ----------------------------------------
# Compare base models only. Ensembles are recognised by estimator type rather
# than by a hand-maintained list of names, so every voting/stacking entry in
# `models` (including "StackingRegressor2") is left out of this comparison.
base_model_performance = {
    name: score
    for name, score in model_performance.items()
    if not isinstance(models[name], (VotingRegressor, StackingRegressor))
}
best_base_model = min(base_model_performance, key=base_model_performance.get)
print(f"\nBest performing base model: {best_base_model} with RMSE = {base_model_performance[best_base_model]:.4f}")

# Compare overall performance including ensembles (lowest mean RMSE wins)
best_overall_model = min(model_performance, key=model_performance.get)
print(f"Best overall model (including ensembles): {best_overall_model} with RMSE = {model_performance[best_overall_model]:.4f}")


Best performing base model: RandomForest with RMSE = 0.7043
Best overall model (including ensembles): RandomForest with RMSE = 0.7043


In [18]:
# ----------------------------------------
# 6. Fit the Best Model on the Full Training Data
# ----------------------------------------
# predict_temperatures() reads the module-level `best_model`, so this cell must
# run before any forecast is requested.

# `best_model` is the very object stored in `models`, so fitting it here also
# leaves that dictionary entry fitted.
best_model = models[best_overall_model]
best_model.fit(X_train, y_train)

In [19]:
# ----------------------------------------
# 7. Define the Prediction Function
# ----------------------------------------
def predict_temperatures(n, model=None, last_train_date="2010-08-01"):
    """
    Predict global LandAverageTemperature for the n months after the training data.

    Parameters:
        n (int): Number of months to predict. Must be non-negative.
        model: Fitted regressor exposing ``predict(X)``. When omitted, the
            module-level ``best_model`` is used.
        last_train_date (str or datetime-like): Month-start date of the last
            training observation; forecasts begin one month later.
            Defaults to "2010-08-01".

    Returns:
        DataFrame: A DataFrame with columns 'dt' and 'LandAverageTemperature'
                   (the predicted values), one row per forecast month.
                   Empty when n is 0.

    Raises:
        ValueError: If n is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Forecasting starts with the month right after the last training month.
    start_date = pd.to_datetime(last_train_date) + pd.DateOffset(months=1)
    future_dates = pd.date_range(start=start_date, periods=n, freq='MS')
    future_df = pd.DataFrame({'dt': future_dates})

    if n == 0:
        # Nothing to forecast: return an empty, correctly shaped frame instead
        # of handing the model an empty feature matrix.
        future_df['LandAverageTemperature'] = pd.Series(dtype='float64')
        return future_df[['dt', 'LandAverageTemperature']]

    if model is None:
        # Resolved at call time so a later refit of best_model is picked up.
        model = best_model

    # Same features, in the same order, as used to build X_train.
    future_df['year'] = future_df['dt'].dt.year
    future_df['month'] = future_df['dt'].dt.month
    future_df['date_ordinal'] = future_df['dt'].map(datetime.toordinal)

    X_future = future_df[['year', 'month', 'date_ordinal']]
    future_df['LandAverageTemperature'] = model.predict(X_future)

    return future_df[['dt', 'LandAverageTemperature']]

# Demo: forecast the twelve months that follow the training window.
predictions = predict_temperatures(n=12)
print("\nPredictions for the next 12 months:", predictions, sep="\n")


Predictions for the next 12 months:
           dt  LandAverageTemperature
0  2010-09-01                13.03364
1  2010-10-01                10.21994
2  2010-11-01                 7.09233
3  2010-12-01                 4.37799
4  2011-01-01                 3.72382
5  2011-02-01                 4.21080
6  2011-03-01                 6.55042
7  2011-04-01                 9.55097
8  2011-05-01                12.29834
9  2011-06-01                14.31928
10 2011-07-01                15.21599
11 2011-08-01                14.70578
