In [34]:
%pip install 'chronos-forecasting>=2.0' 'pandas[pyarrow]' 'matplotlib'



In [35]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from chronos import BaseChronosPipeline
from chronos import ChronosPipeline

Using Amazon Chronos 2 to predict the population of Pakistan for the year 2000.


In [36]:
pipeline = BaseChronosPipeline.from_pretrained("amazon/chronos-2", device_map="cuda")

df = pd.read_csv('/content/Population.csv')

In [37]:
display(df.head())

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],...,1969 [YR1969],1968 [YR1968],1967 [YR1967],1966 [YR1966],1965 [YR1965],1964 [YR1964],1963 [YR1963],1962 [YR1962],1961 [YR1961],1960 [YR1960]
0,"Population, total",SP.POP.TOTL,Afghanistan,AFG,20130327,20284307,21378117,22733049,23560654,24404567,...,11017409,10756922,10505959,10266395,10036008,9814318,9604487,9404406,9214083,9035043
1,"Population, total",SP.POP.TOTL,Albania,ALB,3089027,3060173,3051010,3039616,3026939,3011487,...,2081695,2022272,1965598,1914573,1864791,1814135,1762621,1711319,1659800,1608800
2,"Population, total",SP.POP.TOTL,Algeria,DZA,30903893,31331221,31750835,32175818,32628286,33109249,...,13475960,13172614,12876118,12590069,12365976,12179813,11982118,11800771,11628883,11424922
3,"Population, total",SP.POP.TOTL,American Samoa,ASM,56855,57053,57062,56971,56818,56617,...,26375,25614,24863,24136,23406,22673,21966,21298,20666,20133
4,"Population, total",SP.POP.TOTL,Andorra,AND,65685,65852,66506,69486,74325,77421,...,18555,17176,15837,14626,13634,12764,11915,11086,10283,9510


In [38]:

# Keep only population rows
pop = df[df["Series Name"] == "Population, total"].copy()

# Dropping columns that we dont need
pop = pop.drop(columns=["Series Name", "Series Code", "Country Code"])

pop_long = pop.melt(
    id_vars="Country Name",
    var_name="year_str",
    value_name="target"
)

pop_long["year"] = pop_long["year_str"].str.split(" ").str[0].astype(int)

pop_long["timestamp"] = pd.to_datetime(pop_long["year"], format="%Y")

pop_long = pop_long.rename(columns={"Country Name": "item_id"})

chronos_df = pop_long[["item_id", "timestamp", "target"]].copy()

chronos_df = chronos_df[chronos_df["target"] != ".."]
chronos_df["target"] = chronos_df["target"].astype(float)

chronos_df = chronos_df.sort_values(["item_id", "timestamp"]).reset_index(drop=True)
chronos_df = chronos_df[chronos_df["item_id"] == "Pakistan"]

chronos_df = chronos_df[chronos_df["target"] != "bbb"]  # Important!
chronos_df["target"] = pd.to_numeric(chronos_df["target"], errors='coerce')
chronos_df = chronos_df.dropna(subset=['target'])

print(f"Total data points after cleaning: {len(chronos_df)}")
print(f"Year range: {chronos_df['timestamp'].dt.year.min()} to {chronos_df['timestamp'].dt.year.max()}")


print("Final ready DataFrame:")
print(chronos_df.head())

Total data points after cleaning: 65
Year range: 1960 to 2024
Final ready DataFrame:
       item_id  timestamp      target
9685  Pakistan 1960-01-01  45709310.0
9686  Pakistan 1961-01-01  46921277.0
9687  Pakistan 1962-01-01  48156128.0
9688  Pakistan 1963-01-01  49447776.0
9689  Pakistan 1964-01-01  50799999.0


In [39]:
year_2000_data = chronos_df[
    (chronos_df["timestamp"].dt.year == 2000)
]
print(year_2000_data.head())


       item_id  timestamp       target
9725  Pakistan 2000-01-01  154879127.0


In [40]:
train_df = chronos_df[chronos_df["timestamp"].dt.year <= 1999]
test_df = chronos_df[chronos_df["timestamp"].dt.year >= 2000]

print(f"\nTraining data summary:")
print(f"Years: {train_df['timestamp'].dt.year.min()} to {train_df['timestamp'].dt.year.max()}")
print(f"Number of data points: {len(train_df)}")
print(f"\nLast 5 training values:")
print(train_df[['timestamp', 'target']].tail())


Training data summary:
Years: 1960 to 1999
Number of data points: 40

Last 5 training values:
      timestamp       target
9720 1995-01-01  134582253.0
9721 1996-01-01  138557092.0
9722 1997-01-01  142531804.0
9723 1998-01-01  146522356.0
9724 1999-01-01  150565377.0


In [41]:
# Step 2: Forecast 1 year ahead (→ year 2000)
c2_pred = pipeline.predict_df(
    train_df,
    prediction_length=5,
    quantile_levels=[0.1, 0.5, 0.9],
)

print(c2_pred)


    item_id  timestamp target_name  predictions          0.1          0.5  \
0  Pakistan 2000-01-01      target  160790864.0  156539168.0  160790864.0   
1  Pakistan 2001-01-01      target  164576720.0  160858624.0  164576720.0   
2  Pakistan 2002-01-01      target  168842576.0  163979424.0  168842576.0   
3  Pakistan 2003-01-01      target  173998560.0  167879856.0  173998560.0   
4  Pakistan 2004-01-01      target  177948064.0  171538720.0  177948064.0   

           0.9  
0  166560784.0  
1  169859296.0  
2  175017536.0  
3  179289456.0  
4  185101472.0  


In [42]:
# Extract predictions (median quantile)
predictions = c2_pred["0.5"].values  # Median predictions

actual_years = sorted(test_df['timestamp'].dt.year.unique())[:5]  # First 5 years
actual_values = []

for year in actual_years:
    val = test_df[test_df['timestamp'].dt.year == year]['target'].values
    if len(val) > 0:
        actual_values.append(val[0])
    else:
        actual_values.append(None)

print("\n" + "="*70)
print("CHRONOS-2 ERROR ANALYSIS")
print("="*70)

# Calculate errors for each year
absolute_errors = []
percentage_errors = []

for i, (year, pred_val, actual_val) in enumerate(zip(actual_years, predictions, actual_values)):
    if actual_val is not None:
        abs_error = abs(pred_val - actual_val)
        pct_error = (abs_error / actual_val) * 100

        absolute_errors.append(abs_error)
        percentage_errors.append(pct_error)

        print(f"\nYear {year}:")
        print(f"  Predicted: {pred_val:,.0f}")
        print(f"  Actual:    {actual_val:,.0f}")
        print(f"  Absolute Error: {abs_error:,.0f}")
        print(f"  Percentage Error: {pct_error:.2f}%")

# SUMMARY METRICS
print("\n" + "="*70)
print("SUMMARY METRICS")
print("="*70)

if len(absolute_errors) > 0:
    mae = np.mean(absolute_errors)  # Mean Absolute Error
    mape = np.mean(percentage_errors)  # Mean Absolute Percentage Error
    rmse = np.sqrt(np.mean([e**2 for e in absolute_errors]))  # Root Mean Squared Error

    print(f"Mean Absolute Error (MAE): {mae:,.0f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
    print(f"Root Mean Squared Error (RMSE): {rmse:,.0f}")

    # Focus on year 2000 specifically
    print("\n" + "-"*70)
    print("YEAR 2000 SPECIFIC METRICS:")
    print("-"*70)
    print(f"Absolute Error: {absolute_errors[0]:,.0f}")
    print(f"Percentage Error: {percentage_errors[0]:.2f}%")
else:
    print("No actual data available for comparison!")


CHRONOS-2 ERROR ANALYSIS

Year 2000:
  Predicted: 160,790,864
  Actual:    154,879,127
  Absolute Error: 5,911,737
  Percentage Error: 3.82%

Year 2001:
  Predicted: 164,576,720
  Actual:    159,270,907
  Absolute Error: 5,305,813
  Percentage Error: 3.33%

Year 2002:
  Predicted: 168,842,576
  Actual:    163,222,549
  Absolute Error: 5,620,027
  Percentage Error: 3.44%

Year 2003:
  Predicted: 173,998,560
  Actual:    167,110,248
  Absolute Error: 6,888,312
  Percentage Error: 4.12%

Year 2004:
  Predicted: 177,948,064
  Actual:    171,286,000
  Absolute Error: 6,662,064
  Percentage Error: 3.89%

SUMMARY METRICS
Mean Absolute Error (MAE): 6,077,591
Mean Absolute Percentage Error (MAPE): 3.72%
Root Mean Squared Error (RMSE): 6,107,650

----------------------------------------------------------------------
YEAR 2000 SPECIFIC METRICS:
----------------------------------------------------------------------
Absolute Error: 5,911,737
Percentage Error: 3.82%


Using Amazon Chronos T5 Tiny to predict the population of Pakistan for the year 2000.

In [43]:

train_df = chronos_df[chronos_df["timestamp"].dt.year <= 1999].copy()

train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])
train_df = train_df.sort_values("timestamp")

series = train_df[train_df["item_id"] == "Pakistan"]["target"].values

# Convert to PyTorch tensor
context = torch.tensor(series, dtype=torch.float32)

pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-tiny",
    device_map="cuda",
    torch_dtype=torch.float32,
)

In [44]:

prediction_length = 5  # predict for 2000–2004
num_samples = 20

t5_pred = pipeline.predict(
    context.unsqueeze(0),
    prediction_length=prediction_length,
    num_samples=num_samples,
)


In [48]:

# Extract median predictions → ensure it's a clean 1D numpy array
median_pred = t5_pred.median(dim=0).values.squeeze()  # (prediction_length,)
if median_pred.ndim > 1:
    median_pred = median_pred.squeeze()
predictions = median_pred.cpu().numpy().flatten()  # ← shape: (5,)


# Extract actuals as Python scalars
actual_years = sorted(test_df['timestamp'].dt.year.unique())[:5]
actual_values = []

for year in actual_years:
    subset = test_df[test_df['timestamp'].dt.year == year]['target']
    if len(subset) > 0:
        actual_values.append(subset.iloc[0].item())  # ← scalar
    else:
        actual_values.append(None)

# Loop: Use .item() to get Python scalars
absolute_errors = []
percentage_errors = []

for year, pred_val, actual_val in zip(actual_years, predictions, actual_values):
    if actual_val is not None:
        pred_val = pred_val.item()
        actual_val = float(actual_val)

        abs_error = abs(pred_val - actual_val)
        pct_error = (abs_error / actual_val) * 100 if actual_val != 0 else float('inf')

        absolute_errors.append(abs_error)
        percentage_errors.append(pct_error)

        print(f"\nYear {year}:")
        print(f"  Predicted: {pred_val:,.0f}")
        print(f"  Actual:    {actual_val:,.0f}")
        print(f"  Absolute Error: {abs_error:,.0f}")
        print(f"  Percentage Error: {pct_error:.2f}%")

# SUMMARY METRICS
print("\n" + "="*70)
print("SUMMARY METRICS")
print("="*70)

if len(absolute_errors) > 0:
    mae = np.mean(absolute_errors)  # Mean Absolute Error
    mape = np.mean(percentage_errors)  # Mean Absolute Percentage Error
    rmse = np.sqrt(np.mean([e**2 for e in absolute_errors]))  # Root Mean Squared Error

    print(f"Mean Absolute Error (MAE): {mae:,.0f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
    print(f"Root Mean Squared Error (RMSE): {rmse:,.0f}")

    # Focus on year 2000 specifically
    print("\n" + "-"*70)
    print("YEAR 2000 SPECIFIC METRICS:")
    print("-"*70)
    print(f"Absolute Error: {absolute_errors[0]:,.0f}")
    print(f"Percentage Error: {percentage_errors[0]:.2f}%")
else:
    print("No actual data available for comparison!")


Year 2000:
  Predicted: 153,626,896
  Actual:    154,879,127
  Absolute Error: 1,252,231
  Percentage Error: 0.81%

Year 2001:
  Predicted: 155,555,264
  Actual:    159,270,907
  Absolute Error: 3,715,643
  Percentage Error: 2.33%

Year 2002:
  Predicted: 158,126,432
  Actual:    163,222,549
  Absolute Error: 5,096,117
  Percentage Error: 3.12%

Year 2003:
  Predicted: 160,054,816
  Actual:    167,110,248
  Absolute Error: 7,055,432
  Percentage Error: 4.22%

Year 2004:
  Predicted: 161,983,184
  Actual:    171,286,000
  Absolute Error: 9,302,816
  Percentage Error: 5.43%

SUMMARY METRICS
Mean Absolute Error (MAE): 5,284,448
Mean Absolute Percentage Error (MAPE): 3.18%
Root Mean Squared Error (RMSE): 5,960,973

----------------------------------------------------------------------
YEAR 2000 SPECIFIC METRICS:
----------------------------------------------------------------------
Absolute Error: 1,252,231
Percentage Error: 0.81%


Chronos-t5-tiny is getting a better result for the year 2000. We have a predicted value 153,626,896 while the actual value was 154,879,127. The percentage Error we are getting here is 0.81%.


Chronos-2 is giving us 160,790,864 as the predicted value and a percentage error of 3.82%.



In [49]:
from sklearn.linear_model import LinearRegression
import numpy as np

df2 = pd.read_csv('/content/Population.csv')

# Filter for Pakistan
new_df = chronos_df[chronos_df["item_id"] == "Pakistan"].copy()
new_df["year"] = new_df["timestamp"].dt.year

# Split: train on data up to 1999
train = new_df[new_df["year"] <= 1999]

print(f"Training data: {len(train)} points from {train['year'].min()} to {train['year'].max()}")

# Fit Linear Regression
X_train = train["year"].values.reshape(-1, 1)
y_train = train["target"].values
model = LinearRegression().fit(X_train, y_train)

# Predict for 2000
pred_2000 = model.predict(np.array([[2000]]))

print("\n" + "="*70)
print("LINEAR REGRESSION PREDICTION FOR YEAR 2000")
print("="*70)
print(f"Predicted population: {pred_2000[0]:,.0f}")
print(f"Growth rate: {model.coef_[0]:,.0f} people/year")

actual_data_2000 = new_df[new_df["year"] == 2000]
if len(actual_data_2000) > 0:
    actual = actual_data_2000["target"].values[0]
    error = abs(pred_2000[0] - actual)
    error_pct = (error / actual) * 100
    print(f"\nActual population in 2000: {actual:,.0f}")
    print(f"Absolute error: {error:,.0f}")
    print(f"Percentage error: {error_pct:.2f}%")

Training data: 40 points from 1960 to 1999

LINEAR REGRESSION PREDICTION FOR YEAR 2000
Predicted population: 143,352,770
Growth rate: 2,715,910 people/year

Actual population in 2000: 154,879,127
Absolute error: 11,526,357
Percentage error: 7.44%


Predection using Linear Regression gives us a 7.44% percentage error which is higher than the one we are getting from chronos-t5-tiny. Hence from the three examples chronos-t5-tiny has performed the best forcasting on our population dataset.