### Import Libaries

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from datetime import datetime, timedelta
import time
import sys

import openmeteo_requests
import requests_cache
from retry_requests import retry

### Load Dataset

In [None]:
dataset = pd.read_csv('./Canada_Pedestrian_Counts_Avg_2019_2025_LLM.csv')[0:410] # trimmming by row as Open meteo only goes up to 2025-11-16 for histroical weather data
dataset.head()

Unnamed: 0,Date,City,Pedestrian_Avg,Real_Data_Proportion,Count_Type,Latitude,Longitude
0,2019-01-01,Toronto,86845.0,89.19,both,43.65107,-79.347015
1,2019-01-01,Montreal,122894.0,96.74,real,45.50169,-73.567253
2,2019-01-01,Vancouver,65470.0,74.44,both,49.28273,-123.120735
3,2019-01-01,Edmonton,39321.0,71.31,both,53.54613,-113.493823
4,2019-01-01,Calgary,50365.0,96.24,real,51.04473,-114.071883


In [3]:
dataset.describe()

Unnamed: 0,Pedestrian_Avg,Real_Data_Proportion,Latitude,Longitude
count,410.0,410.0,410.0,410.0
mean,78849.939024,79.947098,48.60527,-100.720142
std,37066.049106,11.960652,3.60991,20.210362
min,26774.0,60.04,43.65107,-123.120735
25%,50262.5,69.235,45.50169,-114.071883
50%,67038.0,80.345,49.28273,-113.493823
75%,102698.0,89.935,51.04473,-79.347015
max,182352.0,99.99,53.54613,-73.567253


In [4]:
df = dataset.copy()

In [5]:
LocID = {'Toronto':'CAD_1','Montreal':'CAD_2','Vancouver':'CAD_3','Edmonton':'CAD_4','Calgary':'CAD_5'}
df.insert(2,'Loc_ID',df['City'].map(LocID))
df

Unnamed: 0,Date,City,Loc_ID,Pedestrian_Avg,Real_Data_Proportion,Count_Type,Latitude,Longitude
0,2019-01-01,Toronto,CAD_1,86845.0,89.19,both,43.65107,-79.347015
1,2019-01-01,Montreal,CAD_2,122894.0,96.74,real,45.50169,-73.567253
2,2019-01-01,Vancouver,CAD_3,65470.0,74.44,both,49.28273,-123.120735
3,2019-01-01,Edmonton,CAD_4,39321.0,71.31,both,53.54613,-113.493823
4,2019-01-01,Calgary,CAD_5,50365.0,96.24,real,51.04473,-114.071883
...,...,...,...,...,...,...,...,...
405,2025-10-01,Montreal,CAD_2,125879.0,88.42,both,45.50169,-73.567253
406,2025-10-01,Vancouver,CAD_3,86646.0,80.36,both,49.28273,-123.120735
407,2025-10-01,Edmonton,CAD_4,46652.0,72.34,both,53.54613,-113.493823
408,2025-10-01,Toronto,CAD_1,182352.0,66.61,synthetic,43.65107,-79.347015


### Getting Past Weather Data Month By Month Per Location

In [6]:
CITY_COORDS = {
    "Toronto":  (43.65, -79.38),
    "Vancouver":(49.28, -123.12),
    "Edmonton": (53.55, -113.49),
    "Calgary":  (51.05, -114.07),
    "Montreal": (45.50, -73.57),
}

# Setup the Open-Meteo API client with cache and retry on error
cache_sess=requests_cache.CachedSession('.cache',expire_after=3600)
rtry_sess=retry(cache_sess,retries=5,backoff_factor=0.2)
openmeteo=openmeteo_requests.Client(session=rtry_sess)

def fetch_open_meteo_weather(start_date, end_date, lat, long):
    url = "https://historical-forecast-api.open-meteo.com/v1/forecast"
    params = {
        "latitude": lat,
        "longitude": long,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": ["temperature_2m", "precipitation", "wind_speed_10m"],
    }
    
    hly = openmeteo.weather_api(url,params=params)[0].Hourly()
    
    Tavg = hly.Variables(0).ValuesAsNumpy().mean()
    Pavg = hly.Variables(1).ValuesAsNumpy().mean()
    Wavg = hly.Variables(2).ValuesAsNumpy().mean()

    return [round(float(Tavg),3),round(float(Pavg),3),round(float(Wavg),3)]

In [7]:
df2 = df.copy()

In [8]:
for i,r in df.iterrows():
    print(f"{i+1}//{len(df2)}", end='\r')
    lat = r['Latitude']
    long = r['Longitude']
    stDt = r['Date']
    edDt =  str(datetime.strptime(stDt, "%Y-%m-%d").date() + timedelta(days=22))
    res = fetch_open_meteo_weather(stDt,edDt,lat,long)
    df2.loc[i,'Weather_Temperature_Avg'] = res[0]
    df2.loc[i,'Weather_Precipitation_Avg'] = res[1]
    df2.loc[i,'Weather_Wind_Speed_Avg'] = res[2]
    sys.stdout.flush()
    time.sleep(0.1)
df2.head()

410//410

Unnamed: 0,Date,City,Loc_ID,Pedestrian_Avg,Real_Data_Proportion,Count_Type,Latitude,Longitude,Weather_Temperature_Avg,Weather_Precipitation_Avg,Weather_Wind_Speed_Avg
0,2019-01-01,Toronto,CAD_1,86845.0,89.19,both,43.65107,-79.347015,-4.407,0.114,15.745
1,2019-01-01,Montreal,CAD_2,122894.0,96.74,real,45.50169,-73.567253,-10.025,0.184,11.489
2,2019-01-01,Vancouver,CAD_3,65470.0,74.44,both,49.28273,-123.120735,5.477,0.338,10.35
3,2019-01-01,Edmonton,CAD_4,39321.0,71.31,both,53.54613,-113.493823,,,
4,2019-01-01,Calgary,CAD_5,50365.0,96.24,real,51.04473,-114.071883,-4.135,0.017,8.721


In [9]:
df2[df2.isna().any(axis=1)].shape

(27, 11)

In [10]:
df2_ff = df2.ffill()

In [11]:
df2_ff[df2_ff.isna().any(axis=1)].shape

(0, 11)

In [12]:
df2_ff[df2_ff.isna().any(axis=1)]

Unnamed: 0,Date,City,Loc_ID,Pedestrian_Avg,Real_Data_Proportion,Count_Type,Latitude,Longitude,Weather_Temperature_Avg,Weather_Precipitation_Avg,Weather_Wind_Speed_Avg


### Create the Season & Month columns

In [13]:
df3 = df2_ff.copy()
df3['Date'] = pd.to_datetime(df3['Date'])

In [14]:
seasons = {0:'Winter',1:'Spring',2:'Summer',3:'Fall'}
month = {1:'Janary',2:'February',3:'March',4:'Apirl',5:'May',6:'June',7:'July',8:'August',9:'Setember',10:'October',11:'November',12:'December'}
df3['Month'] = df3['Date'].dt.month
for i,r in df3.iterrows():
    df3.loc[i,'Season'] = seasons.get(int(r['Month']/4))
df3['Month'] = df3['Month'].apply(lambda x: month.get(x))
df3.head()

Unnamed: 0,Date,City,Loc_ID,Pedestrian_Avg,Real_Data_Proportion,Count_Type,Latitude,Longitude,Weather_Temperature_Avg,Weather_Precipitation_Avg,Weather_Wind_Speed_Avg,Month,Season
0,2019-01-01,Toronto,CAD_1,86845.0,89.19,both,43.65107,-79.347015,-4.407,0.114,15.745,Janary,Winter
1,2019-01-01,Montreal,CAD_2,122894.0,96.74,real,45.50169,-73.567253,-10.025,0.184,11.489,Janary,Winter
2,2019-01-01,Vancouver,CAD_3,65470.0,74.44,both,49.28273,-123.120735,5.477,0.338,10.35,Janary,Winter
3,2019-01-01,Edmonton,CAD_4,39321.0,71.31,both,53.54613,-113.493823,5.477,0.338,10.35,Janary,Winter
4,2019-01-01,Calgary,CAD_5,50365.0,96.24,real,51.04473,-114.071883,-4.135,0.017,8.721,Janary,Winter


### Cacluate the Attraction Score & Tourist Saturation Level

In [18]:
df4 = df3.copy()

In [21]:
# ns = np.random.default_rng(42)

# # Base attractiveness by city
# citys = {"Toronto": 0.8,"Vancouver": 0.7,"Edmonton": 0.5,"Calgary": 0.6,"Montreal": 0.75}

# adjseason = {"Winter": -0.10,"Spring": 0.05,"Summer": 0.15,"Fall": 0.00}

# def attraction(city, season):
#     base = citys.get(city, 0.60) + adjseason.get(season, 0)
#     noise = ns.normal(0, 0.05)
#     return round(np.clip(base + noise, 0.0, 1.0), 3)


# df4["Attraction_Score"] = df4.apply( lambda x: attraction(x["City"], x["Season"]), axis=1)

# # Normalize pedestrian average within each city
# df4["P_N"] = df4.groupby("City")["Pedestrian_Avg"].transform( lambda s: (s - s.min()) / (s.max() - s.min() + 1e-9))
# df4["Tourist_Saturation_Level"] = np.clip( df4["P_N"] * 0.8 + ns.normal(0, 0.05, len(df4)) + 0.1, 0.0, 1.0).round(3)
# df4.drop(columns=["P_N"], inplace=True)
# df4.head()

# Base attraction score by city (0-1 scale)
city_score = {"Toronto": 0.8,"Vancouver": 0.7,"Montreal": 0.85,"Calgary": 0.6,"Edmonton": 0.55}

# Map each city’s base score
df4["Attraction_Score"] = df4["City"].map(city_score)

# Add small random variation (so it's not static)
df4["Attraction_Score"] = (df4["Attraction_Score"] + np.random.uniform(-0.05, 0.05, len(df4))).clip(0, 1).round(3)

# Tourist_Saturation_Level = normalized Pedestrian_Avg (0–1)
df4["Tourist_Saturation_Level"] = df4.groupby("City")["Pedestrian_Avg"].transform(lambda s: (s - s.min()) / (s.max() - s.min())).round(3)
df4.head()

Unnamed: 0,Date,City,Loc_ID,Pedestrian_Avg,Real_Data_Proportion,Count_Type,Latitude,Longitude,Weather_Temperature_Avg,Weather_Precipitation_Avg,Weather_Wind_Speed_Avg,Month,Season,Attraction_Score,Tourist_Saturation_Level
0,2019-01-01,Toronto,CAD_1,86845.0,89.19,both,43.65107,-79.347015,-4.407,0.114,15.745,Janary,Winter,0.847,0.009
1,2019-01-01,Montreal,CAD_2,122894.0,96.74,real,45.50169,-73.567253,-10.025,0.184,11.489,Janary,Winter,0.809,0.861
2,2019-01-01,Vancouver,CAD_3,65470.0,74.44,both,49.28273,-123.120735,5.477,0.338,10.35,Janary,Winter,0.676,0.472
3,2019-01-01,Edmonton,CAD_4,39321.0,71.31,both,53.54613,-113.493823,5.477,0.338,10.35,Janary,Winter,0.565,0.317
4,2019-01-01,Calgary,CAD_5,50365.0,96.24,real,51.04473,-114.071883,-4.135,0.017,8.721,Janary,Winter,0.636,0.471


In [22]:
df4.describe()

Unnamed: 0,Date,Pedestrian_Avg,Real_Data_Proportion,Latitude,Longitude,Weather_Temperature_Avg,Weather_Precipitation_Avg,Weather_Wind_Speed_Avg,Attraction_Score,Tourist_Saturation_Level
count,410,410.0,410.0,410.0,410.0,410.0,410.0,410.0,410.0,410.0
mean,2022-05-17 03:30:43.902438912,78849.939024,79.947098,48.60527,-100.720142,7.793307,0.080571,10.701122,0.699922,0.496568
min,2019-01-01 00:00:00,26774.0,60.04,43.65107,-123.120735,-18.425,0.0,5.341,0.503,0.0
25%,2020-09-01 00:00:00,50262.5,69.235,45.50169,-114.071883,0.439,0.024,8.6355,0.585,0.24475
50%,2022-05-16 12:00:00,67038.0,80.345,49.28273,-113.493823,8.348,0.051,10.144,0.699,0.489
75%,2024-02-01 00:00:00,102698.0,89.935,51.04473,-79.347015,16.134,0.112,12.26175,0.812,0.74575
max,2025-10-01 00:00:00,182352.0,99.99,53.54613,-73.567253,23.931,0.712,19.014,0.9,1.0
std,,37066.049106,11.960652,3.60991,20.210362,9.619658,0.086898,2.768255,0.118807,0.29756


### Save new dataset

In [23]:
output_path = "./Canada_Full_Pedestrian_Counts_Avg_2019_2025.csv"
df4.to_csv(output_path, index=False)