In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from dateutil import parser
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Load in Data sets and combine, change date to datetime.date()

In [2]:
Auck_peds = pd.read_csv('data_weather/Final/Auckland_Pedestrian_daily.csv')
Dub_peds = pd.read_csv('data_weather/Final/Dublin_Pedestrian_daily.csv')

df = pd.concat([Auck_peds,Dub_peds],ignore_index=True,axis='index')
df['Date'] = df['Date'].apply(lambda x: parser.parse(x))

display(df.head(2))
display(df.tail(2))

Unnamed: 0,Country,City,Location_ID,Location_Name,Type_of_Attraction,Attraction_Category,Latitude,Longitude,Date,PedsSen_Count,Weather_Temperature,Weather_Wind_Gust,Weather_Relative_Humidity,Weather_Precipitation,Is_Holiday
0,New Zealand,Auckland,NZAUK_1,Sky Tower,Observation Tower,Entertainment,-36.8485,174.7633,2021-01-01,36620.0,18.858334,18.749998,78.87771,0.7,1
1,New Zealand,Auckland,NZAUK_1,Sky Tower,Observation Tower,Entertainment,-36.8485,174.7633,2021-01-02,35415.0,18.966665,19.724998,86.536705,15.800002,1


Unnamed: 0,Country,City,Location_ID,Location_Name,Type_of_Attraction,Attraction_Category,Latitude,Longitude,Date,PedsSen_Count,Weather_Temperature,Weather_Wind_Gust,Weather_Relative_Humidity,Weather_Precipitation,Is_Holiday
16432,Ireland,Dublin,IRDUB_8,EPIC The Irish Emigration Museum,Museum,Cultural,53.3471,-6.2416,2025-12-30,263127.0,6.845833,27.434998,73.873955,0.3,0
16433,Ireland,Dublin,IRDUB_8,EPIC The Irish Emigration Museum,Museum,Cultural,53.3471,-6.2416,2025-12-31,264930.0,3.614583,19.86,78.457085,0.0,0


### Performing Label Cyclical Encoding 

In [3]:
# Encode categorical columns (except target)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    if col != 'Location_ID':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

df['Month_Sin'] = np.sin(2 * np.pi * df['Date'].dt.month / 12)
df['Month_Cos'] = np.cos(2 * np.pi * df['Date'].dt.month / 12)
df['Day_Sin']   = np.sin(2 * np.pi * df['Date'].dt.dayofweek / 7)
df['Day_Cos']   = np.cos(2 * np.pi * df['Date'].dt.dayofweek / 7)

display(df.head(10))

Unnamed: 0,Country,City,Location_ID,Location_Name,Type_of_Attraction,Attraction_Category,Latitude,Longitude,Date,PedsSen_Count,Weather_Temperature,Weather_Wind_Gust,Weather_Relative_Humidity,Weather_Precipitation,Is_Holiday,Month_Sin,Month_Cos,Day_Sin,Day_Cos
0,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-01,36620.0,18.858334,18.749998,78.87771,0.7,1,0.5,0.866025,-0.433884,-0.900969
1,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-02,35415.0,18.966665,19.724998,86.536705,15.800002,1,0.5,0.866025,-0.974928,-0.222521
2,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-03,37663.0,18.820837,23.085,87.366516,19.7,0,0.5,0.866025,-0.781831,0.62349
3,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-04,41416.0,19.887499,20.654999,80.15483,1.2,1,0.5,0.866025,0.0,1.0
4,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-05,55131.0,20.666666,14.73,81.36724,0.5,0,0.5,0.866025,0.781831,0.62349
5,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-06,53695.0,20.729166,25.769999,76.58532,0.0,0,0.5,0.866025,0.974928,-0.222521
6,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-07,58715.0,19.99375,19.664999,83.3534,3.9,0,0.5,0.866025,0.433884,-0.900969
7,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-08,61342.0,19.779167,22.619999,82.93931,20.5,0,0.5,0.866025,-0.433884,-0.900969
8,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-09,51377.0,19.843748,16.949997,81.506996,1.8,0,0.5,0.866025,-0.974928,-0.222521
9,1,0,NZAUK_1,5,4,1,-36.8485,174.7633,2021-01-10,38325.0,20.108334,28.859999,76.20948,0.3,0,0.5,0.866025,-0.781831,0.62349


### Feature Selection

In [4]:
cols_to_use = [
 'PedsSen_Count',
 'Weather_Temperature',
 'Weather_Wind_Gust',
 'Weather_Relative_Humidity',
 'Weather_Precipitation',
 'Month_Sin',
 'Month_Cos',
 'Day_Sin',
 'Day_Cos',
 'Latitude',
 'Longitude','Attraction_Category'
]

In [5]:
# Define features and target , metric='minkows', p=3
X = df.drop(columns=['Location_ID',
                     'Country',
                     'City','Is_Holiday',
                     'Type_of_Attraction',
                     'Location_Name',
                     'Date'])
y = df['Location_ID']
display(X.head(0))
display(y.head(0))

Unnamed: 0,Attraction_Category,Latitude,Longitude,PedsSen_Count,Weather_Temperature,Weather_Wind_Gust,Weather_Relative_Humidity,Weather_Precipitation,Month_Sin,Month_Cos,Day_Sin,Day_Cos


Series([], Name: Location_ID, dtype: object)

### Model training

In [6]:
# Split train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train KNN Classifier
knn = KNeighborsClassifier(n_neighbors=16, metric='minkowski', p=1)
knn.fit(X_train, y_train)

0,1,2
,n_neighbors,16
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,1
,metric,'minkowski'
,metric_params,
,n_jobs,


#### Evaluating Model 

In [7]:
# Evaluate
y_pred = knn.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

     IRDUB_1       0.73      0.82      0.77       365
     IRDUB_3       1.00      1.00      1.00       365
     IRDUB_4       1.00      1.00      1.00       365
     IRDUB_5       0.84      0.92      0.88       366
     IRDUB_8       0.96      0.75      0.84       365
     NZAUK_1       0.64      0.79      0.70       365
     NZAUK_5       0.89      0.71      0.79       366
     NZAUK_6       1.00      1.00      1.00       365
     NZAUK_7       0.82      0.80      0.81       365

    accuracy                           0.86      3287
   macro avg       0.88      0.86      0.87      3287
weighted avg       0.88      0.86      0.87      3287


Confusion Matrix:
 [[299   0   0  54  12   0   0   0   0]
 [  0 365   0   0   0   0   0   0   0]
 [  0   1 364   0   0   0   0   0   0]
 [ 29   0   0 337   0   0   0   0   0]
 [ 80   0   0  12 273   0   0   0   0]
 [  0   0   0   0   0 287  32   0  46]
 [  0   0   0   

#### Using a test case

In [8]:
yPD,yPI = knn.kneighbors([[7764.83,14.5,43.4,2.34,62.34,0.5,0.866025,-0.974928,-0.222521,-36.8485,174.7633,3]])
Found = pd.DataFrame(columns=df.columns)

In [9]:
loc_id = "IRDUB_1" # This was a location to be displayed to user

In [10]:
yPD

array([[8097.73410212, 8097.84008828, 8098.18025822, 8098.48305948,
        8098.50505019, 8098.57721483, 8099.05846688, 8099.10025926,
        8099.13858589, 8099.14303909, 8099.20308349, 8099.35697382,
        8099.37226825, 8099.37988918, 8099.38192169, 8099.42599929]])

In [11]:
yPI

array([[ 5381,   381,  3370, 12732, 11491,  5095,  9941,  3235,  9911,
         8767, 10671,  8247,  6015,   515, 12438,  7258]], dtype=int64)

In [12]:
for i in range(len(yPI[0])):
    idx = yPI[0,i]
    if df['Location_ID'].loc[idx] != loc_id:
        Found.loc[len(Found)] = df.loc[idx]
    
# Keep the row found with lowest crowd
Found = Found.sort_values(by=['PedsSen_Count']).reset_index(drop=True)

In [13]:
display(Found.head(1))

Unnamed: 0,Country,City,Location_ID,Location_Name,Type_of_Attraction,Attraction_Category,Latitude,Longitude,Date,PedsSen_Count,Weather_Temperature,Weather_Wind_Gust,Weather_Relative_Humidity,Weather_Precipitation,Is_Holiday,Month_Sin,Month_Cos,Day_Sin,Day_Cos
0,1,0,NZAUK_7,3,0,1,-36.8531,174.8506,2025-11-16 00:00:00,13513.0,15.405086,18.824999,67.28699,0.0,0,-0.5,0.866025,-0.781831,0.62349


In [14]:
Found.loc[0]

Country                                        1
City                                           0
Location_ID                              NZAUK_7
Location_Name                                  3
Type_of_Attraction                             0
Attraction_Category                            1
Latitude                                -36.8531
Longitude                               174.8506
Date                         2025-11-16 00:00:00
PedsSen_Count                            13513.0
Weather_Temperature                    15.405086
Weather_Wind_Gust                      18.824999
Weather_Relative_Humidity               67.28699
Weather_Precipitation                        0.0
Is_Holiday                                     0
Month_Sin                                   -0.5
Month_Cos                               0.866025
Day_Sin                                -0.781831
Day_Cos                                  0.62349
Name: 0, dtype: object

### Output Model as pickel

In [15]:
import pickle
import os

In [16]:
os.makedirs("knn_model", exist_ok=True) 

model_path = f"knn_model/loc_knn.pkl"
with open(model_path, "wb") as f:
        pickle.dump(knn, f)