In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from dateutil import parser
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Load in Data sets and combine, change date to datetime.date()

In [2]:
Auck_peds = pd.read_csv('data_weather/Final/Auckland_Pedestrian_Hourly.csv')
Dub_peds = pd.read_csv('data_weather/Final/Dublin_Pedestrian_Hourly.csv')

df = pd.concat([Auck_peds,Dub_peds],ignore_index=True,axis='index')
df['Date'] = df['Date'].apply(lambda x: parser.parse(x))

display(df.head(2))
display(df.tail(2))

Unnamed: 0,Country,City,Location_ID,Location_Name,Type_of_Attraction,Attraction_Category,Latitude,Longitude,Date,Avg_Daily_Pedestrian_Count,Holiday,Weather_Temperature_Avg,Weather_Wind_Speed_Avg,Weather_Precipitation_Sum,Weather_Relative_Humidity_Avg
0,New Zealand,Auckland,NZAUK_1,Sky Tower,Tower,Urban Landmark,-36.8485,174.7633,2021-01-01,2686.0,1.0,19.225752,19.874998,14.5,83.94776
1,New Zealand,Auckland,NZAUK_1,Sky Tower,Tower,Urban Landmark,-36.8485,174.7633,2021-01-02,2964.0,1.0,18.509085,21.929998,15.800001,88.19395


Unnamed: 0,Country,City,Location_ID,Location_Name,Type_of_Attraction,Attraction_Category,Latitude,Longitude,Date,Avg_Daily_Pedestrian_Count,Holiday,Weather_Temperature_Avg,Weather_Wind_Speed_Avg,Weather_Precipitation_Sum,Weather_Relative_Humidity_Avg
13350,Ireland,Dublin,IRDUB_5,Dublin Castle,Historic Site,Culture & History,53.3429,-6.2675,2025-09-05,4096.0,0.0,15.035167,28.800001,0.0,78.31516
13351,Ireland,Dublin,IRDUB_5,Dublin Castle,Historic Site,Culture & History,53.3429,-6.2675,2025-09-06,3496.0,0.0,17.330997,44.94,2.8,78.5794


### Performing Label Cyclical Encoding 

In [3]:
# Encode categorical columns (except target)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    if col != 'Location_ID':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

df['Month_Sin'] = np.sin(2 * np.pi * df['Date'].dt.month / 12)
df['Month_Cos'] = np.cos(2 * np.pi * df['Date'].dt.month / 12)
df['Day_Sin']   = np.sin(2 * np.pi * df['Date'].dt.dayofweek / 7)
df['Day_Cos']   = np.cos(2 * np.pi * df['Date'].dt.dayofweek / 7)

display(df.head(10))

Unnamed: 0,Country,City,Location_ID,Location_Name,Type_of_Attraction,Attraction_Category,Latitude,Longitude,Date,Avg_Daily_Pedestrian_Count,Holiday,Weather_Temperature_Avg,Weather_Wind_Speed_Avg,Weather_Precipitation_Sum,Weather_Relative_Humidity_Avg,Month_Sin,Month_Cos,Day_Sin,Day_Cos
0,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-01,2686.0,1.0,19.225752,19.874998,14.5,83.94776,0.5,0.866025,-0.433884,-0.900969
1,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-02,2964.0,1.0,18.509085,21.929998,15.800001,88.19395,0.5,0.866025,-0.974928,-0.222521
2,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-03,2918.0,0.0,19.51325,21.960001,7.0,80.49761,0.5,0.866025,-0.781831,0.62349
3,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-04,3210.0,1.0,20.307,14.864999,0.5,82.51716,0.5,0.866025,0.0,1.0
4,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-05,4083.0,0.0,20.627832,22.559998,0.0,78.30032,0.5,0.866025,0.781831,0.62349
5,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-06,4428.0,0.0,20.048666,20.37,1.1,80.82641,0.5,0.866025,0.974928,-0.222521
6,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-07,3950.0,0.0,19.590334,24.135002,22.500002,83.33901,0.5,0.866025,0.433884,-0.900969
7,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-08,4612.0,0.0,19.602835,14.970001,2.6,81.65501,0.5,0.866025,-0.433884,-0.900969
8,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-09,4372.0,0.0,20.163252,27.914999,0.3,78.399345,0.5,0.866025,-0.974928,-0.222521
9,1,0,NZAUK_1,2,5,3,-36.8485,174.7633,2021-01-10,2895.0,0.0,19.282001,34.365,0.3,67.62727,0.5,0.866025,-0.781831,0.62349


### Feature Selection

In [4]:
cols_to_use = [
 'Avg_Daily_Pedestrian_Count',
 'Weather_Temperature_Avg',
 'Weather_Wind_Speed_Avg',
 'Weather_Precipitation_Sum',
 'Weather_Relative_Humidity_Avg',
 'Month_Sin',
 'Month_Cos',
 'Day_Sin',
 'Day_Cos',
 'Latitude',
 'Longitude','Attraction_Category'
]

In [5]:
# Define features and target , metric='minkows', p=3
X = df.drop(columns=['Location_ID',
                     'Country',
                     'City','Holiday',
                     'Type_of_Attraction',
                     'Location_Name',
                     'Date'])
y = df['Location_ID']
display(X.head(0))
display(y.head(0))

Unnamed: 0,Attraction_Category,Latitude,Longitude,Avg_Daily_Pedestrian_Count,Weather_Temperature_Avg,Weather_Wind_Speed_Avg,Weather_Precipitation_Sum,Weather_Relative_Humidity_Avg,Month_Sin,Month_Cos,Day_Sin,Day_Cos


Series([], Name: Location_ID, dtype: object)

### Model training

In [54]:
# Split train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train KNN Classifier
knn = KNeighborsClassifier(n_neighbors=16, metric='minkowski', p=1)
knn.fit(X_train, y_train)

0,1,2
,n_neighbors,16
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,1
,metric,'minkowski'
,metric_params,
,n_jobs,


#### Evaluating Model 

In [None]:
# Evaluate
y_pred = knn.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9183826282291276

Classification Report:
               precision    recall  f1-score   support

     IRDUB_1       0.76      0.62      0.68       321
     IRDUB_3       0.94      1.00      0.97       321
     IRDUB_4       1.00      0.94      0.97       321
     IRDUB_5       0.68      0.80      0.73       321
     NZAUK_1       0.99      0.99      0.99       347
     NZAUK_2       0.98      1.00      0.99       346
     NZAUK_4       1.00      0.98      0.99       347
     NZAUK_5       0.99      0.99      0.99       347

    accuracy                           0.92      2671
   macro avg       0.92      0.92      0.91      2671
weighted avg       0.92      0.92      0.92      2671


Confusion Matrix:
 [[198   0   0 123   0   0   0   0]
 [  0 321   0   0   0   0   0   0]
 [  0  19 302   0   0   0   0   0]
 [ 62   2   0 257   0   0   0   0]
 [  0   0   0   0 345   0   0   2]
 [  0   0   0   0   0 346   0   0]
 [  0   0   0   0   0   7 340   0]
 [  0   0   0   0   2   0   1 

#### Using a test case

In [None]:
yPD,yPI = knn.kneighbors([[7764.83,14.5,43.4,2.34,62.34,0.5,0.866025,-0.974928,-0.222521,-36.8485,174.7633,3]])
Found = pd.DataFrame(columns=df.columns)

In [60]:
loc_id = "IRDUB_1" # This was a location to be displayed to user

In [58]:
yPD

array([[8097.0756107 , 8097.22167334, 8098.25098596, 8098.36653388,
        8098.40184784, 8098.49529865, 8098.54984243, 8098.6124513 ,
        8098.66026437, 8098.66569072, 8098.66929824, 8098.71823375,
        8098.71956711, 8098.75312663, 8098.77275901, 8098.78595479]])

In [59]:
yPI

array([[10201,  5606,  5999,  8506,  8596,  5590, 10069,  2763,  8137,
         8831, 10501,  6957,  3487,  1979,  7233,  1638]], dtype=int64)

In [None]:
for i in range(len(yPI[0])):
    idx = yPI[0,i]
    if df['Location_ID'].loc[idx] != loc_id:
        Found.loc[len(Found)] = df.loc[idx]
    
# Keep the row found with lowest crowd
Found = Found.sort_values(by=['Avg_Daily_Pedestrian_Count']).reset_index(drop=True)

In [68]:
display(Found.head(1))

Unnamed: 0,Country,City,Location_ID,Location_Name,Type_of_Attraction,Attraction_Category,Latitude,Longitude,Date,Avg_Daily_Pedestrian_Count,Holiday,Weather_Temperature_Avg,Weather_Wind_Speed_Avg,Weather_Precipitation_Sum,Weather_Relative_Humidity_Avg,Month_Sin,Month_Cos,Day_Sin,Day_Cos
0,1,0,NZAUK_2,0,3,0,-36.8605,174.7773,2021-09-07 00:00:00,127.0,0.0,11.291749,47.429993,7.1,83.67611,-1.0,-0.0,0.781831,0.62349


In [73]:
Found.loc[0]

Country                                            1
City                                               0
Location_ID                                  NZAUK_2
Location_Name                                      0
Type_of_Attraction                                 3
Attraction_Category                                0
Latitude                                    -36.8605
Longitude                                   174.7773
Date                             2021-09-07 00:00:00
Avg_Daily_Pedestrian_Count                     127.0
Holiday                                          0.0
Weather_Temperature_Avg                    11.291749
Weather_Wind_Speed_Avg                     47.429993
Weather_Precipitation_Sum                        7.1
Weather_Relative_Humidity_Avg               83.67611
Month_Sin                                       -1.0
Month_Cos                                       -0.0
Day_Sin                                     0.781831
Day_Cos                                      0

### Output Model as pickel

In [71]:
import pickle
import os

In [72]:
os.makedirs("knn_model", exist_ok=True) 

model_path = f"knn_model/loc_knn.pkl"
with open(model_path, "wb") as f:
        pickle.dump(knn, f)