In [None]:
from scraper import get_museum_data
import pandas as pd
import os
from pathlib import Path

# Load the museum table via get_museum_data from the scraper module.
museum_df = get_museum_data()

# Print a heading followed by a plain-text preview capped at five rows.
museum_preview = museum_df.to_string(max_rows=5)
print("Museum Data", museum_preview, sep="\n")

In [None]:
# Source: https://www.kaggle.com/datasets/dataanalyst001/world-population-growth-rate-by-cities-2024
# The file is very small, so a local copy is read instead of downloading it each run.
# The relative path is resolved against the current working directory.
population_csv = os.path.abspath('../data/population_data.csv')
city_df = pd.read_csv(population_csv)

# Print a heading followed by a plain-text preview capped at five rows.
city_preview = city_df.to_string(max_rows=5)
print("City Data", city_preview, sep="\n")

### Here we assume all museum data is from 2023 and will try to predict the visitors for 2024
The Wikipedia table has visitor data from both 2023 and 2024; however, I was only able to find population growth values for 2023-2024.
Therefore, to simplify model training, I treat all the museum data as being from 2023 and use the population data from
2023 and 2024.

In [None]:
# Attach the city population figures to each museum. The merge uses the default
# (inner) join, so only museums whose 'city' matches a 'City' row are kept.
selected_columns = [
    'name', 'type', 'collection_size', 'visitors',
    'city', 'Population_2024', 'Population_2023', 'Growth Rate',
]
joined_data = pd.merge(museum_df, city_df, left_on='city', right_on='City')[selected_columns]

# Fill gaps so the frame is ready for model training:
#   * collection_size -> column mean, then cast to int64
#   * type            -> first value returned by mode() (the most frequent type)
mean_items = joined_data['collection_size'].mean()
mode_type = joined_data['type'].mode()
joined_data = joined_data.assign(
    collection_size=joined_data['collection_size'].fillna(mean_items).astype('int64'),
    type=joined_data['type'].fillna(mode_type.iloc[0]),
)

print(joined_data.to_string(max_rows=5))

### Here we begin preparing the data for model training
A key assumption/simplification is that the real 2024 visitor values are just the 2023 values multiplied by (1 + the city's growth rate).
This estimated 2024 value is used to validate the model's predicted output.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Estimate 2024 visitors by scaling the recorded visitor count with the city's
# growth rate, rounded to whole visitors.
# NOTE(review): assumes 'Growth Rate' is stored as a fraction (e.g. 0.02), not a
# percentage - confirm against the population CSV.
joined_data["visitors_2024"] = (
    (joined_data["visitors"] * (1 + joined_data["Growth Rate"])).round().astype('int64')
)

# Keep an un-encoded snapshot (readable 'type' / 'city' values) for validation later.
data_copy = joined_data.copy()

# Encode each categorical column with its OWN LabelEncoder. Calling
# fit_transform twice on one shared encoder re-fits it, overwriting classes_,
# so the integer codes of the first column could no longer be mapped back to
# their labels via inverse_transform.
type_encoder = LabelEncoder()
city_encoder = LabelEncoder()
joined_data['type'] = type_encoder.fit_transform(joined_data['type'])
joined_data['city'] = city_encoder.fit_transform(joined_data['city'])

# Backward-compatible alias: `encoder` previously ended up fitted on 'city'.
encoder = city_encoder

print(joined_data.to_string(max_rows=5))

In [None]:
from sklearn.model_selection import train_test_split

# Build the target vector (Y) and the feature matrix (X).
# 'name' only identifies a museum and 'visitors_2024' is the target itself,
# so neither is used as a feature.
Y = joined_data['visitors_2024']
X = joined_data.drop(columns=['name', 'visitors_2024'])

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=2,
)

In [None]:
from xgboost import XGBRegressor
from sklearn import metrics

# Fit an XGBoost regressor, using the library's default hyperparameters,
# on the training split.
regressor = XGBRegressor()
regressor.fit(X=X_train, y=Y_train)

In [None]:
# Score the model on the same rows it was fitted on (R^2 on the training split).
training_data_prediction = regressor.predict(X_train)
r2_train = metrics.r2_score(y_true=Y_train, y_pred=training_data_prediction)
print('R Squared (Training Data) = ', r2_train)

In [None]:
# Score the model on the held-out rows (R^2 on the test split).
test_data_prediction = regressor.predict(X_test)
r2_test = metrics.r2_score(y_true=Y_test, y_pred=test_data_prediction)
print('R Squared (Test Data) = ', r2_test)

In [None]:
# Run the model over every row (training and held-out alike) and compare each
# prediction against the growth-rate estimate stored in 'visitors_2024'.
prediction = regressor.predict(X)

report_columns = [
    'name', 'city', 'Growth Rate', 'visitors',
    'visitors_2024', 'predicted_2024', 'delta',
]
data_copy = data_copy.assign(
    predicted_2024=prediction,
    # delta > 0 means the model predicted more visitors than the estimate.
    delta=lambda frame: frame['predicted_2024'] - frame['visitors_2024'],
)[report_columns]

print(data_copy.to_string())