In [9]:
import os
import re
import time
from io import StringIO

import numpy as np
import pandas as pd
import pycountry
import requests
import wikipedia
from fuzzywuzzy import process
from sklearn.linear_model import LinearRegression

In [11]:
def retrieve_database(url, timeout=30, table_index=0):
    """Download a web page and return one of its HTML tables as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.
    table_index : int, optional
        Position of the wanted table among those found by
        ``pandas.read_html`` (default 0, i.e. the first table).

    Returns
    -------
    pandas.DataFrame
        The selected table.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    ValueError
        Raised by ``pandas.read_html`` when the page contains no table.
    IndexError
        If ``table_index`` is beyond the number of tables found.
    """
    # A browser-like User-Agent is sent — presumably so the request is not
    # rejected as coming from an automated client.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # raises error if request fails
    # read_html is given a file-like object wrapping the page source.
    tables = pd.read_html(StringIO(response.text))
    return tables[table_index]

# Source page for the museum attendance table.
MUSEUMS_URL = 'https://en.wikipedia.org/wiki/List_of_most-visited_museums#Most-visited_museums_in_2024'
data = retrieve_database(MUSEUMS_URL)

In [12]:
# Bracketed footnote markers such as "[12]" or "[a]".
_FOOTNOTE_RE = re.compile(r"\[[^\]]*\]")

# A leading number, optionally followed by a "million"-style word.
_COUNT_RE = re.compile(
    r"""
    (\d+(?:\.\d+)?)                  # integer or decimal number
    (?!\.?\d)                        # ...taken whole, never a prefix of a longer number
    (?:\s*(mil(?:l(?:ions?)?)?))?    # optional "mil" / "mill" / "million(s)"
    (?![a-z0-9])                     # must not run straight into more text ("5.4m")
    """,
    re.VERBOSE,
)


def clean_number(val):
    """Convert a visitor-count cell into a number.

    String cells are parsed from their leading number: thousands separators
    and bracketed footnote markers are removed, and a trailing
    "mil"/"mill"/"million" word multiplies the number by one million.
    Anything after the number (e.g. a parenthesised year) is ignored.

    The value is parsed in full rather than cut to a fixed width, so counts
    of ten million or more such as "10,500,000" keep all their digits.

    Parameters
    ----------
    val : object
        Raw cell value.

    Returns
    -------
    float or None or object
        The parsed number for strings, ``None`` for strings that do not
        start with a number, and ``val`` unchanged for non-string input.
    """
    if not isinstance(val, str):
        return val

    text = _FOOTNOTE_RE.sub("", val).replace(",", "").lower().strip()
    match = _COUNT_RE.match(text)
    if match is None:
        return None

    number = float(match.group(1))
    if match.group(2):
        number = number * 1_000_000
    return number

# Parse every raw visitor-count cell with clean_number, element by element.
data['Visitors in 2024'] = data['Visitors in 2024'].map(clean_number)

In [13]:
def clean_city_name(city):
    """Return the part of a city label before its first comma, trimmed.

    Parameters
    ----------
    city : object
        Raw cell value, e.g. ``"Washington, D.C."``.

    Returns
    -------
    str or object
        The text before the first comma with surrounding whitespace removed.
        Non-string input (``None``, NaN, ...) is returned unchanged instead of
        raising, matching how ``clean_number`` treats non-strings.
    """
    if not isinstance(city, str):
        return city
    # Only the first piece is needed, so split at most once.
    return city.split(",", 1)[0].strip()

# Reduce each city label to its leading component via clean_city_name.
data['City'] = data['City'].map(clean_city_name)

In [14]:
# Hand-picked codes that take priority over any pycountry lookup —
# presumably names pycountry stores under a different spelling.
_COUNTRY_CODE_OVERRIDES = {
    "Turkey": "TR",
    "Vatican": "VA",
    "Russia": "RU",
    "Iran": "IR",
    "South Korea": "KR",
    "North Korea": "KP",
}


def country_to_iso2(name):
    """Map a country name to its ISO 3166-1 alpha-2 code.

    Resolution order:

    1. the hand-written override table (exact, case-sensitive match);
    2. ``pycountry.countries.lookup``;
    3. a case-insensitive comparison against each country's official name;
    4. the best fuzzy match among pycountry's country names, accepted only
       when its score is at least 80.

    Parameters
    ----------
    name : object
        Country name; surrounding whitespace is ignored.

    Returns
    -------
    str or None
        Two-letter code, or ``None`` when ``name`` is missing, blank, not a
        string, or cannot be matched.
    """
    # Validate first so missing/blank cells cost nothing and never reach
    # string methods.
    if not isinstance(name, str):
        return None
    name = name.strip()
    if not name:
        return None

    if name in _COUNTRY_CODE_OVERRIDES:
        return _COUNTRY_CODE_OVERRIDES[name]

    try:
        return pycountry.countries.lookup(name).alpha_2
    except LookupError:
        pass

    wanted = name.lower()
    for country in pycountry.countries:
        official = getattr(country, "official_name", None)
        if official and official.lower() == wanted:
            return country.alpha_2

    # The candidate list is only needed for the fuzzy fallback, so build it
    # here rather than on every call.
    country_names = [c.name for c in pycountry.countries]
    match, score = process.extractOne(name, country_names)
    if score >= 80:
        try:
            return pycountry.countries.lookup(match).alpha_2
        except LookupError:
            return None

    return None

# Derive the two-letter country code for every row from its country name.
data["Country Code"] = data["Country"].map(country_to_iso2)

In [15]:
# GeoNames account name sent with every request. Set the GEONAMES_USERNAME
# environment variable to use another account; the literal is the fallback.
USERNAME = os.environ.get("GEONAMES_USERNAME", "anna_v")


def get_population(city, country_code=None):
    """Look up a city's population through the GeoNames search API.

    Up to ten candidate places are requested for ``city`` and the population
    of the candidate with the largest population is returned.
    NOTE(review): the largest candidate is not necessarily the city itself
    (it may be a surrounding region) — confirm this is the intended choice.

    Parameters
    ----------
    city : str
        Free-text place name to search for.
    country_code : str, optional
        Two-letter country code used to restrict the search. Anything that is
        not a non-empty string (``None``, NaN, ...) is ignored.

    Returns
    -------
    int or None
        The population, or ``None`` when nothing was found, the population is
        missing/zero, or the request failed. Failures are printed, not raised.
    """
    base_url = "http://api.geonames.org/searchJSON"
    params = {
        "q": city,
        "maxRows": 10,
        "username": USERNAME
    }
    # NaN is truthy, so a plain truthiness test would send it as a filter.
    if isinstance(country_code, str) and country_code:
        params["country"] = country_code

    try:
        r = requests.get(base_url, params=params, timeout=5)
        r.raise_for_status()
        payload = r.json()

        # GeoNames is understood to report API-level problems (unknown user,
        # exhausted quota, ...) as a "status" object in the body; surface it
        # instead of treating it as "no result".
        status = payload.get("status")
        if status:
            message = status.get("message", status) if isinstance(status, dict) else status
            raise ValueError(f"GeoNames error: {message}")

        geonames = payload.get("geonames") or []
        if not geonames:
            return None

        # "or 0" also covers candidates whose population is present but null.
        best_match = max(geonames, key=lambda place: place.get("population") or 0)
        pop = best_match.get("population")
        return int(pop) if pop else None
    except Exception as e:
        # Deliberately best-effort: one failed lookup must not stop the run.
        print(f"Error fetching {city}, {country_code}: {e}")
        return None

# Look up each row's city population, querying GeoNames only once per
# distinct (city, country code) pair: many museums share a city, and every
# real request costs a one-second pause.
country_codes = data["Country Code"] if "Country Code" in data else [None] * len(data)

population_cache = {}
populations = []
for city, country_code in zip(data["City"], country_codes):
    # Missing codes (None/NaN) are normalised to None.
    key = (city, country_code if isinstance(country_code, str) else None)
    pop = population_cache.get(key)
    if pop is None:
        pop = get_population(*key)
        time.sleep(1)  # pause after each real API call
        # Only successes are cached, so a failed lookup is retried on the
        # next row for the same city.
        if pop is not None:
            population_cache[key] = pop
    populations.append(pop)

data["Population"] = populations

In [16]:
# Display the table, now including the "Country Code" and "Population" columns.
data

Unnamed: 0,Name,Visitors in 2024,City,Country,Country Code,Population
0,Louvre,8700000.0,Paris,France,FR,2138551
1,National Museum of China,6956800.0,Beijing,China,CN,18960744
2,Vatican Museums,6825436.0,Vatican City,Vatican,VA,921
3,British Museum,6479952.0,London,United Kingdom,GB,8961989
4,"Natural History Museum, South Kensington",6301972.0,London,United Kingdom,GB,8961989
...,...,...,...,...,...,...
81,Museum of European and Mediterranean Civilisat...,1300000.0,Marseille,France,FR,870731
82,Chinese Aviation Museum,1292278.0,Beijing,China,CN,18960744
83,Scottish National Gallery,1277230.0,Edinburgh,United Kingdom,GB,514990
84,Museo Reina Sofía,1253183.0,Madrid,Spain,ES,3255944


In [17]:
# Visitors per resident: yearly museum visitors divided by city population.
data["ratio"] = data["Visitors in 2024"].div(data["Population"])

In [18]:
# Outlier fences on the visitors-per-resident ratio: keep only rows lying
# within 1.5 * IQR of the first and third quartiles (bounds inclusive).
Q1, Q3 = data["ratio"].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose ratio is NaN fail the range test and are dropped as well.
data = data[data["ratio"].between(lower_bound, upper_bound)]

In [20]:
# Renumber the rows 0..n-1 after filtering; drop=True discards the old index
# instead of keeping it as a column.
data = data.reset_index(drop=True)

In [21]:
# Display the filtered, re-indexed table including the "ratio" column.
data

Unnamed: 0,Name,Visitors in 2024,City,Country,Country Code,Population,ratio
0,National Museum of China,6956800.0,Beijing,China,CN,18960744,0.366905
1,British Museum,6479952.0,London,United Kingdom,GB,8961989,0.723048
2,"Natural History Museum, South Kensington",6301972.0,London,United Kingdom,GB,8961989,0.703189
3,Metropolitan Museum of Art,5727258.0,New York City,United States,US,19006798,0.301327
4,American Museum of Natural History,5400000.0,New York City,United States,US,19006798,0.284109
...,...,...,...,...,...,...,...
71,Palacio de Cristal del Retiro,1318823.0,Madrid,Spain,ES,3255944,0.405051
72,Museum of European and Mediterranean Civilisat...,1300000.0,Marseille,France,FR,870731,1.492998
73,Chinese Aviation Museum,1292278.0,Beijing,China,CN,18960744,0.068155
74,Museo Reina Sofía,1253183.0,Madrid,Spain,ES,3255944,0.384891


In [None]:
# Feature matrix: a one-column DataFrame, so it stays two-dimensional.
X = data.loc[:, ['Population']]

# Target: the yearly visitor counts as a Series.
y = data.loc[:, 'Visitors in 2024']

In [None]:
# Ordinary least-squares fit of visitors on population.
model = LinearRegression()
# Kept as the last expression so the cell displays the fitted estimator.
model.fit(X, y)

In [None]:
# Report the parameters of the fitted line.
print("Coefficient (slope): {}".format(model.coef_[0]))
print("Intercept: {}".format(model.intercept_))

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# In-sample predictions and goodness-of-fit metrics for the fitted line.
y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = mse ** 0.5

for label, value in (
    ("Coefficient (slope):", model.coef_[0]),
    ("Intercept:", model.intercept_),
    ("R²:", r2),
    ("RMSE:", rmse),
):
    print(label, value)

# Figure 1: observations with the fitted line drawn over them.
fig, ax = plt.subplots()
ax.scatter(X, y, label="Actual", alpha=0.6)
ax.plot(X, y_pred, color="red", label="Predicted")
ax.set_xlabel("Population")
ax.set_ylabel("Visitors")
ax.legend()
plt.show()

# Figure 2: residuals against the predictor, with a zero reference line.
fig, ax = plt.subplots()
ax.scatter(X, y - y_pred, alpha=0.6)
ax.axhline(0, color="red", linestyle="--")
ax.set_xlabel("Population")
ax.set_ylabel("Residuals (Actual - Predicted)")
plt.show()

In [None]:
# Work on a copy restricted to rows where both regression variables are present.
df = data.dropna(subset=["Visitors in 2024", "Population"]).copy()

# Plain NumPy arrays: X is (n, 1), y is (n,).
X = df[["Population"]].to_numpy()
y = df["Visitors in 2024"].to_numpy()

lin_reg = LinearRegression().fit(X, y)

print("Simple Linear Regression")
print("Coefficient (slope): {:.6f}".format(lin_reg.coef_[0]))
print("Intercept: {:.2f}".format(lin_reg.intercept_))
print("R² score: {:.4f}".format(lin_reg.score(X, y)))

In [None]:
# Same regression on log(1 + value) of both variables, so the slope describes
# a relative rather than an absolute change.
X_log = np.log1p(df[["Population"]].to_numpy())
y_log = np.log1p(df["Visitors in 2024"].to_numpy())

log_reg = LinearRegression().fit(X_log, y_log)

print("📊 Log-Log Regression")
print("Coefficient (slope): {:.6f}".format(log_reg.coef_[0]))
print("Intercept: {:.2f}".format(log_reg.intercept_))
print("R² score: {:.4f}".format(log_reg.score(X_log, y_log)))

In [7]:
import statsmodels.api as sm

# Refit visitors on population with statsmodels to obtain the full OLS summary.
df = data.dropna(subset=["Visitors in 2024", "Population"]).copy()

X, y = df["Population"], df["Visitors in 2024"]

# Add an explicit constant column so the model includes an intercept term.
X_const = sm.add_constant(X)

model_lin = sm.OLS(endog=y, exog=X_const).fit()
print("📊 Simple Linear Regression (statsmodels)")
print(model_lin.summary())

NameError: name 'data' is not defined