In [4]:
import pandas as pd
import numpy as np


restaurants_ds = pd.read_csv("restaurants_df.csv")
restaurants_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4343 entries, 0 to 4342
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         4343 non-null   object
 1   Address      4343 non-null   object
 2   P-nummer     4343 non-null   int64 
 3   Startdate    4306 non-null   object
 4   Enddate      188 non-null    object
 5   code         4343 non-null   int64 
 6   active       4343 non-null   bool  
 7   postal_code  4343 non-null   int64 
dtypes: bool(1), int64(3), object(4)
memory usage: 241.9+ KB


In [5]:
population_ds = pd.read_csv("population_df.csv")
population_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   neighborhood_code       474 non-null    int64  
 1   neighborhood_name       474 non-null    object 
 2   postal_code             474 non-null    int64  
 3   postal_area             474 non-null    object 
 4   postal_code_m2          474 non-null    int64  
 5   Total                   474 non-null    int64  
 6   Men                     474 non-null    int64  
 7   Women                   474 non-null    int64  
 8   population_density_km2  471 non-null    float64
 9   restaurant_count        474 non-null    int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 37.2+ KB


In [11]:
footTraffic_ds = pd.read_csv(
    "C:/Users/Dell/OneDrive - Danmarks Tekniske Universitet/Git Hub/StayingAlive/expanded_pedestrian_traffic_interpolated.csv"
)
footTraffic_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7062 entries, 0 to 7061
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   latitude       7062 non-null   float64
 1   longitude      7062 non-null   float64
 2   aadt_fod_7_19  7062 non-null   float64
 3   hvdt_fod_7_19  7062 non-null   float64
dtypes: float64(4)
memory usage: 220.8 KB


In [16]:
mapsReviews_ds = pd.read_csv("scraping_correct/Datasets/maps_data_scraped.csv")
mapsReviews_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4552 entries, 0 to 4551
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Input Name     4552 non-null   object 
 1   Input Address  4552 non-null   object 
 2   Title          2527 non-null   object 
 3   Rating         2397 non-null   object 
 4   Reviews        2372 non-null   float64
 5   Price Level    920 non-null    object 
 6   Tags           1603 non-null   object 
dtypes: float64(1), object(6)
memory usage: 249.1+ KB


In [17]:
# mapping price level
# Mappatura basata sulla reference che abbiamo deciso
price_mapping = {
    "1-100 kr a persona": 1,
    "1-200 kr a persona": 1,
    "100-200 kr a persona": 1,
    "100-300 kr a persona": 2,
    "200-300 kr a persona": 2,
    "200-400 kr a persona": 2,
    "300-400 kr a persona": 3,
    "300-500 kr a persona": 3,
    "400-500 kr a persona": 3,
    "400-600 kr a persona": 3,
    "500-600 kr a persona": 3,
    "600-700 kr a persona": 4,
    "Più di 1000 kr a persona": 5,
    "2000-4000 kr a persona": 5,
    "1-10 â\x82¬ a persona": 1,  # 7–75 DKK ≈ low
    "30-40 â\x82¬ a persona": 2,  # 224–298 DKK ≈ mid-low
}

# Applichiamo la mappatura
mapsReviews_ds["price_level_mapped"] = mapsReviews_ds["Price Level"].map(price_mapping)

In [21]:
mapsReviews_ds["price_level_missing"] = mapsReviews_ds["price_level_mapped"].isna()
mapsReviews_ds["price_level_mapped"] = mapsReviews_ds["price_level_mapped"].fillna(2)

In [28]:
# Prima assicuriamoci che Rating sia numerico
mapsReviews_ds["Rating"] = pd.to_numeric(mapsReviews_ds["Rating"], errors="coerce")

# Crea flag se manca
mapsReviews_ds["rating_missing"] = mapsReviews_ds["Rating"].isna()

# Riempie NaN con valore medio (3.5)
mapsReviews_ds["Rating"] = mapsReviews_ds["Rating"].fillna(3.5)

In [29]:
# Assicuriamoci che Reviews sia numerico
mapsReviews_ds["Reviews"] = pd.to_numeric(mapsReviews_ds["Reviews"], errors="coerce")

# Crea flag se manca
mapsReviews_ds["reviews_missing"] = mapsReviews_ds["Reviews"].isna()

# Riempie NaN con un valore tipo 5 recensioni
mapsReviews_ds["Reviews"] = mapsReviews_ds["Reviews"].fillna(5)

In [30]:
mapsReviews_ds.to_csv("mapsReviews_ds_NoNull.csv", index=False)

# MERGE ALL DATASETS

## Restaurants + Reviews

In [31]:
def clean_text(text):
    if pd.isna(text):
        return ""
    return str(text).lower().strip().replace("  ", " ")


# Puliamo nei due dataset
restaurants_ds["Name_clean"] = restaurants_ds["Name"].apply(clean_text)
restaurants_ds["Address_clean"] = restaurants_ds["Address"].apply(clean_text)

mapsReviews_ds["Input Name_clean"] = mapsReviews_ds["Input Name"].apply(clean_text)
mapsReviews_ds["Input Address_clean"] = mapsReviews_ds["Input Address"].apply(
    clean_text
)

In [32]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
    --------------------------------------- 0.0/1.6 MB 145.2 kB/s eta 0:00:12
   - -------------------------------------- 0.1/1.6 MB 328.2 kB/s eta 0:00:05
   - -------------------------------------- 0.1/1.6 MB 328.2 kB/s eta 0:00:05
   - -------------------------------------- 0.1/1.6 MB 231.0 kB/s eta 0:00:07
   ---- ----------------------------------- 0.2/1.6 MB 517.2 kB/s eta 0:00:03
   ---- ----------------------------------- 0.2/1.6 MB 517.2 kB/s eta 0:00:03
   ------ --------------------------------- 0.3/1.6 MB 630.5 kB/s eta 0:00:03
   ------ ---------


[notice] A new release of pip is available: 23.3.1 -> 25.0.1
[notice] To update, run: C:\Users\Dell\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [33]:
from rapidfuzz import process

# Creiamo dizionario dei nomi delle review
review_names = mapsReviews_ds["Input Name_clean"].tolist()


def fuzzy_match(name):
    match = process.extractOne(
        name, review_names, score_cutoff=80
    )  # solo se >80% di similarità
    if match:
        return match[0]
    return None


# Applichiamo fuzzy matching
restaurants_ds["matched_name"] = restaurants_ds["Name_clean"].apply(fuzzy_match)

# Poi merge
merged_df = restaurants_ds.merge(
    mapsReviews_ds,
    how="left",
    left_on="matched_name",
    right_on="Input Name_clean",
    suffixes=("", "_review"),
)

In [35]:
# Supponiamo che il dataframe merged si chiami merged_df

# Quanti ristoranti hanno trovato Rating (cioè matchato?)
matched = merged_df["Rating"].notna().sum()

# Quanti ristoranti NON hanno trovato Rating?
unmatched = merged_df["Rating"].isna().sum()

# Quanti ristoranti totali?
total = len(merged_df)

# Percentuali
matched_pct = (matched / total) * 100
unmatched_pct = (unmatched / total) * 100

# Stampa il risultato
print(f"Matched: {matched} ({matched_pct:.2f}%)")
print(f"Unmatched: {unmatched} ({unmatched_pct:.2f}%)")

Matched: 4721 (100.00%)
Unmatched: 0 (0.00%)


## merged_df + population

In [38]:
assert population_ds[
    "postal_code"
].is_unique, "Errore: postal_code duplicati in population_ds"

In [39]:
merged_df = merged_df.merge(
    population_ds[
        ["postal_code", "Total", "population_density_km2", "restaurant_count"]
    ],
    how="left",
    on="postal_code",
    suffixes=("", "_population"),
)

In [40]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4721 entries, 0 to 4720
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Name                    4721 non-null   object 
 1   Address                 4721 non-null   object 
 2   P-nummer                4721 non-null   int64  
 3   Startdate               4672 non-null   object 
 4   Enddate                 211 non-null    object 
 5   code                    4721 non-null   int64  
 6   active                  4721 non-null   bool   
 7   postal_code             4721 non-null   int64  
 8   Name_clean              4721 non-null   object 
 9   Address_clean           4721 non-null   object 
 10  matched_name            4721 non-null   object 
 11  Input Name              4721 non-null   object 
 12  Input Address           4721 non-null   object 
 13  Title                   2661 non-null   object 
 14  Rating                  4721 non-null   