In [15]:
import pandas as pd
import glob
import os
import re

# Data Cleaning

In [8]:
# Step 1: read the cleaned Copenhagen population table.
# The file is decoded as ISO-8859-1, so the encoding is passed explicitly.
population_df = pd.read_csv(
    filepath_or_buffer="complete_cleaned_copenhagen_population.csv",
    encoding="ISO-8859-1",
)

In [9]:
# Show column names, dtypes and non-null counts of the population table.
population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   neighborhood_id    474 non-null    int64 
 1   neighborhood_name  474 non-null    object
 2   postal_code        474 non-null    int64 
 3   postal_area        474 non-null    object
 4   postal_code_m2     474 non-null    object
 5   Total              474 non-null    int64 
 6   Men                474 non-null    int64 
 7   Women              474 non-null    int64 
dtypes: int64(5), object(3)
memory usage: 29.8+ KB


In [12]:
# Collect every scraped company CSV. glob returns paths in arbitrary order,
# so sort them to make the merged output reproducible across runs.
file_paths = sorted(glob.glob("DatiTure/scraped_companies_*.csv"))

if not file_paths:
    # Fail early with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        "No files matching DatiTure/scraped_companies_*.csv were found"
    )

all_dfs = []

for path in file_paths:
    filename = os.path.basename(path)

    # The code is the third "_"-separated token of the file name. Strip the
    # extension first so a name with nothing after the code
    # (e.g. "scraped_companies_561110.csv") does not yield "561110.csv".
    name_parts = os.path.splitext(filename)[0].split("_")
    # Fall back to a placeholder when the token is missing or empty.
    code = name_parts[2] if len(name_parts) > 2 and name_parts[2] else "000000"

    # A file is flagged active when its name ends in "active.csv" and that word
    # is not the tail of a longer word: a plain endswith() check would also
    # flag "..._inactive.csv" as active.
    is_active = re.search(r"(?<![a-z])active\.csv$", filename.lower()) is not None

    # Load the file and tag every row with the values parsed from its name.
    df = pd.read_csv(path)
    df["code"] = code
    df["active"] = is_active

    all_dfs.append(df)

# Stack all files into a single DataFrame with a fresh 0..n-1 index.
merged_df = pd.concat(all_dfs, ignore_index=True)

# Persist the merged result.
merged_df.to_csv("merged_companies.csv", index=False)

print("✅ Unione completata. File salvato come merged_companies.csv")

✅ Unione completata. File salvato come merged_companies.csv


In [17]:
# Extract the 4-digit postal code that follows a comma in an address
def extract_postal_code(address):
    """Return the first 4-digit postal code that follows a comma in ``address``.

    The search looks for the first ", XXXX" sequence (comma, optional
    whitespace, exactly four digits), which is not necessarily right after the
    first comma of the address.

    Args:
        address: Address text such as "Nyhavn 63D, 1051 København K".

    Returns:
        The postal code as a 4-character string, or None when ``address`` is
        not a string (e.g. a missing value) or contains no such sequence.
    """
    # Missing values arrive as NaN/None; handle them explicitly rather than
    # hiding every possible error behind a bare ``except``.
    if not isinstance(address, str):
        return None
    # The negative lookahead rejects the first four digits of a longer number.
    match = re.search(r",\s*(\d{4})(?!\d)", address)
    return match.group(1) if match else None


# Derive the postal_code column by running the extractor over every Address value
merged_df["postal_code"] = merged_df["Address"].map(extract_postal_code)

In [None]:
# Disabled column drops, kept for reference.
# NOTE(review): delete these two lines if the drops are no longer needed.
# merged_df = merged_df.drop(columns=["Status"])
# merged_df = merged_df.drop(columns=["Company Type"])
# Preview the first rows of the merged company table.
merged_df.head()

Unnamed: 0,Name,Address,P-nummer,Startdate,Enddate,code,active,postal_code
0,Frankies Pizza Nørrebro,"Sortedam Dossering 3B, 2200 København N",1019849178,30.10.2014,01.05.2025,561110,True,2200
1,Søartilleriet,"Jorisvej 5, 2. 108, 2300 København S",1029736347,12.10.2023,,561110,True,2300
2,Indian Flames,"Griffenfeldsgade 13, 2200 København N",1029776292,01.11.2023,,561110,True,2200
3,Veksø cafe & Restaurant ApS,"Veksø Bygade 13, 3670 Veksø Sjælland",1029823061,08.11.2023,,561110,True,3670
4,Khun Juk ApS,"Nyhavn 63D, 1051 København K",1029838999,15.11.2023,,561110,True,1051


In [33]:
# Rename the neighbourhood identifier column, then re-inspect the schema.
population_df = population_df.rename(
    {"neighborhood_id": "neighborhood_code"}, axis="columns"
)
population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   neighborhood_code       474 non-null    int64  
 1   neighborhood_name       474 non-null    object 
 2   postal_code             474 non-null    object 
 3   postal_area             474 non-null    object 
 4   postal_code_m2          474 non-null    int32  
 5   Total                   474 non-null    int64  
 6   Men                     474 non-null    int64  
 7   Women                   474 non-null    int64  
 8   population_density_km2  471 non-null    float64
 9   restaurant_count        474 non-null    int32  
dtypes: float64(1), int32(2), int64(4), object(3)
memory usage: 33.5+ KB


In [29]:
# Swap every "-" cell in the table for 0 (applies to all columns).
population_df = population_df.replace(to_replace="-", value=0)

In [None]:
# Use string postal codes on the population side so the join key has the same
# type as the codes extracted from the company addresses.
population_df["postal_code"] = population_df["postal_code"].astype(str)
# Postal-code area as integers (one cast is enough; the repeated casts and the
# stray column named "" that used to be created here have been removed).
population_df["postal_code_m2"] = population_df["postal_code_m2"].astype(int)

# 1. Population density (inhabitants per km²).
# Areas of 0 are masked to NaN so the density is missing for those rows
# instead of inf from a division by zero.
area_km2 = population_df["postal_code_m2"] / 1_000_000
population_df["population_density_km2"] = population_df["Total"] / area_km2.where(
    area_km2 > 0
)

# 2. Restaurant count per postal code.
# Rows without an extracted postal code are skipped rather than being cast to
# the literal string "None" and counted as a group; merged_df is not modified.
company_postal_codes = merged_df["postal_code"].dropna().astype(str)
restaurant_counts = (
    company_postal_codes.groupby(company_postal_codes)
    .size()
    .reset_index(name="restaurant_count")
)

# 3. Left-join the counts onto the population table.
# Drop any restaurant_count column left by a previous run first; otherwise
# re-running this cell would yield restaurant_count_x / restaurant_count_y.
population_df = population_df.drop(columns=["restaurant_count"], errors="ignore")
population_df = population_df.merge(restaurant_counts, on="postal_code", how="left")
# Postal codes with no matching company get a count of 0.
population_df["restaurant_count"] = (
    population_df["restaurant_count"].fillna(0).astype(int)
)

# Save the enriched dataset.
population_df.to_csv("population_with_density_and_restaurants.csv", index=False)

In [31]:
# Preview the first rows of the population table.
population_df.head()

Unnamed: 0,neighborhood_id,neighborhood_name,postal_code,postal_area,postal_code_m2,Total,Men,Women,population_density_km2,restaurant_count
0,101,København,1050,København K,39053,20,10,10,512.124549,16
1,101,København,1051,København K,55643,399,191,208,7170.713297,28
2,101,København,1052,København K,6563,421,204,217,64147.493524,2
3,101,København,1053,København K,2434,404,181,223,165981.922761,2
4,101,København,1054,København K,4953,392,201,191,79143.95316,1
