In [None]:
import kagglehub
import pandas as pd
import os

#### 1. Start: download dataset

In [None]:
# Fetch the Kaggle dataset through kagglehub. The returned value is kept in
# `path`, which the following cell lists as a directory.
dataset_handle = "kritikseth/us-airbnb-open-data"

print("⚡︎ Downloading dataset...")
path = kagglehub.dataset_download(dataset_handle)
print(f"✓ Dataset downloaded to: {path}")

#### 2. Start: list and load CSV files

In [None]:
print("\n ⚡︎ Scanning dataset folder for CSV files...")
# Keep directory entries whose name ends in ".csv" (case-insensitive), in the
# order os.listdir reports them.
csv_files = list(filter(lambda entry: entry.lower().endswith(".csv"), os.listdir(path)))
print(f"✓ CSV files found: {csv_files}")

# Resolve each file name to its location inside the dataset folder up front.
csv_paths = {file_name: os.path.join(path, file_name) for file_name in csv_files}

# Read every CSV fully into memory, keyed by its file name.
dataframes = {}
for csv, csv_path in csv_paths.items():
    print(f"\n ⚡︎ Loading {csv} ...")
    df = pd.read_csv(csv_path, low_memory=False)
    dataframes[csv] = df
    print(f"✓ {csv} loaded → shape: {df.shape}")

#### 3. Start: check that the 2020 and 2023 datasets are present

In [None]:
print("\n ⚡︎ Checking for specific datasets (2020 & 2023)...")

# dict.get yields None for a file name that was not loaded, so each "Found"
# line below prints True or False instead of raising.
df_2020, df_2023 = (
    dataframes.get(file_name) for file_name in ("AB_US_2020.csv", "AB_US_2023.csv")
)

print(f"✓ Found 2020 dataset: {df_2020 is not None}")
print(f"✓ Found 2023 dataset: {df_2023 is not None}")

#### 4. Clean up: drop unwanted columns if they exist

In [None]:
print("\n ⚡︎ Cleaning up datasets...")

# Fail early with an actionable message: a missing CSV leaves the frame as
# None, which would otherwise surface as an AttributeError on `.shape`.
for dataset_year, frame in (("2020", df_2020), ("2023", df_2023)):
    if frame is None:
        raise ValueError(
            f"{dataset_year} dataset was not loaded (df_{dataset_year} is None); "
            "check that the expected CSV exists in the downloaded dataset."
        )

print(f". Initial 2020 dataset shape: {df_2020.shape}")
print(f". Initial 2023 dataset shape: {df_2023.shape}")

# Columns removed from both frames when present.
to_drop = ["neighbourhood_group", "number_of_reviews_ltm"]

# errors="ignore" skips labels that a frame does not have. drop() always
# returns a new frame, so the raw frames kept in `dataframes` are not touched
# by later column assignments on df_2020 / df_2023.
df_2020 = df_2020.drop(columns=to_drop, errors="ignore")
df_2023 = df_2023.drop(columns=to_drop, errors="ignore")

# Make sure the columns match (same names, same order) before merging.
assert list(df_2020.columns) == list(df_2023.columns), (
    "Columns are not the same after cleanup!"
)

print(f"✓ 2020 dataset shape after cleanup: {df_2020.shape}")
print(f"✓ 2023 dataset shape after cleanup: {df_2023.shape}")

#### 5. Add 'year' column and merge

In [None]:
# Tag every row with its source year (stored as a string), then stack the two
# frames on top of each other with a fresh 0..n-1 index.
for frame, year_label in ((df_2020, "2020"), (df_2023, "2023")):
    frame["year"] = year_label

yearly_frames = [df_2020, df_2023]
df_merged = pd.concat(yearly_frames, ignore_index=True)

print(f"✓ Merged dataset shape: {df_merged.shape}")
print(f"✓ Columns: {df_merged.columns.tolist()}")

#### 6. Map each city to its US state

In [None]:
print("\n ⚡︎ Mapping cities to states...")

# Lookup from the dataset's `city` label to a two-letter state/district code.
# Some labels are counties, metro areas or whole states rather than cities.
city_to_state = {
    "New York City": "NY",
    "Los Angeles": "CA",
    "Broward County": "FL",
    "San Diego": "CA",
    "Austin": "TX",
    "Hawaii": "HI",
    "Clark County": "NV",
    "Nashville": "TN",
    "Chicago": "IL",
    "San Francisco": "CA",
    "Washington D.C.": "DC",
    "New Orleans": "LA",
    "Seattle": "WA",
    "Twin Cities MSA": "MN",
    "Denver": "CO",
    "Portland": "OR",
    "Rhode Island": "RI",
    "Boston": "MA",
    # NOTE(review): looks like a misspelling of "Santa Clara County";
    # presumably kept because the raw data uses this spelling — confirm.
    "San Clara Country": "CA",
    "Santa Clara County": "CA",
    "San Mateo County": "CA",
    "Oakland": "CA",
    "Asheville": "NC",
    "Jersey City": "NJ",
    "Columbus": "OH",
    "Santa Cruz County": "CA",
    "Cambridge": "MA",
    "Salem": "MA",
    "Pacific Grove": "CA"
}

df_merged["state"] = df_merged["city"].map(city_to_state)

# Validate before reporting success: a city absent from the lookup maps to NaN.
missing = df_merged.loc[df_merged["state"].isna(), "city"].unique()
if len(missing):
    print(f"! Some cities are missing state mappings: {missing}")
    raise ValueError(f"Some cities are missing state mappings: {list(missing)}")

print(f"✓ 'state' column added with {df_merged['state'].nunique()} unique values.")

# Notebook-wide display option: render floats with two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

#### 7. Remove semantic duplicates (same host/listing details but different id)

In [None]:
print("\n ⚡︎ Removing semantic duplicates (same host/listing details but different id)...")

# Columns that together identify "the same listing" regardless of its id.
semantic_key = [
    "host_id", "name", "latitude", "longitude", "room_type", "price",
    "minimum_nights", "availability_365", "city", "year",
]

# Remember the shape before dropping so the removed-row count can be reported.
before = df_merged.shape
df_merged = df_merged.drop_duplicates(subset=semantic_key, keep="first")
print(f"✓ Removed {before[0] - df_merged.shape[0]} duplicates. New shape: {df_merged.shape}")

#### 8. Remove duplicate ids (keep one row per id and year)

In [None]:
print(f". Initial dataset shape before collapsing: {df_merged.shape}")

# Flag every repeat of an (id, year) pair after its first occurrence, then
# keep the unflagged rows as an independent frame.
repeated_id_year = df_merged.duplicated(subset=['id', 'year'], keep='first')
df_unique = df_merged.loc[~repeated_id_year].copy()

print(f"✓ Shape after collapsing to unique (id, year): {df_unique.shape}")

#### 9. Filter minimum_nights to a reasonable range

In [None]:
print("\n ⚡︎ Filtering minimum_nights to a reasonable range...")
print(f". Initial dataset shape before filtering: {df_unique.shape}")

# df_unique was derived from df_merged in step 8 and is the frame that gets
# recomputed and exported below. The filter therefore has to be applied to
# df_unique; filtering only df_merged would never reach the exported rows.
# .copy() gives an independent frame so later column assignment is safe.
df_unique = df_unique[
    (df_unique["minimum_nights"] > 0) & (df_unique["minimum_nights"] < 2000)
].copy()

# Keep df_merged filtered too, as this cell did before, so both frames agree.
df_merged = df_merged[(df_merged["minimum_nights"] > 0) & (df_merged["minimum_nights"] < 2000)]

print(f"✓ Dataset shape after filtering: {df_unique.shape}")

#### 10. Recompute host listing count

In [None]:
print("\n ⚡︎ Recomputing listings per host/year...")

# For each row, count the distinct listing ids sharing its (host_id, year)
# group; transform() broadcasts that count back onto every row of the group.
ids_by_host_year = df_unique.groupby(['host_id', 'year'])['id']
listings_per_host_year = ids_by_host_year.transform('nunique')
df_unique['calculated_host_listings_count'] = listings_per_host_year

print("✓ Recomputed host listing counts successfully.")

#### 11. Export to CSV

In [None]:
# Destination: out/dataset.csv, relative to the current working directory.
output_dir = "out"
output_path = os.path.join(output_dir, "dataset.csv")

# Create the folder if needed; an already existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

print(f". Exporting cleaned dataset to: {output_path}")
# index=False leaves the DataFrame index out of the file.
df_unique.to_csv(output_path, index=False)
print("✓ CSV export completed successfully.")