In [13]:
import sys
import os

In [5]:
import pandas as pd
# Load the raw listings export in a single pass (low_memory=False) and
# list the column names it provides.
df_raw = pd.read_csv(
    "data/raw/listings.csv",
    low_memory=False,
)
print(list(df_raw.columns))


['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm', 'license']


In [18]:
import pandas as pd
import numpy as np

def clean_listings(filepath, default_occupancy=0.5, reviews_at_full_occupancy=30):
    """Load a listings CSV and add rough occupancy and revenue estimates.

    Parameters
    ----------
    filepath : str or path-like
        Location of the raw listings CSV. It must contain every column
        named in ``cols`` below; any other columns are dropped.
    default_occupancy : float, default 0.5
        Occupancy rate assigned to rows whose ``reviews_per_month`` is
        missing. Must lie in ``[0, 1]``.
    reviews_at_full_occupancy : float, default 30
        Divisor applied to ``reviews_per_month``; a listing with this many
        reviews per month (or more) is given an occupancy rate of 1.0.
        Must be positive.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with ``price`` converted to float, plus two
        derived columns: ``occupancy_rate`` and
        ``estimated_monthly_revenue``.

    Raises
    ------
    ValueError
        If ``default_occupancy`` or ``reviews_at_full_occupancy`` is out of
        range.
    KeyError
        If the CSV lacks one of the required columns.

    Notes
    -----
    Both derived columns are heuristics, not measurements. Revenue is NaN
    wherever ``price`` is missing.
    """
    # Validate before touching the file so bad arguments fail fast.
    if not 0.0 <= default_occupancy <= 1.0:
        raise ValueError("default_occupancy must be between 0 and 1")
    if reviews_at_full_occupancy <= 0:
        raise ValueError("reviews_at_full_occupancy must be positive")

    raw = pd.read_csv(filepath, low_memory=False)

    # Only use columns that exist in the dataset.
    cols = [
        'id', 'name', 'host_id', 'neighbourhood', 'latitude', 'longitude',
        'room_type', 'price', 'minimum_nights', 'number_of_reviews',
        'reviews_per_month', 'availability_365'
    ]
    # Explicit copy: the column assignments below then operate on an
    # independent frame instead of on a selection of the raw frame.
    df = raw[cols].copy()

    # Clean price (remove $ and thousands separators) and make it numeric.
    df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # Estimate occupancy from review frequency, capped at 100%. Rows with
    # no reviews_per_month value fall back to the default rate.
    df['occupancy_rate'] = (
        (df['reviews_per_month'] / reviews_at_full_occupancy)
        .clip(upper=1.0)
        .fillna(default_occupancy)
    )

    # availability_365 / 12 is the average number of available days per
    # month; multiply by price and occupancy for a monthly revenue estimate.
    df['estimated_monthly_revenue'] = (
        df['price'] * (df['availability_365'] / 12) * df['occupancy_rate']
    )

    return df


In [6]:
from src.clean_listings import clean_listings

# Run the imported cleaner on the raw listings file, then preview the
# first five rows of the result.
df = clean_listings("data/raw/listings.csv")
df.head(n=5)


Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,availability_365,occupancy_rate,estimated_monthly_revenue
0,6422,Nashville Charm,12172,District 6,36.17143,-86.7357,Private room,43.0,30,669,3.4,230,0.113333,93.405556
1,39870,Close to Vanderbilt 2,171184,District 25,36.12466,-86.81269,Private room,70.0,1,564,5.29,230,0.176333,236.580556
2,59576,Large Main Suite near Lake *ladies only NS plz,812128,District 12,36.1894,-86.59162,Private room,,30,3,0.08,86,0.002667,
3,72906,Vandy/Belmont/10 mins to Broadway - Sunny 800 ...,176117,District 18,36.13122,-86.80066,Entire home/apt,85.0,2,765,4.48,215,0.149333,227.422222
4,258817,"ButterflyRoom-queen room, private bath",22296,District 12,36.16076,-86.59151,Private room,34.0,30,97,0.59,365,0.019667,20.338611


In [4]:
# Average estimated monthly revenue per neighbourhood, highest first,
# limited to the top 20 rows.
(
    df.groupby("neighbourhood")["estimated_monthly_revenue"]
    .mean()
    .sort_values(ascending=False)
    .head(20)
)


neighbourhood
District 34    5909.271861
District 23    1671.394279
District 15    1584.827315
District 35    1243.813931
District 33     821.942400
District 22     606.039032
District 17     581.759226
District 20     532.125359
District 19     532.100711
District 14     518.867339
District 25     501.447135
District 2      479.357379
District 28     462.747726
District 3      460.809384
District 5      453.853443
District 8      448.914831
District 13     425.848805
District 18     421.450190
District 21     403.406430
District 31     392.193907
Name: estimated_monthly_revenue, dtype: float64

In [2]:
import sys
import os
# Put the current working directory on the import path so the `src`
# package can be resolved (adjust if the notebook is run from elsewhere).
sys.path.append(os.path.abspath("."))

from src.clean_listings import clean_listings

# Clean the raw Nashville listings export.
df = clean_listings("data/raw/listings.csv")

# Write the cleaned frame, without the index column, to the processed-data
# folder; per the original note this is where the Streamlit app reads it.
os.makedirs(name="data/processed", exist_ok=True)
df.to_csv(path_or_buf="data/processed/nashville_cleaned.csv", index=False)

print("✅ Cleaned file saved to data/processed/nashville_cleaned.csv")


✅ Cleaned file saved to data/processed/nashville_cleaned.csv
