In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the ORIGINAL listings data
listings = pd.read_csv('data/listings.csv.gz', compression='gzip')

print("=== LISTINGS OVERVIEW ===")
print(f"Shape: {listings.shape}")
print(f"\nColumns: {listings.shape[1]}")
print(listings.columns.tolist())
print(f"\nData types:\n{listings.dtypes}")
print(f"\nMissing values (%):\n{(listings.isnull().sum() / len(listings) * 100).round(2)}")
print(f"\nFirst row:")
listings.head(1)



=== LISTINGS OVERVIEW ===
Shape: (96871, 79)

Columns: 79
['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availabi

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,13913,https://www.airbnb.com/rooms/13913,20250914034649,2025-09-16,city scrape,Holiday London DB Room Let-on going,My bright double bedroom with a large window h...,Finsbury Park is a friendly melting pot commun...,https://a0.muscache.com/pictures/miso/Hosting-...,54730,...,4.87,4.78,4.78,,f,2,1,1,0,0.3


In [4]:
# Identify column types for targeted cleaning
print("=== COLUMN ANALYSIS ===\n")

# Numeric columns
numeric_cols = listings.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric columns ({len(numeric_cols)}):\n{numeric_cols}\n")

# Text columns
text_cols = listings.select_dtypes(include=['object']).columns.tolist()
print(f"Text columns ({len(text_cols)}):\n{text_cols}\n")

# Check sample values from text columns
print("=== TEXT COLUMN SAMPLES ===\n")
for col in text_cols[:10]:  # First 10 text columns
    sample = listings[col].dropna().head(2).tolist()
    print(f"{col}: {sample}\n")


=== COLUMN ANALYSIS ===

Numeric columns (45):
['id', 'scrape_id', 'host_id', 'host_listings_count', 'host_total_listings_count', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'availability_eoy', 'number_of_reviews_ly', 'estimated_occupancy_l365d', 'estimated_revenue_l365d', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'license', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_c

In [5]:
# Identify column types for targeted cleaning
print("=== COLUMN ANALYSIS ===\n")

# Numeric columns
numeric_cols = listings.select_dtypes(include=[np.number]).columns.tolist()
print(f" Numeric columns ({len(numeric_cols)}):")
print(numeric_cols)

# Text columns
text_cols = listings.select_dtypes(include=['object']).columns.tolist()
print(f"\n Text columns ({len(text_cols)}):")
for col in text_cols:
    sample = listings[col].dropna().iloc[0] if len(listings[col].dropna()) > 0 else "N/A"
    print(f"  {col}: {str(sample)[:60]}")

# Boolean columns
print(f"\n Sample values to detect types:")
for col in text_cols[:5]:
    sample = listings[col].dropna().head(2).tolist()
    print(f"  {col}: {sample}")


=== COLUMN ANALYSIS ===

 Numeric columns (45):
['id', 'scrape_id', 'host_id', 'host_listings_count', 'host_total_listings_count', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'availability_eoy', 'number_of_reviews_ly', 'estimated_occupancy_l365d', 'estimated_revenue_l365d', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'license', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_

In [6]:
# ===== CELL 3: COMPREHENSIVE DATA CLEANING =====

print("=== STARTING DATA CLEANING ===\n")

# 1. CLEAN PERCENTAGE COLUMNS
print("  CLEANING PERCENTAGE COLUMNS")
percentage_cols = [col for col in listings.columns if 'rate' in col.lower() or 'percent' in col.lower()]
print(f"   Found: {percentage_cols}")

for col in percentage_cols:
    if col in listings.columns and listings[col].dtype == 'object':
        listings[col] = listings[col].str.replace('%', '', regex=False).str.strip()
        listings[col] = pd.to_numeric(listings[col], errors='coerce') / 100
        print(f"   ✓ {col} → converted to decimal")

# 2. CLEAN BOOLEAN COLUMNS (t/f to True/False)
print("\n  CLEANING BOOLEAN COLUMNS")
boolean_map = {'t': True, 'f': False, 'T': True, 'F': False}

for col in text_cols:
    if col in listings.columns:
        unique_vals = listings[col].dropna().unique()
        if len(unique_vals) <= 2 and any(v in boolean_map for v in unique_vals):
            listings[col] = listings[col].map(boolean_map)
            listings[col] = listings[col].fillna(False)
            print(f"   ✓ {col} → converted to boolean")

# 3. CLEAN PRICE COLUMNS
print("\n  CLEANING PRICE COLUMNS")
price_cols = [col for col in listings.columns if 'price' in col.lower()]
for col in price_cols:
    if col in listings.columns and listings[col].dtype == 'object':
        listings[col] = listings[col].str.replace('$', '', regex=False)
        listings[col] = listings[col].str.replace(',', '', regex=False)
        listings[col] = pd.to_numeric(listings[col], errors='coerce')
        print(f"   ✓ {col} → cleaned & converted to numeric")

# 4. CLEAN NUMERIC COLUMNS - FILL MISSING
print("\n  FILLING MISSING NUMERIC VALUES")
numeric_cols = listings.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    missing_count = listings[col].isnull().sum()
    if missing_count > 0:
        listings[col].fillna(listings[col].median(), inplace=True)
        print(f"   ✓ {col}: filled {missing_count} missing values with median")

# 5. CLEAN TEXT COLUMNS - FILL MISSING
print("\n  FILLING MISSING TEXT VALUES")
text_cols = listings.select_dtypes(include=['object']).columns.tolist()
for col in text_cols:
    missing_count = listings[col].isnull().sum()
    if missing_count > 0:
        mode_val = listings[col].mode()
        if len(mode_val) > 0:
            listings[col].fillna(mode_val[0], inplace=True)
        else:
            listings[col].fillna('Unknown', inplace=True)
        print(f"   ✓ {col}: filled {missing_count} missing values")

# 6. CONVERT DATE COLUMNS
print("\n  CONVERTING DATE COLUMNS")
date_patterns = ['date', 'since', 'review', 'last']
for col in listings.columns:
    if any(pattern in col.lower() for pattern in date_patterns):
        if listings[col].dtype == 'object':
            listings[col] = pd.to_datetime(listings[col], errors='coerce')
            print(f"   ✓ {col} → converted to datetime")

# 7. DETECT & HANDLE OUTLIERS
print("\n  DETECTING OUTLIERS")
numeric_cols = listings.select_dtypes(include=[np.number]).columns.tolist()
outlier_cols = []

for col in numeric_cols:
    Q1 = listings[col].quantile(0.25)
    Q3 = listings[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = ((listings[col] < lower_bound) | (listings[col] > upper_bound)).sum()
    if outliers > 0:
        # CAP instead of remove
        listings[col] = listings[col].clip(lower=lower_bound, upper=upper_bound)
        outlier_cols.append((col, outliers, lower_bound, upper_bound))
        print(f"   ✓ {col}: capped {outliers} outliers [{lower_bound:.2f} - {upper_bound:.2f}]")

# 8. FINAL VALIDATION
print("\n  FINAL VALIDATION")
print(f"   Shape: {listings.shape}")
print(f"   Missing values: {listings.isnull().sum().sum()}")
print(f"   Duplicates: {listings.duplicated().sum()}")

print("\n DATA CLEANING COMPLETE!")


=== STARTING DATA CLEANING ===

  CLEANING PERCENTAGE COLUMNS
   Found: ['host_response_rate', 'host_acceptance_rate']
   ✓ host_response_rate → converted to decimal
   ✓ host_acceptance_rate → converted to decimal

  CLEANING BOOLEAN COLUMNS
   ✓ host_is_superhost → converted to boolean
   ✓ host_has_profile_pic → converted to boolean
   ✓ host_identity_verified → converted to boolean
   ✓ has_availability → converted to boolean
   ✓ instant_bookable → converted to boolean

  CLEANING PRICE COLUMNS
   ✓ price → cleaned & converted to numeric

  FILLING MISSING NUMERIC VALUES
   ✓ host_response_rate: filled 31707 missing values with median
   ✓ host_acceptance_rate: filled 27760 missing values with median
   ✓ host_listings_count: filled 41 missing values with median
   ✓ host_total_listings_count: filled 41 missing values with median
   ✓ neighbourhood_group_cleansed: filled 96871 missing values with median
   ✓ bathrooms: filled 34846 missing values with median
   ✓ bedrooms: filled 