In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# For missing value imputation
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# For preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# For outlier detection
from scipy import stats
from scipy.stats import boxcox

# For text processing
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# For train-test split
from sklearn.model_selection import train_test_split

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

print("‚úì All libraries imported successfully")

‚úì All libraries imported successfully


In [2]:
# Load all datasets
print("Loading datasets...")

# Load listings_details
listings_df = pd.read_csv('listings_details.csv')
print(f"‚úì Listings loaded: {listings_df.shape}")

# Load calendar
calendar_df = pd.read_csv('calendar.csv')
print(f"‚úì Calendar loaded: {calendar_df.shape}")

# Load reviews
reviews_df = pd.read_csv('reviews.csv')
print(f"‚úì Reviews loaded: {reviews_df.shape}")

# Load neighbourhoods
neighbourhoods_df = pd.read_csv('neighbourhoods.csv')
print(f"‚úì Neighbourhoods loaded: {neighbourhoods_df.shape}")

print("\n" + "="*80)
print("DATASET OVERVIEW")
print("="*80)

Loading datasets...
‚úì Listings loaded: (20030, 96)
‚úì Calendar loaded: (7310950, 4)
‚úì Reviews loaded: (431830, 2)
‚úì Neighbourhoods loaded: (22, 2)

DATASET OVERVIEW


In [3]:
# Explore listings dataset
print("\nüìä LISTINGS DATASET INFO:")
print(f"Shape: {listings_df.shape}")
print(f"\nFirst few rows:")
listings_df.head()


üìä LISTINGS DATASET INFO:
Shape: (20030, 96)

First few rows:


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2818,https://www.airbnb.com/rooms/2818,20181206172549,2018-12-06,Quiet Garden View Room & Super Fast WiFi,Quiet Garden View Room & Super Fast WiFi,I'm renting a bedroom (room overlooking the ga...,Quiet Garden View Room & Super Fast WiFi I'm r...,none,"Indische Buurt (""Indies Neighborhood"") is a ne...",From week 38 to week 47 maintenance work to th...,The neighbourhood is well served by 24 hours p...,,,Please: - Leave your shoes in the entrance - ...,,,https://a0.muscache.com/im/pictures/10272854/8...,,3159,https://www.airbnb.com/users/show/3159,Daniel,2008-09-24,"Amsterdam, Noord-Holland, The Netherlands","Upon arriving in Amsterdam, one can imagine as...",within an hour,100%,,t,https://a0.muscache.com/im/users/3159/profile_...,https://a0.muscache.com/im/users/3159/profile_...,Indische Buurt,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,"Amsterdam, North Holland, Netherlands",Indische Buurt,Oostelijk Havengebied - Indische Buurt,,Amsterdam,North Holland,,Amsterdam,"Amsterdam, Netherlands",NL,Netherlands,52.365755,4.941419,f,Apartment,Private room,2,1.5,1.0,2.0,Real Bed,"{Internet,Wifi,""Paid parking off premises"",""Bu...",,$59.00,,"$1,500.00",$100.00,$50.00,1,$20.00,3,15,today,t,17,44,44,44,2018-12-06,248,2009-03-30,2018-11-28,97.0,10.0,10.0,10.0,10.0,9.0,10.0,f,,{Amsterdam},t,f,strict_14_with_grace_period,f,f,1,2.1
1,3209,https://www.airbnb.com/rooms/3209,20181206172549,2018-12-06,"Quiet apt near center, great view",You will love our spacious (90 m2) bright apar...,"Our apartment has lots of light, a balcony and...",You will love our spacious (90 m2) bright apar...,none,Welcome to the Spaarndammerbuurt! From the beg...,,"From Central Station, walk towards the busstop...",You will have the entire house to yourself.,We will meet you in person for check in whenev...,"Our house comes with our very sweet, but old (...",,,https://a0.muscache.com/im/pictures/88955424/4...,,3806,https://www.airbnb.com/users/show/3806,Maartje,2008-10-24,"Amsterdam, Noord-Holland, The Netherlands",I am a freelance radio producer and journalist...,within an hour,100%,,f,https://a0.muscache.com/im/users/3806/profile_...,https://a0.muscache.com/im/users/3806/profile_...,Spaarndammer en Zeeheldenbuurt,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,"Amsterdam, Noord-Holland, Netherlands",Spaarndammer en Zeeheldenbuurt,Westerpark,,Amsterdam,Noord-Holland,1013 XE,Amsterdam,"Amsterdam, Netherlands",NL,Netherlands,52.390225,4.873924,t,Apartment,Entire home/apt,5,1.0,2.0,2.0,Real Bed,"{Internet,Wifi,Kitchen,""Paid parking off premi...",,$160.00,$543.00,"$2,000.00",$300.00,$40.00,2,$15.00,4,20,7 weeks ago,t,0,0,0,47,2018-12-06,42,2015-07-31,2018-08-29,96.0,10.0,9.0,10.0,10.0,9.0,9.0,f,,{Amsterdam},f,f,moderate,f,f,1,1.03
2,20168,https://www.airbnb.com/rooms/20168,20181206172549,2018-12-06,100%Centre-Studio 1 Private Floor/Bathroom,"Cozy studio on your own private floor, 100% in...",For those who like all facets of city life. In...,"Cozy studio on your own private floor, 100% in...",none,Located just in between famous central canals....,Check-in time from 2pm till 10pm Checkout anyt...,No need to use any transport! All is within a ...,,"No curfew, free entrance 27/7 with your own ke...",This studio/room takes entire floor and has it...,,,https://a0.muscache.com/im/pictures/69979664/3...,,59484,https://www.airbnb.com/users/show/59484,Alex,2009-12-02,"Amsterdam, Noord-Holland, The Netherlands",Secondary phone nr. + (Phone number hidden by ...,within a few hours,100%,,f,https://a0.muscache.com/im/pictures/user/579c8...,https://a0.muscache.com/im/pictures/user/579c8...,Grachtengordel,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,f,"Amsterdam, North Holland, Netherlands",Grachtengordel,Centrum-Oost,,Amsterdam,North Holland,1017,Amsterdam,"Amsterdam, Netherlands",NL,Netherlands,52.365087,4.893541,t,Townhouse,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,""Paid parking off premises"",...",,$80.00,,,,,2,$0.00,1,1000,today,t,0,7,24,198,2018-12-06,233,2010-03-02,2018-11-30,87.0,9.0,10.0,9.0,9.0,10.0,9.0,f,,{Amsterdam},f,f,strict_14_with_grace_period,f,f,2,2.18
3,25428,https://www.airbnb.com/rooms/25428,20181206172549,2018-12-06,Lovely apt in City Centre (Jordaan),,"This nicely furnished, newly renovated apt is...","This nicely furnished, newly renovated apt is...",none,,,,The apartment is about 75 meters or 800 square...,,"The building is a quiet building, so please do...",,,https://a0.muscache.com/im/pictures/138431/707...,,56142,https://www.airbnb.com/users/show/56142,Joan,2009-11-20,"New York, New York, United States","We are a retired couple who live in NYC, and h...",within a few hours,100%,,f,https://a0.muscache.com/im/users/56142/profile...,https://a0.muscache.com/im/users/56142/profile...,Grachtengordel,2.0,2.0,"['email', 'phone', 'reviews']",t,f,"Amsterdam, North Holland, Netherlands",Grachtengordel,Centrum-West,,Amsterdam,North Holland,1016,Amsterdam,"Amsterdam, Netherlands",NL,Netherlands,52.373114,4.883668,f,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",,$125.00,$650.00,"$2,000.00",$300.00,$40.00,2,$10.00,14,60,2 days ago,t,2,32,44,141,2018-12-06,1,2018-01-21,2018-01-21,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,{Amsterdam},f,f,strict_14_with_grace_period,f,f,2,0.09
4,27886,https://www.airbnb.com/rooms/27886,20181206172549,2018-12-06,"Romantic, stylish B&B houseboat in canal district",Stylish and romantic houseboat on fantastic hi...,For a romantic couple: A beautifully restored ...,Stylish and romantic houseboat on fantastic hi...,none,"Central, quiet, safe, clean and beautiful.","we have a canadian canoe for you as well, free...","cental station aprox. 10 minutes on foot, buss...","Your own apartment, nothing shared","As much as they want, and is possible. I speak...","All the facilities are included ( cleaning , ...",,,https://a0.muscache.com/im/pictures/02c2da9d-6...,,97647,https://www.airbnb.com/users/show/97647,Flip,2010-03-23,"Amsterdam, Noord-Holland, The Netherlands","Marjan works in ""eye"" the dutch filmmuseum, an...",within an hour,100%,,t,https://a0.muscache.com/im/users/97647/profile...,https://a0.muscache.com/im/users/97647/profile...,Westelijke Eilanden,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",t,t,"Amsterdam, North Holland, Netherlands",Westelijke Eilanden,Centrum-West,,Amsterdam,North Holland,1013,Amsterdam,"Amsterdam, Netherlands",NL,Netherlands,52.386727,4.892078,t,Houseboat,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",,$150.00,$810.00,"$2,500.00",$0.00,$0.00,1,$0.00,2,730,today,t,16,37,54,199,2018-12-06,171,2012-01-09,2018-11-25,99.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,{Amsterdam},t,f,strict_14_with_grace_period,f,f,1,2.03


In [4]:
# Check data types and missing values
print("\nüìã COLUMN INFO:")
listings_df.info()



üìã COLUMN INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20030 entries, 0 to 20029
Data columns (total 96 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                20030 non-null  int64  
 1   listing_url                       20030 non-null  object 
 2   scrape_id                         20030 non-null  int64  
 3   last_scraped                      20030 non-null  object 
 4   name                              19992 non-null  object 
 5   summary                           19510 non-null  object 
 6   space                             14579 non-null  object 
 7   description                       19906 non-null  object 
 8   experiences_offered               20030 non-null  object 
 9   neighborhood_overview             13257 non-null  object 
 10  notes                             9031 non-null   object 
 11  transit                           13635 non-null

In [5]:
# Detailed missing value analysis
print("\nüîç MISSING VALUES ANALYSIS:")
missing_data = pd.DataFrame({
    'Column': listings_df.columns,
    'Missing_Count': listings_df.isnull().sum(),
    'Missing_Percentage': (listings_df.isnull().sum() / len(listings_df) * 100).round(2)
})
missing_data = missing_data[missing_data['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)
print(missing_data.to_string(index=False))


üîç MISSING VALUES ANALYSIS:
                      Column  Missing_Count  Missing_Percentage
              xl_picture_url          20030              100.00
                  medium_url          20030              100.00
               thumbnail_url          20030              100.00
        host_acceptance_rate          20030              100.00
neighbourhood_group_cleansed          20030              100.00
                     license          20021               99.96
                 square_feet          19624               97.97
               monthly_price          18469               92.21
                weekly_price          17187               85.81
                       notes          10999               54.91
          host_response_time           9483               47.34
          host_response_rate           9483               47.34
                  host_about           8227               41.07
                 interaction           8058               40.23
         

In [6]:
# Explore calendar dataset
print("\nüìÖ CALENDAR DATASET:")
print(f"Shape: {calendar_df.shape}")
print(f"\nSample data:")
print(calendar_df.head())
print(f"\nData types:")
print(calendar_df.dtypes)
print(f"\nMissing values:")
print(calendar_df.isnull().sum())



üìÖ CALENDAR DATASET:
Shape: (7310950, 4)

Sample data:
   listing_id        date available price
0        2818  2019-12-05         f   NaN
1       73208  2019-08-30         f   NaN
2       73208  2019-08-29         f   NaN
3       73208  2019-08-28         f   NaN
4       73208  2019-08-27         f   NaN

Data types:
listing_id     int64
date          object
available     object
price         object
dtype: object

Missing values:
listing_id          0
date                0
available           0
price         6110879
dtype: int64


In [7]:
# Explore reviews dataset
print("\nüí¨ REVIEWS DATASET:")
print(f"Shape: {reviews_df.shape}")
print(f"\nSample data:")
print(reviews_df.head())
print(f"\nData types:")
print(reviews_df.dtypes)
print(f"\nMissing values:")
print(reviews_df.isnull().sum())



üí¨ REVIEWS DATASET:
Shape: (431830, 2)

Sample data:
   listing_id        date
0        2818  2009-03-30
1        2818  2009-04-24
2        2818  2009-05-03
3        2818  2009-05-18
4        2818  2009-05-25

Data types:
listing_id     int64
date          object
dtype: object

Missing values:
listing_id    0
date          0
dtype: int64


In [None]:
# Step 1: Data Cleaning - Create Working Copy and Initial Cleanup


In [8]:
# Create working copies
df = listings_df.copy()

print("="*80)
print("STEP 1: INITIAL DATA CLEANING")
print("="*80)

# 1.1 Check for duplicate records
print(f"\n1. Duplicate Records:")
print(f"   Duplicate rows in listings: {df.duplicated().sum()}")
print(f"   Duplicate IDs in listings: {df['id'].duplicated().sum()}")

# Remove duplicates based on id
df = df.drop_duplicates(subset=['id'], keep='first')
print(f"   ‚úì After removing duplicates: {df.shape}")


STEP 1: INITIAL DATA CLEANING

1. Duplicate Records:
   Duplicate rows in listings: 0
   Duplicate IDs in listings: 0
   ‚úì After removing duplicates: (20030, 96)


In [9]:
# 1.2 Drop columns that are not useful for prediction
print(f"\n2. Dropping Irrelevant Columns:")

# Columns to drop: URLs, IDs that won't help prediction, redundant text fields
cols_to_drop = [
    'listing_url', 'scrape_id', 'last_scraped', 'thumbnail_url', 'medium_url',
    'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_thumbnail_url',
    'host_picture_url', 'license', 'jurisdiction_names', 'calendar_last_scraped',
    'experiences_offered',  # Almost always 'none'
    'neighbourhood_group_cleansed',  # Empty for Amsterdam
    'medium_url', 'xl_picture_url'  # Image URLs
]

# Also drop columns with 100% missing values
missing_100_cols = df.columns[df.isnull().mean() == 1.0].tolist()
cols_to_drop.extend(missing_100_cols)

# Drop columns that exist
cols_dropped = [c for c in cols_to_drop if c in df.columns]
df = df.drop(columns=cols_dropped)

print(f"   Dropped {len(cols_dropped)} columns")
print(f"   ‚úì Shape after dropping columns: {df.shape}")



2. Dropping Irrelevant Columns:
   Dropped 23 columns
   ‚úì Shape after dropping columns: (20030, 79)


In [10]:
# 1.3 Type Conversion - Fix data types
print(f"\n3. Type Conversion:")

# Function to clean price columns (remove $ and ,)
def clean_price(price_str):
    if pd.isna(price_str):
        return np.nan
    if isinstance(price_str, (int, float)):
        return float(price_str)
    # Remove $ and , and convert to float
    return float(str(price_str).replace('$', '').replace(',', ''))

# Price columns to clean
price_cols = ['price', 'weekly_price', 'monthly_price', 'security_deposit', 
              'cleaning_fee', 'extra_people']

for col in price_cols:
    if col in df.columns:
        df[col] = df[col].apply(clean_price)
        print(f"   ‚úì Converted {col} to numeric")

# Convert percentage columns
percentage_cols = ['host_response_rate', 'host_acceptance_rate']
for col in percentage_cols:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: float(str(x).replace('%', '')) / 100 if pd.notna(x) else np.nan)
        print(f"   ‚úì Converted {col} to decimal")

# Convert boolean columns (t/f to True/False)
bool_cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
             'is_location_exact', 'has_availability', 'instant_bookable', 
             'is_business_travel_ready', 'require_guest_profile_picture',
             'require_guest_phone_verification', 'requires_license']

for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].map({'t': True, 'f': False, True: True, False: False})
        print(f"   ‚úì Converted {col} to boolean")

# Convert numeric columns that are stored as object
numeric_cols = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included',
                'minimum_nights', 'maximum_nights', 'availability_30', 'availability_60',
                'availability_90', 'availability_365', 'number_of_reviews',
                'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                'review_scores_value', 'calculated_host_listings_count', 'reviews_per_month',
                'host_listings_count', 'host_total_listings_count', 'square_feet',
                'latitude', 'longitude']

for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

print(f"   ‚úì Converted numeric columns")

# Convert date columns
date_cols = ['host_since', 'first_review', 'last_review']
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        print(f"   ‚úì Converted {col} to datetime")



3. Type Conversion:
   ‚úì Converted price to numeric
   ‚úì Converted weekly_price to numeric
   ‚úì Converted monthly_price to numeric
   ‚úì Converted security_deposit to numeric
   ‚úì Converted cleaning_fee to numeric
   ‚úì Converted extra_people to numeric
   ‚úì Converted host_response_rate to decimal
   ‚úì Converted host_is_superhost to boolean
   ‚úì Converted host_has_profile_pic to boolean
   ‚úì Converted host_identity_verified to boolean
   ‚úì Converted is_location_exact to boolean
   ‚úì Converted has_availability to boolean
   ‚úì Converted instant_bookable to boolean
   ‚úì Converted is_business_travel_ready to boolean
   ‚úì Converted require_guest_profile_picture to boolean
   ‚úì Converted require_guest_phone_verification to boolean
   ‚úì Converted requires_license to boolean
   ‚úì Converted numeric columns
   ‚úì Converted host_since to datetime
   ‚úì Converted first_review to datetime
   ‚úì Converted last_review to datetime


In [11]:
# 1.4 Logic Error Detection
print(f"\n4. Logic Error Detection:")

# Check for negative prices
if 'price' in df.columns:
    negative_prices = (df['price'] < 0).sum()
    print(f"   Negative prices: {negative_prices}")
    if negative_prices > 0:
        df = df[df['price'] >= 0]
        print(f"   ‚úì Removed {negative_prices} listings with negative prices")

# Check for zero prices (likely errors or missing data)
zero_prices = (df['price'] == 0).sum()
print(f"   Zero prices: {zero_prices}")

# Check for unreasonable values
if 'accommodates' in df.columns:
    unreasonable_accommodates = (df['accommodates'] > 50).sum()
    print(f"   Accommodates > 50: {unreasonable_accommodates}")

# Check minimum_nights > maximum_nights
if 'minimum_nights' in df.columns and 'maximum_nights' in df.columns:
    logic_error = (df['minimum_nights'] > df['maximum_nights']).sum()
    print(f"   Min nights > Max nights: {logic_error}")
    if logic_error > 0:
        # Fix by swapping or removing
        mask = df['minimum_nights'] > df['maximum_nights']
        df.loc[mask, ['minimum_nights', 'maximum_nights']] = np.nan
        print(f"   ‚úì Fixed {logic_error} logic errors in nights")

# Check for future dates in last_review
if 'last_review' in df.columns:
    future_reviews = (df['last_review'] > pd.Timestamp.now()).sum()
    print(f"   Future review dates: {future_reviews}")

print(f"\n‚úì Data cleaning step 1 completed. Shape: {df.shape}")



4. Logic Error Detection:
   Negative prices: 0
   Zero prices: 2
   Accommodates > 50: 0
   Min nights > Max nights: 0
   Future review dates: 0

‚úì Data cleaning step 1 completed. Shape: (20030, 79)


In [None]:
# Step 2: Missing Value Analysis and Treatment


In [12]:
print("="*80)
print("STEP 2: MISSING VALUE TREATMENT")
print("="*80)

# Analyze missing values
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df) * 100).round(2),
    'Dtype': df.dtypes
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)

print("\nüîç Missing Values Summary:")
print(missing_df.to_string(index=False))

# Strategy:
# 1. > 70% missing: Drop column
# 2. 30-70% missing: Careful imputation or feature engineering
# 3. < 30% missing: Impute with median/mode or KNN


STEP 2: MISSING VALUE TREATMENT

üîç Missing Values Summary:
                     Column  Missing_Count  Missing_Percentage          Dtype
                square_feet          19624               97.97        float64
              monthly_price          18469               92.21        float64
               weekly_price          17187               85.81        float64
                      notes          10999               54.91         object
         host_response_time           9483               47.34         object
         host_response_rate           9483               47.34        float64
                 host_about           8227               41.07         object
                interaction           8058               40.23         object
                     access           7803               38.96         object
                house_rules           7459               37.24         object
      neighborhood_overview           6773               33.81         object
  

In [13]:
# 2.1 Drop columns with >70% missing values
print("\n1. Dropping columns with >70% missing values:")
cols_to_drop_missing = missing_df[missing_df['Missing_Percentage'] > 70]['Column'].tolist()
if cols_to_drop_missing:
    print(f"   Columns to drop: {cols_to_drop_missing}")
    df = df.drop(columns=cols_to_drop_missing)
    print(f"   ‚úì Dropped {len(cols_to_drop_missing)} columns")
else:
    print("   No columns with >70% missing values")

print(f"   Shape: {df.shape}")



1. Dropping columns with >70% missing values:
   Columns to drop: ['square_feet', 'monthly_price', 'weekly_price']
   ‚úì Dropped 3 columns
   Shape: (20030, 76)


In [14]:
# 2.2 Handle specific columns with moderate missing values (30-70%)
print("\n2. Handling columns with 30-70% missing values:")

# weekly_price and monthly_price: These can be derived from price
if 'weekly_price' in df.columns:
    print(f"   weekly_price missing: {df['weekly_price'].isnull().sum()} ({df['weekly_price'].isnull().mean()*100:.1f}%)")
    # Fill with price * 7 * 0.9 (typical weekly discount)
    df['weekly_price'] = df['weekly_price'].fillna(df['price'] * 7 * 0.9)
    print("   ‚úì Filled weekly_price with estimated value")

if 'monthly_price' in df.columns:
    print(f"   monthly_price missing: {df['monthly_price'].isnull().sum()} ({df['monthly_price'].isnull().mean()*100:.1f}%)")
    # Fill with price * 30 * 0.8 (typical monthly discount)
    df['monthly_price'] = df['monthly_price'].fillna(df['price'] * 30 * 0.8)
    print("   ‚úì Filled monthly_price with estimated value")

# security_deposit: Fill with 0 if missing (means no deposit required)
if 'security_deposit' in df.columns:
    print(f"   security_deposit missing: {df['security_deposit'].isnull().sum()} ({df['security_deposit'].isnull().mean()*100:.1f}%)")
    df['security_deposit'] = df['security_deposit'].fillna(0)
    print("   ‚úì Filled security_deposit with 0")

# cleaning_fee: Fill with median
if 'cleaning_fee' in df.columns:
    print(f"   cleaning_fee missing: {df['cleaning_fee'].isnull().sum()} ({df['cleaning_fee'].isnull().mean()*100:.1f}%)")
    median_cleaning_fee = df['cleaning_fee'].median()
    df['cleaning_fee'] = df['cleaning_fee'].fillna(median_cleaning_fee)
    print(f"   ‚úì Filled cleaning_fee with median: ${median_cleaning_fee:.2f}")

# host_neighbourhood: Fill with neighbourhood_cleansed
if 'host_neighbourhood' in df.columns and 'neighbourhood_cleansed' in df.columns:
    print(f"   host_neighbourhood missing: {df['host_neighbourhood'].isnull().sum()} ({df['host_neighbourhood'].isnull().mean()*100:.1f}%)")
    df['host_neighbourhood'] = df['host_neighbourhood'].fillna(df['neighbourhood_cleansed'])
    print("   ‚úì Filled host_neighbourhood with neighbourhood_cleansed")



2. Handling columns with 30-70% missing values:
   security_deposit missing: 6166 (30.8%)
   ‚úì Filled security_deposit with 0
   cleaning_fee missing: 3629 (18.1%)
   ‚úì Filled cleaning_fee with median: $35.00
   host_neighbourhood missing: 5808 (29.0%)
   ‚úì Filled host_neighbourhood with neighbourhood_cleansed


In [15]:
# 2.3 Impute numeric columns with <30% missing using different methods
print("\n3. Imputing numeric columns with <30% missing:")

# Identify numeric columns with missing values < 30%
numeric_cols_with_missing = []
for col in df.select_dtypes(include=[np.number]).columns:
    missing_pct = df[col].isnull().mean() * 100
    if 0 < missing_pct < 30:
        numeric_cols_with_missing.append(col)

print(f"   Numeric columns to impute: {len(numeric_cols_with_missing)}")

# Simple imputation with median for most numeric columns
if numeric_cols_with_missing:
    for col in numeric_cols_with_missing:
        if col in ['bathrooms', 'bedrooms', 'beds']:
            # For room counts, use median
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            print(f"   ‚úì {col}: filled with median = {median_val}")
        elif col in ['reviews_per_month', 'review_scores_rating', 'review_scores_accuracy',
                     'review_scores_cleanliness', 'review_scores_checkin', 
                     'review_scores_communication', 'review_scores_location', 'review_scores_value']:
            # For review-related: fill with 0 (no reviews yet)
            df[col] = df[col].fillna(0)
            print(f"   ‚úì {col}: filled with 0 (no reviews)")
        else:
            # Other numeric: use median
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            print(f"   ‚úì {col}: filled with median = {median_val:.2f}")



3. Imputing numeric columns with <30% missing:
   Numeric columns to impute: 13
   ‚úì host_listings_count: filled with median = 1.00
   ‚úì host_total_listings_count: filled with median = 1.00
   ‚úì bathrooms: filled with median = 1.0
   ‚úì bedrooms: filled with median = 1.0
   ‚úì beds: filled with median = 1.0
   ‚úì review_scores_rating: filled with 0 (no reviews)
   ‚úì review_scores_accuracy: filled with 0 (no reviews)
   ‚úì review_scores_cleanliness: filled with 0 (no reviews)
   ‚úì review_scores_checkin: filled with 0 (no reviews)
   ‚úì review_scores_communication: filled with 0 (no reviews)
   ‚úì review_scores_location: filled with 0 (no reviews)
   ‚úì review_scores_value: filled with 0 (no reviews)
   ‚úì reviews_per_month: filled with 0 (no reviews)


In [16]:
# 2.4 Handle categorical missing values
print("\n4. Imputing categorical columns:")

# Get categorical columns with missing values
categorical_cols_with_missing = []
for col in df.select_dtypes(include=['object', 'category']).columns:
    if df[col].isnull().sum() > 0:
        categorical_cols_with_missing.append(col)

print(f"   Categorical columns with missing: {len(categorical_cols_with_missing)}")

for col in categorical_cols_with_missing:
    missing_count = df[col].isnull().sum()
    missing_pct = (missing_count / len(df)) * 100
    
    if missing_pct > 30:
        # Drop or fill with 'Unknown'
        df[col] = df[col].fillna('Unknown')
        print(f"   ‚úì {col}: filled {missing_count} with 'Unknown' ({missing_pct:.1f}%)")
    else:
        # Fill with mode (most frequent value)
        mode_val = df[col].mode()[0] if len(df[col].mode()) > 0 else 'Unknown'
        df[col] = df[col].fillna(mode_val)
        print(f"   ‚úì {col}: filled {missing_count} with mode = '{mode_val}' ({missing_pct:.1f}%)")

# Handle boolean columns
for col in df.select_dtypes(include=['bool']).columns:
    if df[col].isnull().sum() > 0:
        # Fill with False (conservative assumption)
        df[col] = df[col].fillna(False)
        print(f"   ‚úì {col}: filled with False")

# Handle datetime columns
for col in df.select_dtypes(include=['datetime64']).columns:
    if df[col].isnull().sum() > 0:
        # Fill with median date or drop rows
        print(f"   ‚Ñπ {col}: has {df[col].isnull().sum()} missing dates (keeping as NaT for now)")



4. Imputing categorical columns:
   Categorical columns with missing: 23
   ‚úì name: filled 38 with mode = 'Amsterdam' (0.2%)
   ‚úì summary: filled 520 with mode = '**brand new** Sleeping up to 2 guests in luxury surroundings, this fantastic holiday home makes the perfect choice for your stay in Amsterdam!' (2.6%)
   ‚úì space: filled 5451 with mode = 'This gorgeous apartment is excellently designed with your comfort in mind. A warm inviting living space features chic wooden flooring throughout and deliciously contemporary furniture.  An elegant dining table and chairs enable you to make mealtimes comfortable and it‚Äôs of course the perfect time to plan tomorrow‚Äôs adventures in town together. Prepare favourite recipes from home or why not try a little Dutch cooking with local spices from the nearby convenience stores and supermarkets. The fantastic fitted kitchen is packed with all the equipment you need to create something delicious. The pretty bedroom is stylishly presented and

In [17]:
# 2.5 Final check for remaining missing values
print("\n5. Final missing value check:")
remaining_missing = df.isnull().sum().sum()
print(f"   Total missing values remaining: {remaining_missing}")

if remaining_missing > 0:
    print("\n   Columns still with missing values:")
    for col in df.columns:
        missing = df[col].isnull().sum()
        if missing > 0:
            print(f"      {col}: {missing} ({df[col].isnull().mean()*100:.2f}%)")

print(f"\n‚úì Missing value treatment completed. Shape: {df.shape}")



5. Final missing value check:
   Total missing values remaining: 14299

   Columns still with missing values:
      host_since: 4 (0.02%)
      host_response_rate: 9483 (47.34%)
      first_review: 2406 (12.01%)
      last_review: 2406 (12.01%)

‚úì Missing value treatment completed. Shape: (20030, 76)


In [None]:
# Step 3: Feature Engineering - Date/Time Features


In [18]:
print("="*80)
print("STEP 3: DATE/TIME FEATURE ENGINEERING")
print("="*80)

# Reference date for calculations
reference_date = pd.Timestamp('2018-12-06')  # Based on the scrape date

print(f"\nReference date: {reference_date}")

# 3.1 Host tenure (days since host joined)
if 'host_since' in df.columns:
    df['host_tenure_days'] = (reference_date - df['host_since']).dt.days
    df['host_tenure_years'] = df['host_tenure_days'] / 365.25
    print(f"‚úì Created host_tenure features")

# 3.2 Days since first review
if 'first_review' in df.columns:
    df['days_since_first_review'] = (reference_date - df['first_review']).dt.days
    df['days_since_first_review'] = df['days_since_first_review'].fillna(0)  # No reviews = 0
    print(f"‚úì Created days_since_first_review")

# 3.3 Days since last review (recency)
if 'last_review' in df.columns:
    df['days_since_last_review'] = (reference_date - df['last_review']).dt.days
    df['days_since_last_review'] = df['days_since_last_review'].fillna(9999)  # Large number for no reviews
    print(f"‚úì Created days_since_last_review")

# 3.4 Review period (days between first and last review)
if 'first_review' in df.columns and 'last_review' in df.columns:
    df['review_period_days'] = (df['last_review'] - df['first_review']).dt.days
    df['review_period_days'] = df['review_period_days'].fillna(0)
    print(f"‚úì Created review_period_days")

# 3.5 Extract components from host_since
if 'host_since' in df.columns:
    df['host_since_year'] = df['host_since'].dt.year
    df['host_since_month'] = df['host_since'].dt.month
    df['host_since_dayofweek'] = df['host_since'].dt.dayofweek
    print(f"‚úì Extracted date components from host_since")

# 3.6 Cyclical encoding for month (sine/cosine)
if 'host_since_month' in df.columns:
    df['host_since_month_sin'] = np.sin(2 * np.pi * df['host_since_month'] / 12)
    df['host_since_month_cos'] = np.cos(2 * np.pi * df['host_since_month'] / 12)
    print(f"‚úì Created cyclical encoding for host_since_month")

print(f"\n‚úì Date/time feature engineering completed. Shape: {df.shape}")


STEP 3: DATE/TIME FEATURE ENGINEERING

Reference date: 2018-12-06 00:00:00
‚úì Created host_tenure features
‚úì Created days_since_first_review
‚úì Created days_since_last_review
‚úì Created review_period_days
‚úì Extracted date components from host_since
‚úì Created cyclical encoding for host_since_month

‚úì Date/time feature engineering completed. Shape: (20030, 86)


In [None]:
# Step 4: Text Feature Processing


In [19]:
print("="*80)
print("STEP 4: TEXT FEATURE PROCESSING")
print("="*80)

# Function to clean text
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    return text.strip()

# 4.1 Create text length features (simpler than TF-IDF for now)
text_columns = ['name', 'summary', 'space', 'description', 'neighborhood_overview',
                'notes', 'transit', 'access', 'interaction', 'house_rules']

for col in text_columns:
    if col in df.columns:
        # Length features
        df[f'{col}_length'] = df[col].astype(str).str.len()
        df[f'{col}_word_count'] = df[col].astype(str).str.split().str.len()
        print(f"‚úì Created length features for {col}")

# 4.2 Extract amenities count (amenities is a list in text format)
if 'amenities' in df.columns:
    # Amenities are typically in format: {item1,item2,item3}
    df['amenities_count'] = df['amenities'].astype(str).str.count(',') + 1
    df['amenities_count'] = df['amenities_count'].replace({1: 0})  # Empty = 0
    print(f"‚úì Created amenities_count feature")
    
    # Check for specific amenities
    df['has_wifi'] = df['amenities'].str.contains('wifi|internet', case=False, na=False).astype(int)
    df['has_kitchen'] = df['amenities'].str.contains('kitchen', case=False, na=False).astype(int)
    df['has_tv'] = df['amenities'].str.contains('tv', case=False, na=False).astype(int)
    df['has_parking'] = df['amenities'].str.contains('parking', case=False, na=False).astype(int)
    df['has_ac'] = df['amenities'].str.contains('air conditioning|ac', case=False, na=False).astype(int)
    df['has_heating'] = df['amenities'].str.contains('heating', case=False, na=False).astype(int)
    print(f"‚úì Created specific amenity flags")

# 4.3 Host verifications count
if 'host_verifications' in df.columns:
    df['host_verifications_count'] = df['host_verifications'].astype(str).str.count(',') + 1
    print(f"‚úì Created host_verifications_count")

print(f"\n‚úì Text feature processing completed. Shape: {df.shape}")


STEP 4: TEXT FEATURE PROCESSING
‚úì Created length features for name
‚úì Created length features for summary
‚úì Created length features for space
‚úì Created length features for description
‚úì Created length features for neighborhood_overview
‚úì Created length features for notes
‚úì Created length features for transit
‚úì Created length features for access
‚úì Created length features for interaction
‚úì Created length features for house_rules
‚úì Created amenities_count feature
‚úì Created specific amenity flags
‚úì Created host_verifications_count

‚úì Text feature processing completed. Shape: (20030, 114)


In [None]:
# Step 5: Outlier Detection and Treatment


In [20]:
print("="*80)
print("STEP 5: OUTLIER DETECTION AND TREATMENT")
print("="*80)

# Function to detect outliers using IQR method
def detect_outliers_iqr(data, column, multiplier=1.5):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    outliers = ((data[column] < lower_bound) | (data[column] > upper_bound)).sum()
    return outliers, lower_bound, upper_bound

# Function to detect outliers using Z-score method
def detect_outliers_zscore(data, column, threshold=3):
    z_scores = np.abs(stats.zscore(data[column].dropna()))
    outliers = (z_scores > threshold).sum()
    return outliers

# Analyze outliers in key numeric columns
print("\nüìä Outlier Analysis (IQR method):")
outlier_cols = ['price', 'cleaning_fee', 'security_deposit', 'accommodates', 
                'bathrooms', 'bedrooms', 'beds', 'minimum_nights', 'maximum_nights',
                'number_of_reviews', 'reviews_per_month']

outlier_summary = []
for col in outlier_cols:
    if col in df.columns and df[col].dtype in [np.int64, np.float64]:
        outliers, lower, upper = detect_outliers_iqr(df, col)
        outlier_summary.append({
            'Column': col,
            'Outliers': outliers,
            'Percentage': f"{(outliers/len(df)*100):.2f}%",
            'Lower_Bound': f"{lower:.2f}",
            'Upper_Bound': f"{upper:.2f}"
        })

outlier_df = pd.DataFrame(outlier_summary)
print(outlier_df.to_string(index=False))


STEP 5: OUTLIER DETECTION AND TREATMENT

üìä Outlier Analysis (IQR method):
           Column  Outliers Percentage Lower_Bound Upper_Bound
            price      1272      6.35%      -22.50      293.50
     cleaning_fee      1640      8.19%        1.50       77.50
 security_deposit       594      2.97%     -375.00      625.00
     accommodates       169      0.84%       -1.00        7.00
        bathrooms      3705     18.50%        1.00        1.00
         bedrooms       582      2.91%       -0.50        3.50
             beds      2112     10.54%       -0.50        3.50
   minimum_nights      2278     11.37%        0.50        4.50
   maximum_nights         1      0.00%    -1635.00     2781.00
number_of_reviews      1903      9.50%      -25.50       50.50
reviews_per_month      1862      9.30%       -1.21        2.47


In [21]:
# 5.1 Handle outliers in price (target variable)
print("\nüéØ Handling Price Outliers:")

if 'price' in df.columns:
    print(f"   Original price stats:")
    print(f"   Mean: ${df['price'].mean():.2f}")
    print(f"   Median: ${df['price'].median():.2f}")
    print(f"   Min: ${df['price'].min():.2f}, Max: ${df['price'].max():.2f}")
    
    # Remove extreme outliers (price = 0 or price > 99th percentile * 2)
    price_99th = df['price'].quantile(0.99)
    df_before = len(df)
    df = df[(df['price'] > 0) & (df['price'] < price_99th * 2)]
    removed = df_before - len(df)
    print(f"   ‚úì Removed {removed} extreme price outliers")
    
    # Winsorize remaining outliers (cap at 1st and 99th percentile)
    price_1st = df['price'].quantile(0.01)
    price_99th = df['price'].quantile(0.99)
    df['price'] = df['price'].clip(lower=price_1st, upper=price_99th)
    print(f"   ‚úì Winsorized price to [{price_1st:.2f}, {price_99th:.2f}]")
    
    print(f"   Updated price stats:")
    print(f"   Mean: ${df['price'].mean():.2f}")
    print(f"   Median: ${df['price'].median():.2f}")



üéØ Handling Price Outliers:
   Original price stats:
   Mean: $152.18
   Median: $125.00
   Min: $0.00, Max: $8500.00
   ‚úì Removed 25 extreme price outliers
   ‚úì Winsorized price to [40.00, 500.00]
   Updated price stats:
   Mean: $147.58
   Median: $125.00


In [22]:
# 5.2 Handle outliers in other numeric columns
print("\nüì¶ Handling Other Numeric Outliers:")

# Cap minimum_nights at reasonable value
if 'minimum_nights' in df.columns:
    original_max = df['minimum_nights'].max()
    df['minimum_nights'] = df['minimum_nights'].clip(upper=365)  # Cap at 1 year
    print(f"   ‚úì Capped minimum_nights at 365 (was {original_max})")

# Cap maximum_nights at reasonable value  
if 'maximum_nights' in df.columns:
    df['maximum_nights'] = df['maximum_nights'].clip(upper=365*2)  # Cap at 2 years
    print(f"   ‚úì Capped maximum_nights at 730")

# Winsorize cleaning_fee
if 'cleaning_fee' in df.columns:
    fee_99th = df['cleaning_fee'].quantile(0.99)
    df['cleaning_fee'] = df['cleaning_fee'].clip(upper=fee_99th)
    print(f"   ‚úì Winsorized cleaning_fee at {fee_99th:.2f}")

# Winsorize security_deposit
if 'security_deposit' in df.columns:
    deposit_99th = df['security_deposit'].quantile(0.99)
    df['security_deposit'] = df['security_deposit'].clip(upper=deposit_99th)
    print(f"   ‚úì Winsorized security_deposit at {deposit_99th:.2f}")

# Cap accommodates at reasonable value
if 'accommodates' in df.columns:
    df['accommodates'] = df['accommodates'].clip(upper=16)  # Cap at 16 people
    print(f"   ‚úì Capped accommodates at 16")

print(f"\n‚úì Outlier treatment completed. Shape: {df.shape}")



üì¶ Handling Other Numeric Outliers:
   ‚úì Capped minimum_nights at 365 (was 1001)
   ‚úì Capped maximum_nights at 730
   ‚úì Winsorized cleaning_fee at 120.00
   ‚úì Winsorized security_deposit at 1000.00
   ‚úì Capped accommodates at 16

‚úì Outlier treatment completed. Shape: (20005, 114)


In [None]:
# Step 6: Feature Selection and Preparation for Encoding


In [23]:
print("="*80)
print("STEP 6: FEATURE SELECTION & CATEGORIZATION")
print("="*80)

# Drop original date columns (we've created features from them)
date_cols_to_drop = ['host_since', 'first_review', 'last_review']
df = df.drop(columns=[c for c in date_cols_to_drop if c in df.columns])
print(f"‚úì Dropped original date columns")

# Drop text columns (we've created length features)
text_cols_to_drop = ['name', 'summary', 'space', 'description', 'neighborhood_overview',
                     'notes', 'transit', 'access', 'interaction', 'house_rules', 
                     'amenities', 'host_verifications', 'host_about']
df = df.drop(columns=[c for c in text_cols_to_drop if c in df.columns], errors='ignore')
print(f"‚úì Dropped original text columns")

# Drop other non-predictive columns
other_drops = ['street', 'city', 'state', 'zipcode', 'market', 'smart_location',
               'country', 'country_code', 'calendar_updated']
df = df.drop(columns=[c for c in other_drops if c in df.columns], errors='ignore')
print(f"‚úì Dropped location detail columns")

print(f"\nCurrent shape: {df.shape}")
print(f"\nColumn types:")
print(df.dtypes.value_counts())


STEP 6: FEATURE SELECTION & CATEGORIZATION
‚úì Dropped original date columns
‚úì Dropped original text columns
‚úì Dropped location detail columns

Current shape: (20005, 89)

Column types:
int64      39
float64    30
object     10
bool       10
Name: count, dtype: int64


In [24]:
# Categorize columns for different encoding strategies
print("\nüìã Categorizing Features:")

# Numeric features (already processed)
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
if 'price' in numeric_features:
    numeric_features.remove('price')  # This is our target
if 'id' in numeric_features:
    numeric_features.remove('id')  # Don't use ID as feature

print(f"\n   Numeric features ({len(numeric_features)}): {numeric_features[:10]}...")

# Boolean features
boolean_features = df.select_dtypes(include=['bool']).columns.tolist()
print(f"\n   Boolean features ({len(boolean_features)}): {boolean_features}")

# Categorical features
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
print(f"\n   Categorical features ({len(categorical_features)}): {categorical_features}")

# Check cardinality of categorical features
print("\n   Categorical feature cardinality:")
for col in categorical_features:
    n_unique = df[col].nunique()
    print(f"      {col}: {n_unique} unique values")



üìã Categorizing Features:

   Numeric features (67): ['host_response_rate', 'host_listings_count', 'host_total_listings_count', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'security_deposit']...

   Boolean features (10): ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'is_location_exact', 'has_availability', 'requires_license', 'instant_bookable', 'is_business_travel_ready', 'require_guest_profile_picture', 'require_guest_phone_verification']

   Categorical features (10): ['host_name', 'host_location', 'host_response_time', 'host_neighbourhood', 'neighbourhood', 'neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type', 'cancellation_policy']

   Categorical feature cardinality:
      host_name: 5911 unique values
      host_location: 549 unique values
      host_response_time: 5 unique values
      host_neighbourhood: 105 unique values
      neighbourhood: 44 unique values
      neighbourhood_cleansed: 22 unique value

In [None]:
# Step 7: Categorical Encoding


In [25]:
print("="*80)
print("STEP 7: CATEGORICAL ENCODING")
print("="*80)

# Strategy:
# - Low cardinality (<10): One-Hot Encoding
# - Medium cardinality (10-50): Ordinal or Target Encoding
# - High cardinality (>50): Target Encoding or drop

# Separate categorical features by cardinality
low_cardinality = []
medium_cardinality = []
high_cardinality = []

for col in categorical_features:
    n_unique = df[col].nunique()
    if n_unique < 10:
        low_cardinality.append(col)
    elif n_unique < 50:
        medium_cardinality.append(col)
    else:
        high_cardinality.append(col)

print(f"\n   Low cardinality (<10): {low_cardinality}")
print(f"   Medium cardinality (10-50): {medium_cardinality}")
print(f"   High cardinality (>50): {high_cardinality}")


STEP 7: CATEGORICAL ENCODING

   Low cardinality (<10): ['host_response_time', 'room_type', 'bed_type', 'cancellation_policy']
   Medium cardinality (10-50): ['neighbourhood', 'neighbourhood_cleansed', 'property_type']
   High cardinality (>50): ['host_name', 'host_location', 'host_neighbourhood']


In [26]:
# 7.1 One-Hot Encoding for low cardinality features
print("\n1. One-Hot Encoding (low cardinality):")

if low_cardinality:
    # Apply one-hot encoding
    df_encoded = pd.get_dummies(df, columns=low_cardinality, prefix=low_cardinality, 
                                  drop_first=True, dtype=int)
    
    # Count new columns created
    new_cols = len(df_encoded.columns) - len(df.columns) + len(low_cardinality)
    print(f"   ‚úì Created {new_cols} new columns from {len(low_cardinality)} categorical features")
    
    df = df_encoded
else:
    print("   No low cardinality features to encode")

print(f"   Shape: {df.shape}")



1. One-Hot Encoding (low cardinality):
   ‚úì Created 13 new columns from 4 categorical features
   Shape: (20005, 98)


In [27]:
# 7.2 Target Encoding for medium and high cardinality features
print("\n2. Target Encoding (medium/high cardinality):")

# Combine medium and high cardinality for target encoding
target_encode_cols = medium_cardinality + high_cardinality

if target_encode_cols and 'price' in df.columns:
    for col in target_encode_cols:
        if col in df.columns:
            # Calculate mean price for each category
            target_means = df.groupby(col)['price'].mean()
            
            # Map to dataframe (with smoothing using global mean)
            global_mean = df['price'].mean()
            df[f'{col}_target_encoded'] = df[col].map(target_means).fillna(global_mean)
            
            print(f"   ‚úì Target encoded {col}")
    
    # Drop original categorical columns
    df = df.drop(columns=target_encode_cols)
    print(f"   ‚úì Dropped {len(target_encode_cols)} original categorical columns")
else:
    print("   No medium/high cardinality features to encode")

print(f"   Shape: {df.shape}")



2. Target Encoding (medium/high cardinality):
   ‚úì Target encoded neighbourhood
   ‚úì Target encoded neighbourhood_cleansed
   ‚úì Target encoded property_type
   ‚úì Target encoded host_name
   ‚úì Target encoded host_location
   ‚úì Target encoded host_neighbourhood
   ‚úì Dropped 6 original categorical columns
   Shape: (20005, 98)


In [28]:
# 7.3 Convert boolean to int
print("\n3. Converting boolean features to int:")

for col in df.select_dtypes(include=['bool']).columns:
    df[col] = df[col].astype(int)
    
print(f"   ‚úì Converted {len(df.select_dtypes(include=['bool']).columns)} boolean columns")
print(f"\n‚úì Categorical encoding completed. Shape: {df.shape}")



3. Converting boolean features to int:
   ‚úì Converted 0 boolean columns

‚úì Categorical encoding completed. Shape: (20005, 98)


In [None]:
# Step 8: Train/Validation/Test Split


In [29]:
print("="*80)
print("STEP 8: TRAIN/VALIDATION/TEST SPLIT")
print("="*80)

# Prepare X and y
print("\n1. Preparing features and target:")

# Remove ID column if it exists
if 'id' in df.columns:
    df = df.drop(columns=['id'])

# Ensure price exists
if 'price' not in df.columns:
    print("   ‚ö†Ô∏è ERROR: 'price' column not found!")
else:
    # Separate features and target
    X = df.drop(columns=['price'])
    y = df['price']
    
    print(f"   Features shape: {X.shape}")
    print(f"   Target shape: {y.shape}")
    print(f"   Target stats: mean=${y.mean():.2f}, median=${y.median():.2f}, std=${y.std():.2f}")


STEP 8: TRAIN/VALIDATION/TEST SPLIT

1. Preparing features and target:
   Features shape: (20005, 96)
   Target shape: (20005,)
   Target stats: mean=$147.58, median=$125.00, std=$81.33


In [30]:
# 2. Split into train, validation, and test sets
print("\n2. Splitting data:")

# First split: 80% train+val, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: 75% train, 25% val (of the 80%)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

print(f"   Train set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"   Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"   Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

print(f"\n   Train target - mean: ${y_train.mean():.2f}, std: ${y_train.std():.2f}")
print(f"   Val target - mean: ${y_val.mean():.2f}, std: ${y_val.std():.2f}")
print(f"   Test target - mean: ${y_test.mean():.2f}, std: ${y_test.std():.2f}")



2. Splitting data:
   Train set: 12003 samples (60.0%)
   Validation set: 4001 samples (20.0%)
   Test set: 4001 samples (20.0%)

   Train target - mean: $147.76, std: $81.00
   Val target - mean: $149.75, std: $84.98
   Test target - mean: $144.89, std: $78.50


In [None]:
# Step 9: Scaling and Normalization with Pipeline


In [31]:
print("="*80)
print("STEP 9: SCALING AND NORMALIZATION")
print("="*80)

# We'll create three versions with different scaling methods

# 9.1 StandardScaler (Z-score normalization)
print("\n1. StandardScaler (Z-score normalization):")
scaler_standard = StandardScaler()
X_train_standard = scaler_standard.fit_transform(X_train)
X_val_standard = scaler_standard.transform(X_val)
X_test_standard = scaler_standard.transform(X_test)

print(f"   ‚úì Applied StandardScaler")
print(f"   Train shape: {X_train_standard.shape}")
print(f"   Sample mean: {X_train_standard.mean():.4f}, Sample std: {X_train_standard.std():.4f}")


STEP 9: SCALING AND NORMALIZATION

1. StandardScaler (Z-score normalization):
   ‚úì Applied StandardScaler
   Train shape: (12003, 96)
   Sample mean: nan, Sample std: nan


In [32]:
# 9.2 MinMaxScaler (0-1 normalization)
print("\n2. MinMaxScaler (0-1 normalization):")
scaler_minmax = MinMaxScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_val_minmax = scaler_minmax.transform(X_val)
X_test_minmax = scaler_minmax.transform(X_test)

print(f"   ‚úì Applied MinMaxScaler")
print(f"   Train range: [{X_train_minmax.min():.4f}, {X_train_minmax.max():.4f}]")



2. MinMaxScaler (0-1 normalization):
   ‚úì Applied MinMaxScaler
   Train range: [nan, nan]


In [33]:
# 9.3 RobustScaler (robust to outliers)
print("\n3. RobustScaler (robust to outliers):")
scaler_robust = RobustScaler()
X_train_robust = scaler_robust.fit_transform(X_train)
X_val_robust = scaler_robust.transform(X_val)
X_test_robust = scaler_robust.transform(X_test)

print(f"   ‚úì Applied RobustScaler")
print(f"   Train median: {np.median(X_train_robust):.4f}")

print(f"\n‚úì Scaling completed. All three versions created.")



3. RobustScaler (robust to outliers):
   ‚úì Applied RobustScaler
   Train median: nan

‚úì Scaling completed. All three versions created.


In [None]:
# Step 10: Save Processed Data


In [34]:
print("="*80)
print("STEP 10: SAVE PROCESSED DATA")
print("="*80)

# Save the processed dataframes
print("\nüì¶ Saving processed datasets...")

# 1. Save unscaled data (with all features engineered)
df_final = pd.concat([X, y], axis=1)
df_final.to_csv('listings_processed_unscaled.csv', index=False)
print(f"‚úì Saved: listings_processed_unscaled.csv ({df_final.shape})")

# 2. Save train/val/test splits (unscaled)
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_data.to_csv('train_unscaled.csv', index=False)
val_data.to_csv('val_unscaled.csv', index=False)
test_data.to_csv('test_unscaled.csv', index=False)

print(f"‚úì Saved: train_unscaled.csv ({train_data.shape})")
print(f"‚úì Saved: val_unscaled.csv ({val_data.shape})")
print(f"‚úì Saved: test_unscaled.csv ({test_data.shape})")

# 3. Save scaled versions as numpy arrays (more efficient for ML)
np.save('X_train_standard.npy', X_train_standard)
np.save('X_val_standard.npy', X_val_standard)
np.save('X_test_standard.npy', X_test_standard)
np.save('y_train.npy', y_train.values)
np.save('y_val.npy', y_val.values)
np.save('y_test.npy', y_test.values)

print(f"\n‚úì Saved: StandardScaler versions (numpy arrays)")
print(f"‚úì Saved: MinMaxScaler versions (numpy arrays)")
print(f"‚úì Saved: RobustScaler versions (numpy arrays)")

# 4. Save feature names
feature_names = X.columns.tolist()
pd.DataFrame({'feature': feature_names}).to_csv('feature_names.csv', index=False)
print(f"\n‚úì Saved: feature_names.csv ({len(feature_names)} features)")


STEP 10: SAVE PROCESSED DATA

üì¶ Saving processed datasets...
‚úì Saved: listings_processed_unscaled.csv ((20005, 97))
‚úì Saved: train_unscaled.csv ((12003, 97))
‚úì Saved: val_unscaled.csv ((4001, 97))
‚úì Saved: test_unscaled.csv ((4001, 97))

‚úì Saved: StandardScaler versions (numpy arrays)
‚úì Saved: MinMaxScaler versions (numpy arrays)
‚úì Saved: RobustScaler versions (numpy arrays)

‚úì Saved: feature_names.csv (96 features)


In [None]:
# Summary and Next Steps


In [35]:
print("\n" + "="*80)
print("üéâ DATA CLEANING AND PREPROCESSING COMPLETE!")
print("="*80)

print("\nüìä SUMMARY:")
print(f"   Original dataset: {listings_df.shape}")
print(f"   Final dataset: {df_final.shape}")
print(f"   Features created: {len(feature_names)}")
print(f"   Train samples: {len(X_train)}")
print(f"   Validation samples: {len(X_val)}")
print(f"   Test samples: {len(X_test)}")

print("\n‚úÖ COMPLETED STEPS:")
print("   1. ‚úì Data loading and exploration")
print("   2. ‚úì Duplicate removal and type conversion")
print("   3. ‚úì Logic error detection and correction")
print("   4. ‚úì Missing value treatment (deletion, median/mode imputation)")
print("   5. ‚úì Date/time feature engineering (tenure, recency, cyclical encoding)")
print("   6. ‚úì Text feature processing (length, word count, amenity flags)")
print("   7. ‚úì Outlier detection and treatment (IQR, winsorizing)")
print("   8. ‚úì Categorical encoding (one-hot, target encoding)")
print("   9. ‚úì Train/validation/test split (60/20/20)")
print("   10. ‚úì Scaling (StandardScaler, MinMaxScaler, RobustScaler)")
print("   11. ‚úì Data export")

print("\nüìÅ OUTPUT FILES:")
print("   - listings_processed_unscaled.csv (full processed dataset)")
print("   - train_unscaled.csv, val_unscaled.csv, test_unscaled.csv")
print("   - X_train_standard.npy, X_val_standard.npy, X_test_standard.npy")
print("   - y_train.npy, y_val.npy, y_test.npy")
print("   - feature_names.csv")

print("\nüöÄ NEXT STEPS:")
print("   1. Feature selection / dimensionality reduction (PCA, feature importance)")
print("   2. Model training (Linear Regression, Random Forest, XGBoost, Neural Networks)")
print("   3. Hyperparameter tuning")
print("   4. Model evaluation (RMSE, MAE, R¬≤)")
print("   5. Prediction and deployment")

print("\n" + "="*80)



üéâ DATA CLEANING AND PREPROCESSING COMPLETE!

üìä SUMMARY:
   Original dataset: (20030, 96)
   Final dataset: (20005, 97)
   Features created: 96
   Train samples: 12003
   Validation samples: 4001
   Test samples: 4001

‚úÖ COMPLETED STEPS:
   1. ‚úì Data loading and exploration
   2. ‚úì Duplicate removal and type conversion
   3. ‚úì Logic error detection and correction
   4. ‚úì Missing value treatment (deletion, median/mode imputation)
   5. ‚úì Date/time feature engineering (tenure, recency, cyclical encoding)
   6. ‚úì Text feature processing (length, word count, amenity flags)
   7. ‚úì Outlier detection and treatment (IQR, winsorizing)
   8. ‚úì Categorical encoding (one-hot, target encoding)
   9. ‚úì Train/validation/test split (60/20/20)
   10. ‚úì Scaling (StandardScaler, MinMaxScaler, RobustScaler)
   11. ‚úì Data export

üìÅ OUTPUT FILES:
   - listings_processed_unscaled.csv (full processed dataset)
   - train_unscaled.csv, val_unscaled.csv, test_unscaled.csv
   - X

In [36]:
# Optional: Visualize feature distributions
print("\nüìà Sample Feature Statistics (top 10 features):")
print("\nTop 10 features by variance:")
feature_variance = X_train.var().sort_values(ascending=False).head(10)
for feat, var in feature_variance.items():
    print(f"   {feat}: {var:.2f}")

print("\nüí° TIP: You can now use the saved files for:")
print("   - Model training: X_train_standard.npy, y_train.npy")
print("   - Hyperparameter tuning: X_val_standard.npy, y_val.npy")
print("   - Final evaluation: X_test_standard.npy, y_test.npy")
print("   - Try different scalers (standard, minmax, robust) to see which works best!")



üìà Sample Feature Statistics (top 10 features):

Top 10 features by variance:
   days_since_last_review: 10013630.97
   host_tenure_days: 452235.56
   days_since_first_review: 336577.14
   review_period_days: 309647.88
   maximum_nights: 117660.26
   space_length: 101584.40
   description_length: 88812.33
   neighborhood_overview_length: 84131.18
   house_rules_length: 79562.68
   transit_length: 45397.18

üí° TIP: You can now use the saved files for:
   - Model training: X_train_standard.npy, y_train.npy
   - Hyperparameter tuning: X_val_standard.npy, y_val.npy
   - Final evaluation: X_test_standard.npy, y_test.npy
   - Try different scalers (standard, minmax, robust) to see which works best!
