# Data Understanding (EDA) - Prague Airbnb Listings

**Goal:** Explore the `listings.csv` dataset to understand its structure, identify potential issues (missing values, outliers, data types), and gain initial insights into factors influencing Airbnb prices in Prague.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Pandas display options, applied from a single mapping:
#   - max_columns=None : never truncate columns when showing a frame
#   - max_rows=150     : allow up to 150 rows before pandas truncates
#   - float_format     : render floats with two decimal places
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_rows': 150,
    'display.float_format': lambda value: '%.2f' % value,
}.items():
    pd.set_option(option_name, option_value)

# Plot appearance: matplotlib style sheet plus seaborn "notebook" context
# with slightly enlarged fonts
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context(context="notebook", font_scale=1.1)

print("Libraries imported and settings configured.")

Libraries imported and settings configured.


## 1. Load Data
Load the detailed listings data.

In [2]:
# Location of the raw listings export, relative to this notebook
listings_path = '../data/raw/listings.csv'

# Read the CSV. Any failure is reported as a message instead of a traceback,
# in which case this cell does not assign df_listings.
try:
    # low_memory=False: parse the file in one pass rather than in chunks
    df_listings = pd.read_csv(listings_path, low_memory=False)
    print(f"Successfully loaded {listings_path}")
    print(f"Shape: {df_listings.shape}")
except FileNotFoundError:
    print(f"Error: File not found at {listings_path}. Make sure you downloaded and unzipped it.")
except Exception as load_error:
    print(f"An error occurred during loading: {load_error}")

# Preview the first five rows, but only if a frame named df_listings exists
if 'df_listings' in locals():
    display(df_listings.head(5))

Successfully loaded ../data/raw/listings.csv
Shape: (10108, 79)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,availability_eoy,number_of_reviews_ly,estimated_occupancy_l365d,estimated_revenue_l365d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,23163,https://www.airbnb.com/rooms/23163,20250316041547,2025-03-16,city scrape,Residence Karolina - KAROL12,"Unique and elegant apartment rental in Prague,...",,https://a0.muscache.com/pictures/01bbe32c-3f13...,5282,https://www.airbnb.com/users/show/5282,Klara,2008-12-17,"Prague, Czechia","Hello, \r\nglad to see that you are interested...",within an hour,100%,100%,t,https://a0.muscache.com/im/pictures/user/b7309...,https://a0.muscache.com/im/pictures/user/b7309...,Josefov,72.0,82.0,"['email', 'phone']",t,t,,Praha 1,,50.08,14.42,Entire rental unit,Entire home/apt,4,1.0,1 bath,1.0,2.0,"[""Coffee maker"", ""Dishwasher"", ""Bed linens"", ""...","$2,918.00",1,365,1,7,60,731,1.4,663.6,,t,0,0,0,0,2025-03-16,31,1,0,0,1,6,17508.0,2010-09-20,2024-06-15,4.9,4.83,5.0,5.0,4.97,4.93,4.86,,t,70,69,0,0,0.18
1,23169,https://www.airbnb.com/rooms/23169,20250316041547,2025-03-16,city scrape,Residence Masna - Masna302,Masna studio offers a lot of space and privacy...,,https://a0.muscache.com/pictures/b450cf2a-8561...,5282,https://www.airbnb.com/users/show/5282,Klara,2008-12-17,"Prague, Czechia","Hello, \r\nglad to see that you are interested...",within an hour,100%,100%,t,https://a0.muscache.com/im/pictures/user/b7309...,https://a0.muscache.com/im/pictures/user/b7309...,Josefov,72.0,82.0,"['email', 'phone']",t,t,,Praha 1,,50.09,14.42,Entire rental unit,Entire home/apt,3,1.0,1 bath,1.0,2.0,"[""Patio or balcony"", ""Coffee maker"", ""Bed line...",,1,365,1,7,60,731,1.2,710.6,,t,7,13,13,13,2025-03-16,122,6,0,13,8,36,,2010-05-07,2024-11-08,4.74,4.6,4.83,4.81,4.87,4.97,4.7,,t,70,69,0,0,0.67
2,26755,https://www.airbnb.com/rooms/26755,20250316041547,2025-03-16,city scrape,Central Prague Old Town Top Floor,Big and beautiful new attic apartment in the v...,This apartment offers a fantastic location. Yo...,https://a0.muscache.com/pictures/miso/Hosting-...,113902,https://www.airbnb.com/users/show/113902,Daniel+Bea,2010-04-26,"Prague, Czechia",Hi! we are a sp/cz couple with 2 daughters (La...,within an hour,100%,98%,t,https://a0.muscache.com/im/pictures/user/8db01...,https://a0.muscache.com/im/pictures/user/8db01...,Staré Město,4.0,4.0,"['email', 'phone']",t,t,"Prague, Hlavní město Praha, Czechia",Praha 1,,50.09,14.43,Entire rental unit,Entire home/apt,4,1.5,1.5 baths,1.0,2.0,"[""AC - split type ductless system"", ""Coffee ma...","$1,582.00",3,700,3,4,1125,1125,3.1,1125.0,,t,3,7,24,173,2025-03-16,411,53,3,173,57,255,403410.0,2015-05-19,2025-03-07,4.94,4.95,4.92,4.93,4.96,4.93,4.9,,f,3,3,0,0,3.43
3,782440,https://www.airbnb.com/rooms/782440,20250316041547,2025-03-16,city scrape,"Central,Cheap&Lovely Miniapartment2",See all our offers on Airbnb!<br />www.airbnb....,,https://a0.muscache.com/pictures/11243352/ad21...,4102236,https://www.airbnb.com/users/show/4102236,Bonny,2012-11-09,"Prague, Czechia","We love traveling, culture, good food and meet...",within an hour,100%,80%,t,https://a0.muscache.com/im/users/4102236/profi...,https://a0.muscache.com/im/users/4102236/profi...,Žižkov,6.0,6.0,"['email', 'phone']",t,t,,Praha 3,,50.09,14.45,Private room in rental unit,Private room,2,1.0,1 private bath,1.0,2.0,"[""Coffee maker"", ""Bed linens"", ""Dishes and sil...",$860.00,3,60,1,3,60,60,3.0,60.0,,t,5,5,5,5,2025-03-16,414,52,5,5,52,255,219300.0,2013-01-04,2025-03-02,4.76,4.85,4.71,4.94,4.95,4.63,4.83,,f,6,3,3,0,2.79
4,782489,https://www.airbnb.com/rooms/782489,20250316041547,2025-03-16,city scrape,"Central,Cheap&Lovely Miniapartment3",See all our offers on Airbnb!<br />www.airbnb....,,https://a0.muscache.com/pictures/35821920/aee6...,4102236,https://www.airbnb.com/users/show/4102236,Bonny,2012-11-09,"Prague, Czechia","We love traveling, culture, good food and meet...",within an hour,100%,80%,t,https://a0.muscache.com/im/users/4102236/profi...,https://a0.muscache.com/im/users/4102236/profi...,Žižkov,6.0,6.0,"['email', 'phone']",t,t,,Praha 3,,50.09,14.45,Private room in rental unit,Private room,2,1.0,1 private bath,1.0,3.0,"[""Coffee maker"", ""Bed linens"", ""Dishes and sil...",$629.00,3,60,3,3,60,60,3.0,60.0,,t,3,3,3,3,2025-03-16,389,47,3,3,48,255,160395.0,2013-03-25,2025-03-01,4.69,4.79,4.58,4.89,4.9,4.59,4.73,,f,6,3,3,0,2.67


## 2. Initial Inspection
Get a first overview of the data structure, types, and basic statistics.

In [3]:
# Per-column overview: non-null counts and dtypes for every column.
# Skipped entirely when df_listings does not exist.
if 'df_listings' in locals():
    print("DataFrame Info:")
    df_listings.info(show_counts=True, verbose=True)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            10108 non-null  int64  
 1   listing_url                                   10108 non-null  object 
 2   scrape_id                                     10108 non-null  int64  
 3   last_scraped                                  10108 non-null  object 
 4   source                                        10108 non-null  object 
 5   name                                          10108 non-null  object 
 6   description                                   9867 non-null   object 
 7   neighborhood_overview                         4638 non-null   object 
 8   picture_url                                   10108 non-null  object 
 9   host_id                                      

In [4]:
# Summary statistics for every column, numeric and non-numeric alike.
# Skipped entirely when df_listings does not exist.
if 'df_listings' in locals():
    print("\nDescriptive Statistics (All Columns):")
    # Transposed so each original column becomes one row, which is easier to scan
    display(df_listings.describe(include='all').transpose())


Descriptive Statistics (All Columns):


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,10108.0,,,,6.108089451672794e+17,5.380317870672694e+17,23163.0,32809559.25,7.446004972534433e+17,1.128375528908403e+18,1.3766879620670464e+18
listing_url,10108.0,10108.0,https://www.airbnb.com/rooms/1376687962067046501,1.0,,,,,,,
scrape_id,10108.0,,,,20250316041547.0,0.0,20250316041547.0,20250316041547.0,20250316041547.0,20250316041547.0,20250316041547.0
last_scraped,10108.0,1.0,2025-03-16,10108.0,,,,,,,
source,10108.0,2.0,city scrape,8870.0,,,,,,,
name,10108.0,9639.0,Hostel HOMEr - 12 bed mixed room,19.0,,,,,,,
description,9867.0,7895.0,In this quiet place in the middle of the actio...,33.0,,,,,,,
neighborhood_overview,4638.0,2919.0,The apartment is located in one of the best lo...,31.0,,,,,,,
picture_url,10108.0,9796.0,https://a0.muscache.com/pictures/799e1836-3bfb...,13.0,,,,,,,
host_id,10108.0,,,,212674181.73,211450878.15,5282.0,24870251.0,122509558.0,419022428.0,683899729.0


## 3. Target Variable Analysis (`price`)
Inspect the target variable we want to predict.

*   Check data type and clean if necessary.
*   Analyze distribution.
*   Check for missing values.

In [5]:
def parse_price(raw_prices):
    """Convert raw price values such as '$2,918.00' into numbers.

    Parameters
    ----------
    raw_prices : pd.Series
        Price column as loaded. May hold currency-formatted text, plain
        numbers, or missing values.

    Returns
    -------
    pd.Series
        Numeric input is returned unchanged. Anything else is converted to
        text, stripped of '$' and ',' characters, and parsed with
        ``pd.to_numeric(errors='coerce')``, so missing or unparseable
        entries come back as NaN.
    """
    if pd.api.types.is_numeric_dtype(raw_prices):
        return raw_prices
    # astype(str) sends every element (including stray non-string values in a
    # mixed column) through the same regex; missing entries become text that
    # to_numeric then turns back into NaN.
    stripped = raw_prices.astype(str).str.replace('[$,]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')


if 'df_listings' in locals():
    # Check initial data type
    print(f"Initial 'price' dtype: {df_listings['price'].dtype}")

    # Decide on "is it already numeric?" rather than "is it object?": text
    # columns can also arrive with a dedicated string dtype, and those need
    # cleaning just as much as object columns do.
    if pd.api.types.is_numeric_dtype(df_listings['price']):
        # Already numeric: expose it under the common column name
        df_listings['price_cleaned'] = parse_price(df_listings['price'])
    else:
        print("Cleaning 'price' column...")
        # The raw 'price' column is kept as-is; cleaned values live in 'price_cleaned'
        df_listings['price_cleaned'] = parse_price(df_listings['price'])
        print(f"Cleaned 'price_cleaned' dtype: {df_listings['price_cleaned'].dtype}")

        # Compare non-null counts before/after to see how many values
        # could not be parsed (those became NaN)
        original_non_null = df_listings['price'].notnull().sum()
        cleaned_non_null = df_listings['price_cleaned'].notnull().sum()
        print(f"Original non-null prices: {original_non_null}")
        print(f"Cleaned non-null prices: {cleaned_non_null}")
        print(f"Prices that failed conversion: {original_non_null - cleaned_non_null}")

    # Check for missing values in the cleaned price
    missing_prices = df_listings['price_cleaned'].isnull().sum()
    print(f"\nMissing values in 'price_cleaned': {missing_prices} ({missing_prices / len(df_listings):.2%})")

Initial 'price' dtype: object
Cleaning 'price' column...
Cleaned 'price_cleaned' dtype: float64
Original non-null prices: 8808
Cleaned non-null prices: 8808
Prices that failed conversion: 0

Missing values in 'price_cleaned': 1300 (12.86%)
