# Airbnb London – EDA

This notebook explores London Airbnb listing prices, cleaning raw data and visualizing key drivers.

**Steps:** Load → Clean (if needed) → Explore distributions → Group comparisons → Relationships → Summary statistics → Save cleaned data. Figures are saved to `reports/figures` as each plot is produced.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide defaults: show up to 100 columns, 10x5 figures, seaborn styling.
pd.options.display.max_columns = 100
plt.rcParams.update({'figure.figsize': (10, 5)})
sns.set()


In [None]:
# Input/output locations, all expressed relative to the current working directory.
RAW = Path('..') / 'data' / 'raw' / 'listings.csv'                # raw listings CSV
CLEAN = Path('..') / 'data' / 'processed' / 'listings_clean.csv'  # cleaned listings CSV
FIGS = Path('..') / 'reports' / 'figures'                         # output folder for plots

# Make sure the figure folder exists before any plot is written into it.
FIGS.mkdir(exist_ok=True, parents=True)

(RAW, CLEAN, FIGS)

In [None]:
# Load the listings table: reuse the cleaned CSV when it exists, otherwise clean
# the raw CSV inline (kept consistent with src/prepare_data.py).

# Accepted spellings of yes/no flags; anything else (including missing) maps to 0.
_FLAG_MAP = {'t': 1, 'true': 1, 'y': 1, 'yes': 1, 'f': 0, 'false': 0, 'n': 0, 'no': 0}
# Listings priced outside this inclusive GBP range are dropped.
PRICE_MIN_GBP, PRICE_MAX_GBP = 10, 1000
# Columns that must be numeric for the later plots.
_NUMERIC_COLS = ['minimum_nights', 'availability_365', 'number_of_reviews', 'review_scores_rating']


def _flag_to_binary(flags):
    """Map a t/f-style column to 0/1; unrecognised or missing values become 0."""
    return flags.astype(str).str.lower().map(_FLAG_MAP).fillna(0)


def _count_amenities(amenities):
    """Count comma-separated amenities per row.

    Missing values and empty lists ('[]' or '{}') count as 0; previously an
    empty list was reported as 1 because the count was "commas + 1".
    """
    inner = amenities.fillna('').astype(str).str.strip().str.strip('[]{}').str.strip()
    return (inner.str.count(',') + 1).where(inner.ne(''), 0)


def _clean_listings(raw):
    """Return a cleaned copy of the raw listings frame; `raw` itself is not modified."""
    out = raw.copy()
    # Normalise headers to snake_case so the column checks below are reliable.
    out.columns = (out.columns.str.strip().str.lower()
                   .str.replace(' ', '_', regex=False)
                   .str.replace('/', '_', regex=False))
    if 'price' in out.columns:
        # Strip currency symbols, thousands separators and spaces. Anything that is
        # still not numeric becomes NaN instead of aborting the whole load.
        price_text = (out['price'].astype(str)
                      .str.replace('[£$,]', '', regex=True)
                      .str.replace(' ', '', regex=False))
        out['price_gbp'] = pd.to_numeric(price_text, errors='coerce').astype(float)
    if 'host_since' in out.columns:
        out['host_since'] = pd.to_datetime(out['host_since'], errors='coerce')
        # Tenure is measured up to today's date, so it changes from run to run.
        out['host_tenure_days'] = (pd.Timestamp.today().normalize() - out['host_since']).dt.days
    for col in _NUMERIC_COLS:
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors='coerce')
    if 'amenities' in out.columns:
        out['amenity_count'] = _count_amenities(out['amenities'])
    if 'host_is_superhost' in out.columns:
        out['host_is_superhost_bin'] = _flag_to_binary(out['host_is_superhost'])
    if 'instant_bookable' in out.columns:
        out['instant_bookable_bin'] = _flag_to_binary(out['instant_bookable'])
    if 'price_gbp' in out.columns:
        # Rows with a missing price fail `between` and are dropped as well.
        # Copy so later cells work on an independent frame, not a slice.
        out = out[out['price_gbp'].between(PRICE_MIN_GBP, PRICE_MAX_GBP)].copy()
    return out


if CLEAN.exists():
    df = pd.read_csv(CLEAN)
    # CSV stores dates as text; re-parse so both branches yield the same dtype.
    if 'host_since' in df.columns:
        df['host_since'] = pd.to_datetime(df['host_since'], errors='coerce')
    print('Loaded cleaned data')
else:
    df = _clean_listings(pd.read_csv(RAW))
df.head()

In [None]:
# Quick overview: row count, a reproducible random sample, and summary statistics.
print('Rows:', len(df))
# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than the frame holds (e.g. a small extract).
display(df.sample(min(5, len(df)), random_state=42))
df.describe(include='all').T.head(20)

In [None]:
# Fraction of missing values per column, highest first; show the 20 worst columns.
null_fraction = df.isna().mean()
missing = null_fraction.sort_values(ascending=False)
missing.iloc[:20]

In [None]:
# Histogram of listing prices (50 bins); the figure is also saved under FIGS.
ax = df['price_gbp'].plot.hist(bins=50, edgecolor='white')
ax.set(title='Price Distribution (GBP)', xlabel='Price (GBP)')
ax.figure.tight_layout()
ax.figure.savefig(FIGS / 'price_hist.png', dpi=150)
plt.show()

In [None]:
# Box plot of price per room type, with categories ordered by ascending median price.
if 'room_type' not in df.columns:
    print('room_type column not found.')
else:
    median_by_type = df.groupby('room_type')['price_gbp'].median()
    order = median_by_type.sort_values().index
    ax = sns.boxplot(x='room_type', y='price_gbp', data=df, order=order)
    ax.set(title='Price by Room Type', xlabel='Room Type', ylabel='Price (GBP)')
    ax.figure.tight_layout()
    ax.figure.savefig(FIGS / 'price_by_room_type.png', dpi=150)
    plt.show()

In [None]:
# Bar chart of the 15 highest-median-price areas, using the first neighbourhood
# column (in the order listed) that is present in the data.
nb_candidates = ['neighbourhood', 'neighbourhood_cleansed']
nb_present = [name for name in nb_candidates if name in df.columns]
if not nb_present:
    print('No neighbourhood column found.')
else:
    nb_col = nb_present[0]
    median_price = df.groupby(nb_col)['price_gbp'].median()
    top = median_price.sort_values(ascending=False).head(15)
    ax = top.plot.bar()
    ax.set_title(f'Top 15 {nb_col} by Median Price')
    ax.set_ylabel('Median Price (GBP)')
    ax.figure.tight_layout()
    ax.figure.savefig(FIGS / f'top15_{nb_col}_median_price.png', dpi=150)
    plt.show()

In [None]:
# Scatter plot of price against each numeric feature below; a pair is skipped
# when either of its columns is absent from the frame.
pairs = [
    (feature, 'price_gbp')
    for feature in ('number_of_reviews', 'review_scores_rating',
                    'amenity_count', 'availability_365')
]
for x, y in pairs:
    if x not in df.columns or y not in df.columns:
        continue
    ax = sns.scatterplot(x=x, y=y, data=df, alpha=0.3)
    ax.set_title(f'{y} vs {x}')
    ax.figure.tight_layout()
    ax.figure.savefig(FIGS / f'{y}_vs_{x}.png', dpi=150)
    plt.show()

In [None]:
# Headline price statistics; both entries are None when there is no price column.
if 'price_gbp' in df.columns:
    q1 = df['price_gbp'].quantile(0.25)
    q3 = df['price_gbp'].quantile(0.75)
    summary = {
        'price_median': float(df['price_gbp'].median()),
        'price_iqr': float(q3 - q1),
    }
else:
    summary = {'price_median': None, 'price_iqr': None}
summary

In [None]:
# Save the cleaned frame so later runs can load it instead of re-cleaning.
# An existing file is never overwritten. The CLEAN constant is used rather than
# repeating the path, and the target folder is created first because to_csv does
# not create missing parent directories.
if not CLEAN.exists() and 'price_gbp' in df.columns:
    CLEAN.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(CLEAN, index=False)
    print('Saved cleaned dataset to data/processed/listings_clean.csv')

## Conclusions (fill in with your findings)
- Summarize the **top 3 drivers** of price variation you observed.
- Quantify premium by room type and by top neighbourhoods.
- Note any data quality issues and how you handled them.
- Suggest **next steps** (geospatial, modeling, dashboard).