Import Libraries

In [2]:
import pandas as pd
from src.cleaner import clean_airbnb_data

Load data

In [3]:
# Relative path to the raw London listings CSV that is handed to the loader.
file_path = "data/london_listing_raw.csv"
# clean_airbnb_data is a project helper imported from src.cleaner; its result is
# the working DataFrame for every later cell.
# NOTE(review): what cleaning the helper itself applies is not visible from this
# notebook — confirm in src/cleaner.py.
df_clean = clean_airbnb_data(file_path)

 Successfully loaded data from data/london_listing_raw.csv
 Shape: 96651 rows, 79 columns


In [4]:
# Preview the first five rows of the loaded frame.
df_clean.head(n=5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,264776,https://www.airbnb.com/rooms/264776,20250610032232,2025-06-11,city scrape,Huge Four Bedroom Apartment,An extremely large and sunny four bedroom grou...,Catford is a well established London suburb. T...,https://a0.muscache.com/pictures/hosting/Hosti...,1389063,...,4.74,4.62,4.72,,t,11,11,0,0,0.51
1,264777,https://www.airbnb.com/rooms/264777,20250610032232,2025-06-11,city scrape,One Bedroom Apartment,Recently refurbished sunny one bedroom first f...,,https://a0.muscache.com/pictures/hosting/Hosti...,1389063,...,4.25,4.54,4.42,,t,11,11,0,0,0.22
2,264778,https://www.airbnb.com/rooms/264778,20250610032232,2025-06-11,city scrape,Two Bedroom Newly Refurbished Apartment,A large and sunny two bedroom second floor apa...,Catford is a well established London suburb. T...,https://a0.muscache.com/pictures/50662093/af12...,1389063,...,4.52,4.36,4.38,,t,11,11,0,0,0.43
3,264779,https://www.airbnb.com/rooms/264779,20250610032232,2025-06-11,city scrape,Refurbished Two Bedroom Apartment,A large and sunny two bedroom second floor apa...,Catford is a well established London suburb. T...,https://a0.muscache.com/pictures/50660860/e440...,1389063,...,4.61,4.5,4.47,,t,11,11,0,0,0.3
4,264780,https://www.airbnb.com/rooms/264780,20250610032232,2025-06-11,city scrape,Spacious refurbished 2 bedroom apt with balcony,Completely refurbished 2 bedroom apt to sleep ...,,https://a0.muscache.com/pictures/airflow/Hosti...,1389063,...,4.74,4.37,4.59,,t,11,11,0,0,0.35


Work on price column

In [5]:
# Peek at the price column as loaded, before any conversion.
df_clean[['price']].head(n=5)

Unnamed: 0,price
0,$297.00
1,$98.00
2,$148.00
3,$144.00
4,$157.00


In [7]:
# Storage dtype of the price column prior to conversion.
df_clean.dtypes['price']

dtype('O')

In [8]:
# Strip "$" and thousands separators from price, then cast the column to float64.
df_clean['price'] = df_clean['price'].replace(
    to_replace=r'[\$,]',  # raw string so the backslash reaches the regex engine as-is
    value='',
    regex=True,
).astype('float64')

In [9]:
# Confirm the conversion: the displayed dtype should now be float64.
df_clean['price'].head(n=5)

0    297.0
1     98.0
2    148.0
3    144.0
4    157.0
Name: price, dtype: float64

Work on date column

In [10]:
# Peek at last_review before parsing it as a date.
df_clean[['last_review']].head(n=5)

Unnamed: 0,last_review
0,2025-05-28
1,2024-12-11
2,2025-05-01
3,2025-04-10
4,2024-12-29


In [11]:
# Parse last_review into datetimes; errors='coerce' makes unparseable entries NaT
# instead of raising.
df_clean['last_review'] = pd.to_datetime(
    df_clean['last_review'],
    errors='coerce',
)

In [12]:
# Re-display last_review after the datetime conversion.
df_clean[['last_review']].head(n=5)

Unnamed: 0,last_review
0,2025-05-28
1,2024-12-11
2,2025-05-01
3,2025-04-10
4,2024-12-29


Handle missing values

In [13]:
# reviews_per_month: replace missing entries with 0.
# NOTE(review): presumably a missing value means the listing has no reviews — confirm.
df_clean['reviews_per_month'] = df_clean['reviews_per_month'].fillna(value=0)

In [14]:
# host_name: label missing hosts with the placeholder 'Unknown'.
df_clean['host_name'] = df_clean['host_name'].fillna(value='Unknown')

In [16]:
# neighbourhood_cleansed: drop rows that have no value in this column.
df_clean = df_clean.dropna(subset=['neighbourhood_cleansed'])

Remove irrelevant columns

In [19]:
# Columns judged to have low analytical value.
cols_to_drop = [
    'license',
    'scrape_id',
    'neighbourhood_group_cleansed',
    'neighbourhood',
]

# errors='ignore' skips any listed column that is not present in the frame.
df_clean = df_clean.drop(labels=cols_to_drop, axis='columns', errors='ignore')

print(f"New dataframe shape: {df_clean.shape}")

New dataframe shape: (96651, 75)


In [20]:
# Share of null entries per column, largest first, expressed as a percentage.
missing_percent = (
    df_clean
    .isna()
    .mean()
    .sort_values(ascending=False)
    .mul(100)
)

# Show the per-column percentages.
print(missing_percent)

calendar_updated          100.000000
neighborhood_overview      56.562270
host_neighbourhood         52.104996
host_about                 48.977248
beds                       35.152249
                             ...    
minimum_maximum_nights      0.000000
maximum_maximum_nights      0.000000
minimum_nights_avg_ntm      0.000000
maximum_nights_avg_ntm      0.000000
reviews_per_month           0.000000
Length: 75, dtype: float64


In [21]:
# Names of the columns whose missing share exceeds 70%.
high_null_cols = [
    column for column, percent in missing_percent.items() if percent > 70
]

print("Columns with more than 70% missing values:")
print(high_null_cols)

Columns with more than 70% missing values:
['calendar_updated']


In [22]:
# calendar_updated exceeded the missing-value threshold above, so remove it.
df_clean = df_clean.drop(labels=['calendar_updated'], axis='columns')

print(f"Remaining columns after drop: {df_clean.shape[1]}")

Remaining columns after drop: 74


Save df_clean to csv

In [24]:
# Write the cleaned frame to disk without the index column.
df_clean.to_csv(path_or_buf="data/london_listing_clean.csv", index=False)