In [161]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

#website: https://insideairbnb.com/get-the-data/?utm_source=chatgpt.com

In [163]:
df = pd.read_csv("listings.csv")
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,13913,Holiday London DB Room Let-on going,54730,Alina,,Islington,51.56861,-0.1127,Private room,70.0,1,55,2025-08-21,0.3,2,331,10,
1,15400,Bright Chelsea Apartment. Chelsea!,60302,Philippa,,Kensington and Chelsea,51.4878,-0.16813,Entire home/apt,149.0,4,97,2025-04-05,0.51,1,199,1,
2,17402,Very Central Modern 3-Bed/2 Bath By Oxford St W1,67564,Liz,,Westminster,51.52195,-0.14094,Entire home/apt,411.0,3,56,2024-02-19,0.32,2,80,0,
3,24328,Battersea live/work artist house,41759,Joe,,Wandsworth,51.47072,-0.16266,Entire home/apt,,7,95,2025-07-05,0.53,1,294,1,
4,36274,Bright 1 bedroom apt off brick lane in Shoreditch,133271,Hendryks,,Tower Hamlets,51.52322,-0.06979,Entire home/apt,210.0,5,15,2025-09-06,0.09,2,323,6,


In [164]:
# Data Quality check 

df = pd.read_csv("listings.csv")

def data_quality_summary(df):

    #Shape of the dataset
    print("\nData shape")
    print(df.shape)

    #Data Type
    print("\nData Type")
    print(df.dtypes)
    
    #Missing values (%)
    print("\nMissing values (%)")
    print((100*df.isna().sum().sort_values(ascending = False))/len(df))

    #Duplicate values
    print("\nDuplicate values")
    print(df.duplicated().sum())

    #Data Description
    print("\nData Desctiption")
    print(df.describe())

data_quality_summary(df)


Data shape
(96871, 18)

Data Type
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group               float64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
license                           float64
dtype: object

Missing values (%)
license                           100.000000
neighbourhood_group               100.000000
price                              36.035552
last_review                        24.90

In [166]:
#converting to date and counting NaT rows

df["last_review"] = pd.to_datetime(df["last_review"].str.replace("$", ""), dayfirst = True, errors="coerce") #coerce will convert not convertible values to NaT (Not a Time)
NaT = df["last_review"].isna().sum()
print(NaT)

24122


In [184]:
# Number out outliers by price

Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

outliers = df[(df["price"] < Q1 - 1.5 * IQR) | (df["price"] > Q3 + 1.5 * IQR)]

print(f"Outliers (%) ): {100*len(outliers)/len(df)}")

Outliers (%) ): 4.330501388444426


**Data Analysis**

In [188]:
#Number of hosts

Hosts = df["host_id"].nunique()

print(f"Totals hosts: {Hosts}")

Totals hosts: 55646


In [192]:
#Total Neighbourhood where Airbnbs are located

Count_Cities = df["neighbourhood"].nunique()

print(f"Total neighbourhoods: {Count_Cities}")

Total neighbourhoods: 33


In [196]:
#Neighbourhood where Airbnbs are located

Cities = df["neighbourhood"].unique()

print("Neighbourhoods")
print(Cities)

Neighbourhoods
['Islington' 'Kensington and Chelsea' 'Westminster' 'Wandsworth'
 'Tower Hamlets' 'Richmond upon Thames' 'Haringey'
 'Hammersmith and Fulham' 'Southwark' 'Barnet' 'Hounslow' 'Waltham Forest'
 'Brent' 'Camden' 'Hackney' 'Merton' 'Croydon' 'Lambeth' 'Havering'
 'Greenwich' 'Enfield' 'City of London' 'Ealing' 'Barking and Dagenham'
 'Lewisham' 'Newham' 'Hillingdon' 'Redbridge' 'Kingston upon Thames'
 'Bromley' 'Harrow' 'Bexley' 'Sutton']


In [200]:
# Room Types

Room = df["room_type"].unique()

print(f"Room Type: {Room}")

Room Type: ['Private room' 'Entire home/apt' 'Hotel room' 'Shared room']


In [31]:
# Avg price per room type

Avg_price_type = df.groupby(["room_type"], as_index = False).agg(avg_price=("price", "mean"))

Avg_price_type

Unnamed: 0,room_type,avg_price
0,Entire home/apt,279.347157
1,Hotel room,657.833333
2,Private room,121.71391
3,Shared room,96.910995


In [33]:
# Avg price of rooms in each neighbourhood

Avg_price_neighbourhood = (
                            df.groupby(["neighbourhood"], as_index = False)
                              .agg(avg_price=("price", "mean"))
                              .sort_values("avg_price", ascending=False)
                              .reset_index(drop=True)
)

Avg_price_neighbourhood

Unnamed: 0,neighbourhood,avg_price
0,Tower Hamlets,430.906199
1,City of London,354.389908
2,Lambeth,345.710741
3,Westminster,342.139405
4,Kensington and Chelsea,336.072148
5,Islington,217.546807
6,Camden,216.511547
7,Hammersmith and Fulham,199.188085
8,Wandsworth,198.431607
9,Richmond upon Thames,184.270936


In [35]:
# Avg price for each room type in each neighbourhood
 
Avgprice_city_room = df.groupby(["neighbourhood", "room_type"], as_index = False).agg(avg_price = ("price", "mean"))
Avgprice_city_room

Unnamed: 0,neighbourhood,room_type,avg_price
0,Barking and Dagenham,Entire home/apt,164.198556
1,Barking and Dagenham,Private room,50.609023
2,Barking and Dagenham,Shared room,111.000000
3,Barnet,Entire home/apt,165.381667
4,Barnet,Hotel room,
...,...,...,...
102,Wandsworth,Shared room,153.166667
103,Westminster,Entire home/apt,373.588714
104,Westminster,Hotel room,1088.454545
105,Westminster,Private room,155.258842


In [202]:
#Average availability throughout a year in each neigbourhood. 

Availability = (
                df.groupby(["neighbourhood"], as_index=False)
                  .agg(avg_availability=("availability_365", "mean"))
                  .sort_values("avg_availability")
                  .reset_index(drop=True)
)
Availability

Unnamed: 0,neighbourhood,avg_availability
0,Hackney,101.109608
1,Islington,111.094519
2,Lambeth,119.35183
3,Tower Hamlets,122.082742
4,Southwark,126.309954
5,Haringey,126.898383
6,Wandsworth,130.701309
7,Lewisham,131.793284
8,Richmond upon Thames,132.544961
9,Hammersmith and Fulham,135.436613


In [204]:
#Number of Airbnbs in each neighbourhood

No_Airbnbs = (
    df.groupby(["neighbourhood"])
      .agg(total_airbnbs=("price", "count"))
      .sort_values("total_airbnbs")
      .reset_index()  # keep neighbourhood as a column
)

No_Airbnbs


Unnamed: 0,neighbourhood,total_airbnbs
0,Sutton,363
1,Havering,430
2,City of London,436
3,Harrow,463
4,Kingston upon Thames,485
5,Bexley,496
6,Barking and Dagenham,545
7,Enfield,669
8,Bromley,673
9,Hillingdon,714


In [41]:
#We are assuming that guests gave review in the same month they stayed in an airbnb

avg_price_month = (
    df.groupby([df["last_review"].dt.to_period("M"), "room_type"])
      .agg(avg_price=("price", "mean"))
      .reset_index()
)

avg_price_month.rename(columns={"last_review": "month"}, inplace=True)


In [43]:
avg_price_month

Unnamed: 0,month,room_type,avg_price
0,2011-07,Private room,
1,2012-03,Private room,781.000000
2,2012-05,Private room,41.000000
3,2012-07,Private room,
4,2012-08,Entire home/apt,
...,...,...,...
357,2025-08,Shared room,152.920000
358,2025-09,Entire home/apt,201.010956
359,2025-09,Hotel room,198.000000
360,2025-09,Private room,69.559445


In [220]:
# For which month the prices are highest, lowest and what's the average in each neightbourhood according to each room type

price_summary = (
    df.groupby(["neighbourhood", "room_type"])
      .agg(
          min_price=("price", "min"),
          max_price=("price", "max"),
          avg_price=("price", "mean")
      ).reset_index()
)
                                           
price_summary

Unnamed: 0,neighbourhood,room_type,min_price,max_price,avg_price
0,Barking and Dagenham,Entire home/apt,40.0,913.0,164.198556
1,Barking and Dagenham,Private room,18.0,350.0,50.609023
2,Barking and Dagenham,Shared room,100.0,122.0,111.000000
3,Barnet,Entire home/apt,27.0,3500.0,165.381667
4,Barnet,Hotel room,,,
...,...,...,...,...,...
102,Wandsworth,Shared room,16.0,489.0,153.166667
103,Westminster,Entire home/apt,45.0,15143.0,373.588714
104,Westminster,Hotel room,188.0,1490.0,1088.454545
105,Westminster,Private room,18.0,10000.0,155.258842


In [225]:
# This will display entire result

print(price_summary.to_string())

              neighbourhood        room_type  min_price  max_price    avg_price
0      Barking and Dagenham  Entire home/apt       40.0      913.0   164.198556
1      Barking and Dagenham     Private room       18.0      350.0    50.609023
2      Barking and Dagenham      Shared room      100.0      122.0   111.000000
3                    Barnet  Entire home/apt       27.0     3500.0   165.381667
4                    Barnet       Hotel room        NaN        NaN          NaN
5                    Barnet     Private room       21.0     4800.0    78.354717
6                    Barnet      Shared room       52.0       52.0    52.000000
7                    Bexley  Entire home/apt       45.0     4800.0   187.541833
8                    Bexley     Private room       18.0      265.0    50.024490
9                     Brent  Entire home/apt       36.0    30812.0   223.972536
10                    Brent     Private room       17.0     6309.0    98.646699
11                    Brent      Shared 

In [257]:
#Number of reviews in the last 12 months and overall

reviews = (
                df.groupby(["name"])
                 .agg(reviews_ltm = ("number_of_reviews_ltm", "sum"), 
                      total_reviews=("number_of_reviews", "sum"))
                 .sort_values("reviews_ltm", ascending = False).reset_index()


)

reviews

Unnamed: 0,name,reviews_ltm,total_reviews
0,"Design Studio in Tower Hill, London",390,478
1,Private Double Room in Warren Street,260,1144
2,Double Room+ Ensuite,246,1902
3,West Kensington En Suite | Easy City Access,241,441
4,Brick Lane | Private En-Suite,227,241
...,...,...,...
93336,Lovely family home by Richmond Park,0,1
93337,Lovely family home close to Richmond.,0,3
93338,Lovely family home close to Wimbledon Park,0,0
93339,Lovely family home in Acton,0,8
