In [41]:
import gzip
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
import plotly.tools as tls
import seaborn as sns
from plotly.subplots import make_subplots

%matplotlib inline
py.init_notebook_mode(connected = True) # make the graph show in the notebook

This is an exploratory analysis of Airbnb listings in Stockholm, the capital of Sweden, in 2025. The data was downloaded from Inside Airbnb.

In [42]:
# Load the per-day calendar table; the gzip archive is opened in text mode for pandas.
with gzip.open('calendar.csv.gz', 'rt') as calendar_file:
    calendar = pd.read_csv(calendar_file)

# Headline numbers: distinct dates and listings, plus the first and last date string.
n_days = calendar['date'].nunique()
n_listings = calendar['listing_id'].nunique()
first_date = calendar['date'].min()
last_date = calendar['date'].max()
print('There are', n_days, 'days and', n_listings, 'different listings')
print('The start date of this dataset is', first_date, 'and the end date is', last_date)

calendar.head()

There are 365 days and 5223 different listings
The start date of this dataset is 2024-12-30 and the end date is 2025-12-29


Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,164448,2024-12-30,t,$950.00,,2,120
1,164448,2024-12-31,f,$950.00,,2,120
2,164448,2025-01-01,f,$950.00,,2,120
3,164448,2025-01-02,f,$950.00,,2,120
4,164448,2025-01-03,t,$950.00,,2,120


Let's start with the overall availability of listings in Stockholm as of the day of this analysis (2025-04-01).
Note that Figure 1 only illustrates to what extent the listings are booked at the time of analysis, which means the results can look very different if conducting a similar analysis later in 2025.

As shown in Figure 1, more than 60% of the listings have already been booked as of 1 April 2025.

In [52]:
# Share of calendar rows per value of the `available` flag.
# The two output columns are named explicitly because the names produced by
# `value_counts().reset_index()` differ between pandas 1.x and pandas 2.x.
df_ratios = (
    calendar['available']
    .value_counts(normalize = True)
    .rename_axis('available')
    .reset_index(name = 'proportion')
)
# Rows flagged 'f' are labelled 'Reserved'; every other flag is labelled 'Available'.
df_ratios['status'] = np.where(df_ratios['available'] == 'f', 'Reserved', 'Available')

fig1 = px.bar(df_ratios, x = 'status', y = 'proportion',
              title= 'Figure 1: Overall  Availability of Listings in Stockholm in 2025 as of 1 April',
              labels = {'status': 'Status', 'proportion': ' ' },
              range_y = [0, .7])
fig1.update_layout(yaxis_tickformat = ".0%")
# Shown with Figure.show() like every other figure in this notebook.
fig1.show()

In [72]:
# Daily share of listings that are not available ("busy").
# `busy` is 0 when `available` is 't' and 1 for any other value, so its mean per date
# is the ratio of booked listings on that date.
# The column is created with .assign on a new frame instead of being written into a
# slice of `calendar`, which would raise pandas' SettingWithCopyWarning.
new_calendar = (
    calendar[['date', 'available']]
    .assign(busy = lambda frame: (frame['available'] != 't').astype(int))
    .groupby('date')['busy']
    .mean()
    .reset_index()
)
new_calendar['date'] = pd.to_datetime(new_calendar['date'])

fig2 = px.line(new_calendar, x = 'date', y = 'busy',
               title = 'Figure 2: Ratios of Booked Airbnb Listings in Stockholm in 2025',
               labels = {'date': '', 'busy': ''})
fig2.update_layout(yaxis_tickformat = '.0%')
fig2.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



Figure 2 shows the ratio of booked listings in Stockholm in 2025. Each value is calculated by dividing the number of booked listings on a given day by the total number of listings. Right before January, when most people have their Christmas break, almost all (~90%) of the listings were reserved; the percentage of booked listings then continued to drop before it went up again in April. The ratio stays relatively stable until November, when there is yet another surge in bookings.

In [None]:
import datetime as dt

# Parse the date strings so the `.dt` accessor can be used on the column.
calendar['date'] = pd.to_datetime(calendar['date'])

# NOTE: despite its name, `booked` is 1 when the listing is AVAILABLE ('t') and 0
# otherwise, so its mean is the share of available listings. The name is kept because
# the plotting cells for Figures 3 and 4 read this column.
calendar['booked'] = (calendar['available'] == 't').astype(int)

# Mean per month name. sort = False keeps the months in order of first appearance in
# the data, and rows from different years that share a month name fall into one group.
month_name = calendar['date'].dt.strftime('%B')
mean_of_month = calendar.groupby(month_name, sort = False)['booked'].mean().reset_index()

In [66]:
# Figure 3: one horizontal bar per month showing the mean of the `booked` column,
# which the axis label presents as the ratio of available listings.
fig3_labels = {'date': '', 'booked': 'Ratio of Available Listings'}
fig3 = px.bar(
    mean_of_month,
    x = 'booked',
    y = 'date',
    orientation = 'h',
    title = 'Figure 3: Average Availability in Stockholm in 2025 by Month',
    labels = fig3_labels,
)
fig3.update_layout(xaxis = dict(tickformat = ".0%"))
fig3.show()

Higher values indicate a larger share of available listings. Figure 3 shows that at the beginning of spring there are plenty of available listings.

When do people go out the most within a week? Figure 4 shows that availability is lowest on Fridays and Saturdays, which partially aligns with the common knowledge that people go out on weekends. However, it is worth noting that the difference between days is in fact very small.

In [84]:
# Mean of the `booked` column per weekday (plotted as the ratio of available listings).
calendar['dayofweek'] = calendar['date'].dt.day_name()
# Fixed Monday-to-Sunday order, so the x-axis does not depend on which weekday the
# first row of the dataset falls on.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# `price_week` holds the weekday-indexed Series; `df_week` is the same data as a
# two-column frame for plotly.
price_week = calendar.groupby('dayofweek')['booked'].mean().reindex(days)
df_week = price_week.reset_index()
fig4 = px.line(df_week, x = 'dayofweek', y = 'booked', title= 'Figure 4: Average Availability by Day of Week in Stockholm in 2025',
               labels = {'dayofweek': '', 'booked': 'Ratio of Available Listings'})
fig4.update_xaxes(tickangle= 45)
fig4.update_layout(yaxis_tickformat = '.1%')
fig4.show()

In [83]:
# Load the listings table and report how many distinct listing ids it contains.
listings = pd.read_csv('./listings.csv')
n_listings_total = listings['id'].nunique()
print('There are a total of', n_listings_total, 'listings in Stockholm as of 2025.')

There are a total of 5223 listings in Stockholm as of 2025.


In [68]:
# Number of listings (non-null ids) per neighbourhood, smallest first.
listing_counts = listings.groupby('neighbourhood')['id'].count()
by_area = listing_counts.sort_values(ascending = True).reset_index()

fig5 = px.bar(
    by_area,
    x = 'neighbourhood',
    y = 'id',
    title = 'Figure 5: Number of Listings in Stockholm in 2025 by Area',
    labels = {'neighbourhood': 'Area', 'id': 'Counts'},
)
fig5.show()

The minimum price per night is \$ 105 and the maximum price per night is as high as \$ 500000, which makes the initial visualization hard to read. Thus, I removed the outliers whose price per night is \$ 10000 or more; this led to the removal of 25 listings, which account for less than 1% of the listings in Stockholm.

As shown in *Figure 6*, the median price per night does not differ much between areas, but some areas, such as Sodermalms and Norrmalms, have a considerable number of listings whose price is much higher than the average.

In [69]:
# Distribution of price per night by area.
# The notebook text reports a minimum of 105 and a maximum of 500000, so listings priced
# at or above PRICE_CAP are dropped to keep the box plot readable. Rows with a missing
# price fail the comparison and are dropped as well.
PRICE_CAP = 10000
listings_cleaned = listings[listings['price'] < PRICE_CAP]

# Areas are drawn in the order stored in `by_area` (ascending listing count).
area_order = by_area['neighbourhood'].tolist()
fig6 = px.box(listings_cleaned, x = 'neighbourhood', y ='price',
              title = 'Figure 6: Price of Airbnb Listing Per Night in Stockholm in 2025 by Area',
              labels = {'neighbourhood': 'Area', 'price': 'Price'},
              category_orders = {'neighbourhood': area_order})
fig6.show()


In [70]:
# Figure 7: histogram of nightly price for each room type on a 2 x 2 grid.
# Panels in the same row share an x-range and all panels share one bin width, so the
# histograms can be compared directly.
PRICE_BIN_WIDTH = 100

# (value in the `room_type` column, legend label, subplot row, subplot column)
room_type_panels = [
    ('Private room', 'Private Room', 1, 1),
    ('Entire home/apt', 'Entire Home/Apt', 1, 2),
    ('Shared room', 'Share Room', 2, 1),
    ('Hotel room', 'Hotel Room', 2, 2),
]

# Upper x-limit per row. The maximum is taken over both room types of the row in one
# pandas call, which skips missing values even when one room type has no listings.
top_row_types = ['Private room', 'Entire home/apt']
bottom_row_types = ['Shared room', 'Hotel room']
max_price_pv_entire = listings_cleaned.loc[
    listings_cleaned['room_type'].isin(top_row_types), 'price'].max()
# The bottom row keeps 10% of headroom to the right of its largest price.
max_price_share_hotel = listings_cleaned.loc[
    listings_cleaned['room_type'].isin(bottom_row_types), 'price'].max() * 1.1
row_x_max = {1: max_price_pv_entire, 2: max_price_share_hotel}

fig_price_by_type = make_subplots(rows = 2, cols = 2)
for room_type, legend_label, row, col in room_type_panels:
    prices = listings_cleaned.loc[listings_cleaned['room_type'] == room_type, 'price']
    x_max = row_x_max[row]
    # add_trace with row/col replaces append_trace, which plotly has deprecated.
    fig_price_by_type.add_trace(
        go.Histogram(x = prices, name = legend_label,
                     xbins = dict(size = PRICE_BIN_WIDTH, end = x_max)),
        row = row, col = col)
    fig_price_by_type.update_xaxes(range = [0, x_max], row = row, col = col)

# adjust overall layout
fig_price_by_type.update_layout(height=600, width=800,
                                title_text= 'Figure 7: Airbnb Price Distribution by Room Type in Stockholm in 2025')
fig_price_by_type.show()

In [None]:
# Count how often each amenity appears across the detailed listings file.
with gzip.open('listings.csv.gz') as file:
    listings_complete = pd.read_csv(file)

# Each `amenities` cell is parsed with json.loads; the flattening below iterates over
# the parsed value, so every cell is assumed to hold a JSON array — TODO confirm.
listings_complete['amenities'] = listings_complete['amenities'].apply(json.loads)

# Flatten in a single pass. sum(list_of_lists, []) builds a new list on every addition,
# which is quadratic in the total number of items.
all_items = [item for items in listings_complete['amenities'] for item in items]
am_counts = pd.Series(all_items).value_counts()
df_am = am_counts.reset_index()
df_am.columns = ['amenity', 'count']

# Drop the amenities listed below, then keep the 20 most frequent of the remainder.
none_essentitals = ['Smoke alarm', 'Hangers', 'Fire extinguisher', 'Hot water']
df_am_filtered = df_am[~df_am['amenity'].isin(none_essentitals)].head(20)

In [71]:
# Figure 8: bar chart of the 20 most frequent amenities held in `df_am_filtered`.
# The figure object is stored under the name `fig7` even though the title says Figure 8.
fig7 = px.bar(df_am_filtered, x = 'amenity', y = 'count',
              title = 'Figure 8: Top 20 Amenities in Airbnb Listings in Stockholm in 2025',
              labels = {'amenity': '', 'count': 'Count'})
fig7.show()
