In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels

import seaborn as sn

In [2]:
# read Inside Airbnb dataframe
df_2020 = pd.read_csv('2020IA.csv',low_memory=False)
df_2022 = pd.read_csv('2022IA.csv',low_memory=False)

In [3]:
# check some example numbers
print(df_2020.head(3))
print(df_2022.head(3))

        id                         listing_url       scrape_id last_scraped  \
0  11551.0  https://www.airbnb.com/rooms/11551  20200824024753   2020-08-26   
1  13913.0  https://www.airbnb.com/rooms/13913  20200824024753   2020-08-27   
2  15400.0  https://www.airbnb.com/rooms/15400  20200824024753   2020-08-26   

                                         name  \
0  Arty and Bright London Apartment in Zone 2   
1         Holiday London DB Room Let-on going   
2         Bright Chelsea  Apartment. Chelsea!   

                                         description  \
0  Unlike most rental apartments out there my fla...   
1  My bright double bedroom with a large window h...   
2  Lots of windows and light.  St Luke's Gardens ...   

                               neighborhood_overview  \
0  Not even 10 minutes by metro from Victoria Sta...   
1  Finsbury Park is a friendly melting pot commun...   
2                                     It is Chelsea.   

                                    

In [4]:
# print the shape of the DataFrame
print(df_2020.shape)
print(df_2022.shape)

(74188, 74)
(66640, 18)


In [5]:
# print the column names
print(df_2020.columns)
print(df_2022.columns)

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

In [7]:
# pick the columns that we need
df_2020_pick = df_2020.loc[:,['id', 'price', 'number_of_reviews_ltm','reviews_per_month']]
df_2022_pick = df_2022.loc[:,['id', 'room_type', 'price', 'latitude', 'longitude', 'minimum_nights', 'reviews_per_month']]

In [8]:
# rename the columns
df_2020_rename = df_2020_pick.rename(columns = {'price':'price_2020',
                                                'number_of_reviews_ltm':'number_of_reviews_ltm_2020',
                                                'reviews_per_month':'reviews_per_month_2020'})
df_2022_rename = df_2022_pick.rename(columns = {'price':'price_2022',
                                                'reviews_per_month':'reviews_per_month_2022'})
print(df_2020_rename.columns)
print(df_2022_rename.columns)

Index(['id', 'price_2020', 'number_of_reviews_ltm_2020',
       'reviews_per_month_2020'],
      dtype='object')
Index(['id', 'room_type', 'price_2022', 'latitude', 'longitude',
       'minimum_nights', 'reviews_per_month_2022'],
      dtype='object')


In [9]:
# join dataframes
df = pd.merge(df_2020_rename, df_2022_rename, how='inner', on=['id'])

In [10]:
# compair different room types
cnt_room_types = df.groupby('room_type')['id'].count()
print(cnt_room_types)

room_type
Entire home/apt    23002
Hotel room           252
Private room       19790
Shared room          353
Name: id, dtype: int64


In [11]:
#delete $ and change datatype
df['price_2020'] = df['price_2020'].mask(df['price_2020'].str.contains('\,')==True,df['price_2020'].str.replace(',',''))
df['price_2020'] = df['price_2020'].str.split('$').str[1]
df['price_2020'] = df['price_2020'].astype('float64')
df.dtypes

id                            float64
price_2020                    float64
number_of_reviews_ltm_2020    float64
reviews_per_month_2020        float64
room_type                      object
price_2022                      int64
latitude                      float64
longitude                     float64
minimum_nights                  int64
reviews_per_month_2022        float64
dtype: object

In [12]:
# delete NaN
df['reviews_per_month_2020'] = np.nan_to_num(df['reviews_per_month_2020'])
df['reviews_per_month_2022'] = np.nan_to_num(df['reviews_per_month_2022'])

In [13]:
# data cleaning
df_cleaned = df[
    (df["number_of_reviews_ltm_2020"] != 0) & 
    (df["reviews_per_month_2020"] != 0) & 
    (df["reviews_per_month_2022"] != 0) & 
    (df["minimum_nights"] <= 5)]

In [14]:
# create new columns
df_cleaned.loc[:,'price_increase']=(df_cleaned.price_2022 - df_cleaned.price_2020)/df_cleaned.price_2020
df_cleaned.loc[:,'reviews_per_month_increase']=(df_cleaned.reviews_per_month_2022 - df_cleaned.reviews_per_month_2020)/df_cleaned.reviews_per_month_2020
df_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:,'price_increase']=(df_cleaned.price_2022 - df_cleaned.price_2020)/df_cleaned.price_2020
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:,'reviews_per_month_increase']=(df_cleaned.reviews_per_month_2022 - df_cleaned.reviews_per_month_2020)/df_cleaned.reviews_per_month_2020


Unnamed: 0,id,price_2020,number_of_reviews_ltm_2020,reviews_per_month_2020,room_type,price_2022,latitude,longitude,minimum_nights,reviews_per_month_2022,price_increase,reviews_per_month_increase
0,15400.0,75.0,3.0,0.68,Entire home/apt,75,51.4878,-0.16813,3,0.6,0.0,-0.117647
1,17402.0,208.0,1.0,0.37,Entire home/apt,307,51.52195,-0.14094,4,0.34,0.475962,-0.081081
4,33332.0,65.0,1.0,0.05,Private room,65,51.46416,-0.32554,2,0.09,0.0,0.8
5,36299.0,195.0,4.0,0.64,Entire home/apt,195,51.48085,-0.28086,3,0.64,0.0,0.0
6,36660.0,72.0,19.0,4.36,Private room,72,51.58478,-0.16057,2,3.93,0.0,-0.098624


In [15]:
# data cleaning again
df_cleaned2 = df_cleaned[(df_cleaned["price_increase"] != 0)]

In [16]:
df_pick = df_cleaned2.loc[:,['latitude', 'longitude', 'price_increase', 'reviews_per_month_increase']]
print(df_pick)

       latitude  longitude  price_increase  reviews_per_month_increase
1      51.52195   -0.14094        0.475962                   -0.081081
8      51.50681   -0.23345        0.068966                   -0.127451
15     51.61492   -0.25632        2.121099                   -0.190476
16     51.49351   -0.25568        2.571429                   -0.142857
17     51.50191   -0.10204        0.144928                   -0.177215
...         ...        ...             ...                         ...
43261  51.50264   -0.27684       -0.038462                    0.380000
43263  51.49435   -0.18978        0.035088                    0.935000
43283  51.51762   -0.13637        0.416000                   -0.260000
43349  51.49202   -0.14395        4.405405                   28.150000
43353  51.55142   -0.04937       10.166667                   -0.310000

[9427 rows x 4 columns]
