In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.impute import KNNImputer
import datetime as dt

In [None]:
# Load the listings CSV into a DataFrame; the path is relative to the
# directory the notebook is run from.
airbnb_data = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')

In [None]:
# Preview the first five rows to check that the file parsed as expected.
airbnb_data.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [None]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes.
airbnb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [None]:
# Summary statistics for the columns that describe() covers.
# NOTE(review): the saved output of this cell lists days_since_last_review,
# popularity_score and estimated_annual_revenue, which are only created in
# later cells -- the cell was last executed out of order. Re-run the notebook
# top to bottom (Restart Kernel -> Run All) so the output matches this position.
airbnb_data.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,days_since_last_review,popularity_score,estimated_annual_revenue
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,2018-10-04 01:47:23.910099456,1.09091,7.143982,112.781327,295.032089,32.442297,9905.284
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,2011-03-28 00:00:00,0.0,1.0,0.0,0.0,0.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,2018-07-08 00:00:00,0.04,1.0,0.0,19.0,0.147929,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,2019-05-19 00:00:00,0.37,1.0,45.0,186.0,4.040404,2146.5
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2019-06-23 00:00:00,1.58,2.0,227.0,365.0,90.719208,11257.5
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,2019-07-08 00:00:00,58.5,327.0,365.0,3024.0,100.0,1824818.0
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,,1.597283,32.952519,131.622289,370.636506,42.295873,29503.01


In [None]:
# Number of missing entries in each column.
print(airbnb_data.isna().sum())

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [None]:
# Parse 'last_review' into datetimes; entries that cannot be parsed
# (including the missing ones) are coerced to NaT instead of raising.
airbnb_data['last_review'] = pd.to_datetime(airbnb_data['last_review'], errors='coerce')

# Whole days elapsed between each listing's last review and a fixed
# reference date. Rows with no review date stay missing at this point.
today = pd.to_datetime('2019-07-08')
airbnb_data['days_since_last_review'] = (today - airbnb_data['last_review']).dt.days

# Listings without a review date get a placeholder of 365 days.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment,
# which pandas warns about and which no longer updates the frame from
# pandas 3.0 on.
airbnb_data['days_since_last_review'] = airbnb_data['days_since_last_review'].fillna(365)

# A listing with no reviews at all has a review rate of zero.
airbnb_data.loc[airbnb_data['number_of_reviews'] == 0, 'reviews_per_month'] = 0

# Impute whatever is still missing in reviews_per_month.
# NOTE(review): only this one column is handed to the imputer, so there are
# no other features to measure neighbour distance on -- presumably this
# degenerates to filling with the column mean; confirm that is intended or
# pass additional numeric columns.
imputer = KNNImputer(n_neighbors=5)
airbnb_data[['reviews_per_month']] = imputer.fit_transform(airbnb_data[['reviews_per_month']])


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [None]:
def categorize_price(price):
    """Map a listing price to a named price tier.

    Tiers are defined by inclusive upper bounds: up to 50 is 'Budget',
    up to 100 'Economy', up to 200 'Standard', up to 500 'Premium'.
    Any value that satisfies none of the bounds is labelled 'Luxury'.

    Args:
        price: Numeric price of the listing.

    Returns:
        The tier label as a string.
    """
    # Ordered from the cheapest tier upwards; the first bound that the price
    # does not exceed decides the label.
    tiers = (
        (50, 'Budget'),
        (100, 'Economy'),
        (200, 'Standard'),
        (500, 'Premium'),
    )
    for upper_bound, label in tiers:
        if price <= upper_bound:
            return label
    return 'Luxury'

In [None]:
# Label every listing with its price tier, one price at a time.
airbnb_data['price_category'] = airbnb_data['price'].map(categorize_price)

In [None]:
# Popularity score: reviews per day since the last review, scaled by 100 and
# capped at 100. The +1 keeps the divisor non-zero for listings whose last
# review falls on the reference date (0 days).
airbnb_data['popularity_score'] = (
    airbnb_data['number_of_reviews']
    .div(airbnb_data['days_since_last_review'] + 1)
    .mul(100)
    .clip(upper=100)
)

In [None]:
# Revenue estimate: price multiplied by half of the availability_365 value.
# NOTE(review): the 0.5 factor is an unexplained constant -- presumably an
# assumed occupancy rate; confirm and document where it comes from.
half_availability = airbnb_data['availability_365'].mul(0.5)
airbnb_data['estimated_annual_revenue'] = airbnb_data['price'].mul(half_availability)

In [None]:
def _host_type(listing_count):
    """Classify a host by how many listings they have.

    More than 10 listings is 'Professional', more than 1 is
    'Multiple Listings', anything else is 'Single Listing'.
    """
    if listing_count > 10:
        return 'Professional'
    if listing_count > 1:
        return 'Multiple Listings'
    return 'Single Listing'


airbnb_data['host_type'] = airbnb_data['calculated_host_listings_count'].map(_host_type)

In [None]:
# Show the engineered features next to id and price for the first rows.
airbnb_data.loc[:, [
    'id',
    'price',
    'price_category',
    'days_since_last_review',
    'popularity_score',
    'estimated_annual_revenue',
    'host_type',
]].head()

Unnamed: 0,id,price,price_category,days_since_last_review,popularity_score,estimated_annual_revenue,host_type
0,2539,149,Standard,262.0,3.422053,27192.5,Multiple Listings
1,2595,225,Premium,48.0,91.836735,39937.5,Multiple Listings
2,3647,150,Standard,365.0,0.0,27375.0,Single Listing
3,3831,89,Economy,3.0,100.0,8633.0,Single Listing
4,5022,80,Economy,231.0,3.87931,0.0,Single Listing


## analysis questions
1. How does the price vary across different neighborhoods?
2. What factors influence a listing's popularity score?
3. How does the room type affect the estimated annual revenue?

In [None]:
# Histogram of listing prices in 50 bins; the count axis is logarithmic.
fig1 = px.histogram(
    airbnb_data,
    x='price',
    nbins=50,
    log_y=True,
    title='distribution of listing prices',
    labels={'price': 'Price'},
)
fig1.update_layout(bargap=0.2)
fig1.show()

In [None]:
# Count listings per room type and plot the counts as bars.
# rename_axis / reset_index(name=...) pin the column names to 'room_type' and
# 'count' explicitly, so the x/y arguments below resolve on any pandas
# version. Before pandas 2.0, value_counts().reset_index() produced the
# columns 'index' and 'room_type', and y='count' would not exist.
room_type_counts = (
    airbnb_data['room_type']
    .value_counts()
    .rename_axis('room_type')
    .reset_index(name='count')
)
fig2 = px.bar(
    room_type_counts,
    x='room_type',
    y='count',
    title='distribution of room types',
    labels={'room_type': 'room type', 'count': 'count'},
)
fig2.show()

In [None]:
# Box plot of price for each neighbourhood_group value.
fig3 = px.box(
    airbnb_data,
    x='neighbourhood_group',
    y='price',
    title='price distribution by borough',
    labels={'neighbourhood_group': 'neighbourhood_group', 'price': 'price'},
)
fig3.show()

In [None]:
# Scatter of popularity score against price, one colour per room type.
fig4 = px.scatter(
    airbnb_data,
    x='price',
    y='popularity_score',
    color='room_type',
    title='popularity score and price by room type',
    labels={'price': 'price', 'popularity_score': 'popularity_score'},
)
fig4.show()

In [None]:
# Violin plot of the estimated annual revenue for each room type.
fig5 = px.violin(
    airbnb_data,
    x='room_type',
    y='estimated_annual_revenue',
    title='estimated annual revenue by room type',
    labels={
        'room_type': 'room type',
        'estimated_annual_revenue': 'estimated_annual_revenue',
    },
)
fig5.show()

In [None]:
# Pairwise correlations (DataFrame.corr with its default method) between the
# numeric listing features and the derived popularity score.
corr = airbnb_data[[
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'popularity_score',
]].corr()

# Without annotation_text the heatmap prints the raw z values in each cell,
# i.e. full-precision floats that overlap and cannot be read. The colours are
# still driven by the exact values; only the printed labels are rounded.
fig6 = ff.create_annotated_heatmap(
    corr.values,
    x=corr.columns.tolist(),
    y=corr.index.tolist(),
    annotation_text=corr.round(2).values.tolist(),
    colorscale='RdBu_r',
    showscale=True,
)
fig6.update_layout(title='correlation_matrix')
fig6.show()

In [None]:
# Mean price per room type, kept as a two-column frame (room_type, price).
avg_price_room = airbnb_data.groupby('room_type', as_index=False)['price'].mean()
print(avg_price_room)

         room_type       price
0  Entire home/apt  211.794246
1     Private room   89.780973
2      Shared room   70.127586


In [None]:

# Share of listings in each host_type, expressed in percent.
host_type_share = airbnb_data['host_type'].value_counts(normalize=True)
host_type_percentage = host_type_share.mul(100)
print(host_type_percentage)

host_type
Single Listing       66.066060
Multiple Listings    27.865835
Professional          6.068105
Name: proportion, dtype: float64


In [None]:

# Five neighbourhoods with the highest mean popularity score.
# NOTE(review): in the saved output all five sit exactly at the cap of 100,
# so the ranking among them is decided by tie order only -- presumably these
# are neighbourhoods with very few listings; consider a minimum listing count.
mean_popularity = airbnb_data.groupby('neighbourhood')['popularity_score'].mean()
avg_neighbourhoods = mean_popularity.nlargest(n=5)
print(avg_neighbourhoods)

neighbourhood
Huguenot        100.0
Neponsit        100.0
Richmondtown    100.0
Rossville       100.0
Silver Lake     100.0
Name: popularity_score, dtype: float64
