In [28]:
import numpy as np
import pandas as pd
import datetime as dt

In [13]:
# Step 1. Importing the data

# Read the comma-separated file prices.csv into `prices`
prices = pd.read_csv("prices.csv")

# Open the workbook room_types.xlsx as `xls` and parse its first sheet
# (position 0) into `room_types`. The `with` block closes the workbook's
# file handle as soon as parsing is done instead of leaving it open for
# the rest of the session; `xls` stays bound afterwards, but closed.
with pd.ExcelFile("room_types.xlsx") as xls:
    room_types = xls.parse(0)

# Read the tab-separated file reviews.tsv into `reviews`
reviews = pd.read_csv("reviews.tsv", sep="\t")

# Show the three freshly loaded frames
print(prices)
print(room_types)
print(reviews)

       listing_id        price                 nbhood_full
0            2595  225 dollars          Manhattan, Midtown
1            3831   89 dollars      Brooklyn, Clinton Hill
2            5099  200 dollars      Manhattan, Murray Hill
3            5178   79 dollars   Manhattan, Hell's Kitchen
4            5238  150 dollars        Manhattan, Chinatown
...           ...          ...                         ...
25204    36425863  129 dollars  Manhattan, Upper East Side
25205    36427429   45 dollars            Queens, Flushing
25206    36438336  235 dollars  Staten Island, Great Kills
25207    36442252  100 dollars           Bronx, Mott Haven
25208    36455809   30 dollars          Brooklyn, Bushwick

[25209 rows x 3 columns]
       listing_id                                 description        room_type
0            2595                       Skylit Midtown Castle  Entire home/apt
1            3831             Cozy Entire Floor of Brownstone  Entire home/apt
2            5099   Large Coz

In [22]:
# Step 2. Cleaning the price column

# One pass over the column: cast every value to str, cut the literal
# " dollars" suffix, then parse what remains as a number
prices["price"] = pd.to_numeric(
    prices["price"]
    .astype(str)
    .str.replace(" dollars", "")
)

print(prices)

       listing_id  price                 nbhood_full
0            2595    225          Manhattan, Midtown
1            3831     89      Brooklyn, Clinton Hill
2            5099    200      Manhattan, Murray Hill
3            5178     79   Manhattan, Hell's Kitchen
4            5238    150        Manhattan, Chinatown
...           ...    ...                         ...
25204    36425863    129  Manhattan, Upper East Side
25205    36427429     45            Queens, Flushing
25206    36438336    235  Staten Island, Great Kills
25207    36442252    100           Bronx, Mott Haven
25208    36455809     30          Brooklyn, Bushwick

[25209 rows x 3 columns]


In [38]:
# Step 3. Calculating average price

# Boolean mask that is True for every listing priced at exactly 0, free_listings
free_listings = prices["price"] == 0

# Keep only the listings that are not free. The explicit .copy() makes
# `prices` an independent DataFrame rather than a filtered selection of the
# previous one, so columns can be added to it later without pandas raising
# SettingWithCopyWarning.
prices = prices.loc[~free_listings].copy()

# Mean of the remaining prices, rounded to 2 decimals, avg_price
avg_price = round(prices["price"].mean(), 2)

# Report the flagged rows. The mask has no True entries when no listing is
# priced at 0, e.g. when this cell is run again after the free listings
# have already been removed from `prices`.
if free_listings.any():
    print(free_listings[free_listings])
else:
    print("No True values in free_listings")
print()
print(prices)
print()
print(f"avg_price: {avg_price:.2f}")

No True values in free_listings

       listing_id  price                 nbhood_full
0            2595    225          Manhattan, Midtown
1            3831     89      Brooklyn, Clinton Hill
2            5099    200      Manhattan, Murray Hill
3            5178     79   Manhattan, Hell's Kitchen
4            5238    150        Manhattan, Chinatown
...           ...    ...                         ...
25204    36425863    129  Manhattan, Upper East Side
25205    36427429     45            Queens, Flushing
25206    36438336    235  Staten Island, Great Kills
25207    36442252    100           Bronx, Mott Haven
25208    36455809     30          Brooklyn, Bushwick

[25202 rows x 3 columns]

avg_price: 141.82


In [45]:
# Step 4. Comparing costs to the private rental market

# price_per_month: the price scaled by 365 / 12, kept to 2 decimals
prices["price_per_month"] = (prices["price"] * 365 / 12).round(2)

# Mean of price_per_month, and how far it sits above the 3100 baseline
# (presumably the private-market monthly figure, going by the printed label)
average_price_per_month = round(prices["price_per_month"].mean(), 2)
difference = round(average_price_per_month - 3100, 2)

# Each item is followed by one blank line, except the last
print(prices, end="\n\n")
print(f"average_price_per_month: {average_price_per_month:.2f}", end="\n\n")
print(f"airbnb_minus_private_monthly: {difference:.2f}")

       listing_id  price                 nbhood_full  price_per_month
0            2595    225          Manhattan, Midtown          6843.75
1            3831     89      Brooklyn, Clinton Hill          2707.08
2            5099    200      Manhattan, Murray Hill          6083.33
3            5178     79   Manhattan, Hell's Kitchen          2402.92
4            5238    150        Manhattan, Chinatown          4562.50
...           ...    ...                         ...              ...
25204    36425863    129  Manhattan, Upper East Side          3923.75
25205    36427429     45            Queens, Flushing          1368.75
25206    36438336    235  Staten Island, Great Kills          7147.92
25207    36442252    100           Bronx, Mott Haven          3041.67
25208    36455809     30          Brooklyn, Bushwick           912.50

[25202 rows x 4 columns]

average_price_per_month: 4313.61

airbnb_minus_private_monthly: 1213.61


In [48]:
# Step 5. Cleaning the room_type column

# Lowercase every label, then store the column with the category dtype
room_types["room_type"] = (
    room_types["room_type"]
    .str.lower()
    .astype("category")
)

# Number of rows per room type, room_frequencies
room_frequencies = room_types["room_type"].value_counts()

print(room_types, end="\n\n")
print(room_frequencies)

       listing_id                                 description        room_type
0            2595                       Skylit Midtown Castle  entire home/apt
1            3831             Cozy Entire Floor of Brownstone  entire home/apt
2            5099   Large Cozy 1 BR Apartment In Midtown East  entire home/apt
3            5178             Large Furnished Room Near B'way     private room
4            5238          Cute & Cozy Lower East Side 1 bdrm  entire home/apt
...           ...                                         ...              ...
25204    36425863  Lovely Privet Bedroom with Privet Restroom     private room
25205    36427429                    No.2 with queen size bed     private room
25206    36438336                             Seas The Moment     private room
25207    36442252               1B-1B apartment near by Metro  entire home/apt
25208    36455809     Cozy Private Room in Bushwick, Brooklyn     private room

[25209 rows x 3 columns]

room_type
entire home/apt

In [51]:
# Step 6. What timeframe are we working with?

# Parse last_review into pandas datetimes
reviews["last_review"] = pd.to_datetime(reviews["last_review"])

# Calendar-date view of the parsed column, shared by both extremes below
review_dates = reviews["last_review"].dt.date

# first_reviewed is the earliest date, last_reviewed the most recent one
first_reviewed = review_dates.min()
last_reviewed = review_dates.max()

print(reviews, end="\n\n")
print(f"earliest_review_date: {first_reviewed}", end="\n\n")
print(f"latest_review_date: {last_reviewed}")

       listing_id    host_name last_review
0            2595     Jennifer  2019-05-21
1            3831  LisaRoxanne  2019-07-05
2            5099        Chris  2019-06-22
3            5178     Shunichi  2019-06-24
4            5238          Ben  2019-06-09
...           ...          ...         ...
25204    36425863        Rusaa  2019-07-07
25205    36427429         H Ai  2019-07-07
25206    36438336          Ben  2019-07-07
25207    36442252       Blaine  2019-07-07
25208    36455809    Christine  2019-07-08

[25209 rows x 3 columns]

earliest_review_date: 2019-01-01

latest_review_date: 2019-07-09


In [53]:
# Step 7. Joining the DataFrames

# Outer-join prices with room_types on listing_id, rooms_and_prices
rooms_and_prices = pd.merge(prices, room_types, on="listing_id", how="outer")

# Outer-join that result with reviews on listing_id, airbnb_merged
airbnb_merged = pd.merge(rooms_and_prices, reviews, on="listing_id", how="outer")

# Remove, in place, every row that has a missing value in any column
airbnb_merged.dropna(how="any", inplace=True)

print(airbnb_merged)

       listing_id  price                 nbhood_full  price_per_month  \
0            2595  225.0          Manhattan, Midtown          6843.75   
1            3831   89.0      Brooklyn, Clinton Hill          2707.08   
2            5099  200.0      Manhattan, Murray Hill          6083.33   
3            5178   79.0   Manhattan, Hell's Kitchen          2402.92   
4            5238  150.0        Manhattan, Chinatown          4562.50   
...           ...    ...                         ...              ...   
25197    36425863  129.0  Manhattan, Upper East Side          3923.75   
25198    36427429   45.0            Queens, Flushing          1368.75   
25199    36438336  235.0  Staten Island, Great Kills          7147.92   
25200    36442252  100.0           Bronx, Mott Haven          3041.67   
25201    36455809   30.0          Brooklyn, Bushwick           912.50   

                                      description        room_type  \
0                           Skylit Midtown Castle  en

In [56]:
# Step 8. Analyzing listing prices by NYC borough

# borough is the part of nbhood_full that precedes the first comma
# (first column of the three-way partition around ",")
airbnb_merged["borough"] = airbnb_merged["nbhood_full"].str.partition(",").iloc[:, 0]

# Per-borough sum, mean, median and count of price, rounded to 2 decimals
# and ordered from the highest mean to the lowest
boroughs = (
    airbnb_merged.groupby("borough")["price"]
    .agg(["sum", "mean", "median", "count"])
    .round(2)
    .sort_values(by="mean", ascending=False)
)

print(airbnb_merged, end="\n\n")
print(boroughs)

       listing_id  price                 nbhood_full  price_per_month  \
0            2595  225.0          Manhattan, Midtown          6843.75   
1            3831   89.0      Brooklyn, Clinton Hill          2707.08   
2            5099  200.0      Manhattan, Murray Hill          6083.33   
3            5178   79.0   Manhattan, Hell's Kitchen          2402.92   
4            5238  150.0        Manhattan, Chinatown          4562.50   
...           ...    ...                         ...              ...   
25197    36425863  129.0  Manhattan, Upper East Side          3923.75   
25198    36427429   45.0            Queens, Flushing          1368.75   
25199    36438336  235.0  Staten Island, Great Kills          7147.92   
25200    36442252  100.0           Bronx, Mott Haven          3041.67   
25201    36455809   30.0          Brooklyn, Bushwick           912.50   

                                      description        room_type  \
0                           Skylit Midtown Castle  en

In [59]:
# Step 9. Price range by borough

# Names of the price tiers, in the same order as the bins below, label_names
label_names = ["Budget", "Average", "Expensive", "Extravagant"]

# Bin edges for the tiers, ranges. With pd.cut's default right-closed bins
# these are (0, 69], (69, 175], (175, 350] and (350, inf].
ranges = [0, 69, 175, 350, np.inf]

# Assign each listing its tier in a new categorical column, price_range
airbnb_merged["price_range"] = pd.cut(airbnb_merged["price"], bins=ranges, labels=label_names)

# Count occurrences of each tier within each borough, prices_by_borough.
# price_range is categorical, so observed=False is passed explicitly: it keeps
# borough/tier combinations that have no listings as 0 counts, and avoids
# relying on the deprecated implicit default of `observed`.
prices_by_borough = airbnb_merged.groupby(["borough", "price_range"], observed=False)["price_range"].count()

print(airbnb_merged)
print()
print(prices_by_borough)

       listing_id  price                 nbhood_full  price_per_month  \
0            2595  225.0          Manhattan, Midtown          6843.75   
1            3831   89.0      Brooklyn, Clinton Hill          2707.08   
2            5099  200.0      Manhattan, Murray Hill          6083.33   
3            5178   79.0   Manhattan, Hell's Kitchen          2402.92   
4            5238  150.0        Manhattan, Chinatown          4562.50   
...           ...    ...                         ...              ...   
25197    36425863  129.0  Manhattan, Upper East Side          3923.75   
25198    36427429   45.0            Queens, Flushing          1368.75   
25199    36438336  235.0  Staten Island, Great Kills          7147.92   
25200    36442252  100.0           Bronx, Mott Haven          3041.67   
25201    36455809   30.0          Brooklyn, Bushwick           912.50   

                                      description        room_type  \
0                           Skylit Midtown Castle  en

In [73]:
# Step 10. Storing the final result

# Gather the values computed in the earlier steps under one dictionary
solution = {'avg_price':avg_price,
            'average_price_per_month': average_price_per_month,  
            'difference':difference,          
            'room_frequencies':room_frequencies, 
            'first_reviewed': first_reviewed,
            'last_reviewed': last_reviewed,
            'prices_by_borough':prices_by_borough}

# Print every stored value, one blank line between items
print(f"avg_price: {avg_price:.2f}")
print()
print(f"average_price_per_month: {average_price_per_month:.2f}")
print()
# Two fixed decimals, matching how Step 4 prints this same value
print(f"airbnb_minus_private_monthly: {difference:.2f}")
print()
print(room_frequencies)
print()
print(f"earliest_review_date: {first_reviewed}")
print()
print(f"latest_review_date: {last_reviewed}")
print()
print(prices_by_borough)

avg_price: 141.82

average_price_per_month: 4313.61

airbnb_minus_private_monthly: 1213.61

room_type
entire home/apt    13266
private room       11356
shared room          587
Name: count, dtype: int64

earliest_review_date: 2019-01-01

latest_review_date: 2019-07-09

borough        price_range
Bronx          Budget          381
               Average         285
               Expensive        25
               Extravagant       5
Brooklyn       Budget         3194
               Average        5532
               Expensive      1466
               Extravagant     259
Manhattan      Budget         1148
               Average        5285
               Expensive      3072
               Extravagant     810
Queens         Budget         1631
               Average        1505
               Expensive       291
               Extravagant      28
Staten Island  Budget          124
               Average         123
               Expensive        20
               Extravagant       0
Nam