# Short-Term Rental Prices in NYC: EDA

This notebook performs exploratory data analysis (EDA) on a sample dataset pulled from Weights & Biases (W&B).

In [1]:
import os
import wandb
import ydata_profiling
import matplotlib.pyplot as plt
import pandas as pd

## Pulling data from W&B

Download the sample file from W&B and load it into a pandas DataFrame.

In [2]:
# Open a W&B run for this EDA session; `run` is reused later to close the run.
run = wandb.init(
    project="nyc_airbnb",
    group="development",
    job_type="eda",
    save_code=True,
)

# Resolve the sample artifact, take the file path it exposes, and read it as CSV.
sample_artifact = run.use_artifact("nyc_airbnb/sample.csv:v0")
file_pth = sample_artifact.file()
df = pd.read_csv(file_pth)

[34m[1mwandb[0m: Currently logged in as: [33mvpolovnikov[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# List the local artifacts/ directory (presumably where the artifact used above was stored).
! ls artifacts/

sample.csv:v0


## EDA

In [4]:
# Preview the first two rows of the sample.
df.head(n=2)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,2,6,2019-05-26,0.13,1,5
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,3,0,,,1,188


In [5]:
# Column dtypes. Ending the cell with the bare expression lets the notebook
# display the Series directly instead of routing it through print().
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,15877.0,20000.0,20000.0
mean,18923800.0,67460340.0,40.728455,-73.952125,153.26905,6.9921,23.2741,1.377446,6.95545,112.9012
std,11012230.0,78579360.0,0.054755,0.046559,243.325609,21.645449,44.927793,1.683006,32.433831,131.762226
min,2539.0,2571.0,40.50873,-74.23914,0.0,1.0,0.0,0.01,1.0,0.0
25%,9393540.0,7853718.0,40.68942,-73.98303,69.0,1.0,1.0,0.19,1.0,0.0
50%,19521170.0,31114310.0,40.72273,-73.95564,105.0,2.0,5.0,0.72,1.0,44.0
75%,29129360.0,106842600.0,40.76299,-73.93638,175.0,5.0,23.0,2.01,2.0,229.0
max,36485610.0,274273300.0,40.91306,-73.71795,10000.0,1250.0,607.0,27.95,327.0,365.0


In [7]:
# Build a ydata-profiling report object from the sample DataFrame.
profile = ydata_profiling.ProfileReport(df)

In [8]:
# Render the profiling report as notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  not pdt.is_categorical_dtype(series)
  if pdt.is_categorical_dtype(series):
  if pdt.is_categorical_dtype(series):
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  if pdt.is_categorical_dtype(series):
  if pdt.is_categorical_dtype(series):
  is_valid_dtype =

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

## Data Transformation

In [7]:
# Flag price outliers: build a boolean mask that is True for rows whose price
# lies within [min_price, max_price] (both bounds included).
min_price = 10
max_price = 350
idx = df['price'].between(left=min_price, right=max_price, inclusive='both')

# Count how many rows fall inside (True) vs. outside (False) the price range.
idx.value_counts()

price
True     19001
False      999
Name: count, dtype: int64

In [8]:
# Keep only the rows flagged True in `idx`; copy so later column assignments
# act on an independent frame rather than a slice of the original.
df = df.loc[idx].copy()

In [9]:
# Converting str to datetime for 'last_review'
df['last_review'] = pd.to_datetime(df['last_review'])
df.loc[0, 'last_review']

Timestamp('2019-05-26 00:00:00')

In [10]:
# Inspect row count, non-null counts and dtypes of the current DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  int64         
 11  number_

In [None]:
# Close the W&B run that was opened with wandb.init() earlier in this notebook.
run.finish()