Exploratory Data Analysis

In [1]:
import wandb
import numpy as np
import pandas as pd
import ydata_profiling as pdp

  from .autonotebook import tqdm as notebook_tqdm


Reading the artifact and loading the CSV into a DataFrame

In [2]:
# start a W&B run in the 'eda' group, fetch the latest 'sample.csv' artifact and load it
run = wandb.init(
    project='nyc_airbnb',
    group='eda',
    save_code=True,
)
sample_artifact = wandb.use_artifact('sample.csv:latest')
local_path = sample_artifact.file()
df_sample = pd.read_csv(local_path)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mahmed-elazab262[0m ([33mahmed-elazab262-ejada[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# first look at the raw sample: column names, non-null counts and dtypes
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20000 non-null  int64  
 1   name                            19993 non-null  object 
 2   host_id                         20000 non-null  int64  
 3   host_name                       19992 non-null  object 
 4   neighbourhood_group             20000 non-null  object 
 5   neighbourhood                   20000 non-null  object 
 6   latitude                        20000 non-null  float64
 7   longitude                       20000 non-null  float64
 8   room_type                       20000 non-null  object 
 9   price                           20000 non-null  int64  
 10  minimum_nights                  20000 non-null  int64  
 11  number_of_reviews               20000 non-null  int64  
 12  last_review                     

Data cleaning

In [4]:
# keep only the rows whose price lies within [min_price, max_price] (both bounds inclusive)
min_price, max_price = 10, 350
idx = (df_sample['price'] >= min_price) & (df_sample['price'] <= max_price)
# copy so that later column assignments do not touch the raw frame
df_clean = df_sample.loc[idx].copy()

In [5]:
# parse 'last_review' into datetime values
parsed_last_review = pd.to_datetime(df_clean['last_review'])
df_clean['last_review'] = parsed_last_review

In [6]:
# Log-transform 'minimum_nights' to compress its wide range of values.
# The input is read from the untouched raw frame (aligned on df_clean's index) instead of
# from df_clean itself, so re-running this cell does not apply the log a second time
# (a second pass would turn values already at log(1) = 0 into log(0) = -inf).
raw_minimum_nights = df_sample.loc[df_clean.index, 'minimum_nights']
df_clean['minimum_nights'] = np.log(raw_minimum_nights)

In [7]:
# preview the first rows of the cleaned frame after the transformations above
df_clean.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,0.693147,6,2019-05-26,0.13,1,5
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,1.098612,0,NaT,,1,188
2,8741020,Voted #1 Location Quintessential 1BR W Village...,45854238,John,Manhattan,West Village,40.73631,-74.00611,Entire home/apt,245,1.098612,51,2018-09-19,1.12,1,0
3,34602077,Spacious 1 bedroom apartment 15min from Manhattan,261055465,Regan,Queens,Astoria,40.76424,-73.92351,Entire home/apt,125,1.098612,1,2019-05-24,0.65,1,13
4,23203149,Big beautiful bedroom in huge Bushwick apartment,143460,Megan,Brooklyn,Bushwick,40.69839,-73.92044,Private room,65,0.693147,8,2019-06-23,0.52,2,8


### Profiling of cleaned data

In [8]:
# column names, non-null counts and dtypes of the cleaned frame
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  float64       
 11  number_

In [9]:
# summary statistics of the cleaned frame
df_clean.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
count,19001.0,19001.0,19001.0,19001.0,19001.0,19001.0,19001.0,15243,15243.0,19001.0,19001.0
mean,18830410.0,66394590.0,40.728063,-73.950827,122.340456,1.12122,23.797747,2018-10-01 08:24:05.384766720,1.380928,6.583811,109.725383
min,2539.0,2571.0,40.50873,-74.23914,10.0,0.0,0.0,2011-05-12 00:00:00,0.01,1.0,0.0
25%,9355498.0,7728754.0,40.68882,-73.98205,66.0,0.0,1.0,2018-06-29 12:00:00,0.19,1.0,0.0
50%,19387540.0,30487850.0,40.72171,-73.95463,100.0,0.693147,6.0,2019-05-19 00:00:00,0.72,1.0,39.0
75%,28919520.0,104835400.0,40.76321,-73.93449,160.0,1.609438,24.0,2019-06-23 00:00:00,2.01,2.0,219.0
max,36485610.0,274273300.0,40.91306,-73.71795,350.0,7.130899,607.0,2019-07-08 00:00:00,27.95,327.0,365.0
std,10969860.0,77826630.0,0.055389,0.046825,71.530346,1.064895,45.493455,,1.689988,31.15475,130.599899


In [10]:
# build a profiling report for the cleaned dataframe
report_title = "Pandas Profiling Report - Cleaned DataFrame - NYC Airbnb"
profile = pdp.ProfileReport(df_clean, title=report_title)

In [11]:
# Render the profiling report as notebook widgets.
# NOTE(review): the saved output of this cell ends in
# "AttributeError: 'float' object has no attribute 'ndim'"; the cause is not visible from
# this notebook — TODO investigate (possibly a library version mismatch) and fix before
# relying on the report.
profile.to_widgets()

 25%|██▌       | 4/16 [00:00<00:00, 57.67it/s]0<00:00, 40.44it/s, Describe variable: availability_365]
Summarize dataset:  67%|██████▋   | 14/21 [00:00<00:00, 40.70it/s, Describe variable: availability_365]


AttributeError: 'float' object has no attribute 'ndim'

In [12]:
# finish the W&B run that was started when the sample artifact was loaded
run.finish()

