In [1]:
# One-time setup: raise the IOPub message rate limit in the Jupyter config file,
# then restart Jupyter so it takes effect before re-running the entire notebook.
# NOTE(review): presumably needed because of the large report output rendered
# further down in this notebook -- confirm.

# From a terminal (CLI), append the following line to
# ~/.jupyter/jupyter_notebook_config.py:
#     c.ServerApp.iopub_msg_rate_limit = 10000000

# Verify that the configuration line was added (grep prints the matching line)
!grep "c.ServerApp.iopub_msg_rate_limit" $(jupyter --config-dir)/jupyter_notebook_config.py

c.ServerApp.iopub_msg_rate_limit = 10000000


In [2]:
!pip install ydata-profiling --quiet # substitute of pandas_profiling
#!pip install ipywidgets --quiet  not used in this notebook

[0m

In [3]:
import warnings
import os
import wandb
import pandas as pd
from ydata_profiling import ProfileReport

In [4]:
#Some configuration setup for the notebook

#Suppress warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [5]:
# The notebook name has to be set by hand so that W&B can save the code
os.environ["WANDB_NOTEBOOK_NAME"] = 'EDA.ipynb'

# Open the W&B run for this EDA session
run = wandb.init(project="nyc_airbnb", group="eda", save_code=True)

# Fetch the sample artifact through W&B and read it into a DataFrame
sample_artifact = wandb.use_artifact("sample.csv:latest")
local_path = sample_artifact.file()
df = pd.read_csv(local_path)

# Preview the first rows of the dataset
df.head()

[34m[1mwandb[0m: Currently logged in as: [33mdace[0m ([33mdace-[0m). Use [1m`wandb login --relogin`[0m to force relogin


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [6]:
# Note: ipywidgets is deliberately avoided because of recent conflicts with JupyterLab.
# Build the profiling report for the dataset and display it as the cell output.
profile = ProfileReport(df, title="Profiling Report", explorative=True)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [7]:
# Price bounds (inclusive) used to drop outliers
min_price, max_price = 10, 350

# Boolean mask of rows whose price lies within the bounds
idx = df['price'].between(min_price, max_price)

# Keep only the in-range rows (as a new frame) and parse last_review as datetime
df = df.loc[idx].assign(
    last_review=lambda kept: pd.to_datetime(kept['last_review'])
)

In [8]:
# Print a summary of the DataFrame: index range, columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46428 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              46428 non-null  int64         
 1   name                            46413 non-null  object        
 2   host_id                         46428 non-null  int64         
 3   host_name                       46407 non-null  object        
 4   neighbourhood_group             46428 non-null  object        
 5   neighbourhood                   46428 non-null  object        
 6   latitude                        46428 non-null  float64       
 7   longitude                       46428 non-null  float64       
 8   room_type                       46428 non-null  object        
 9   price                           46428 non-null  int64         
 10  minimum_nights                  46428 non-null  int64         
 11  number_

In [9]:
# Export the profiling report to an HTML file
report_file = "profile_report.html"
profile.to_file(report_file)

# Describe the exported report as a new W&B artifact
artifact = wandb.Artifact(
    name="nyc_airbnb_profile",
    type="report",
    description="report file containing eda",
)

# Attach the HTML file to the artifact
artifact.add_file(report_file)

# Log the artifact to the current run (the returned artifact is the cell output)
run.log_artifact(artifact)

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

<Artifact nyc_airbnb_profile>

In [10]:
# Finish the W&B run that was started with wandb.init earlier in this notebook
run.finish()

VBox(children=(Label(value='45.620 MB of 45.620 MB uploaded (9.497 MB deduped)\r'), FloatProgress(value=1.0, m…