# EDA - Sample of NYC Airbnb data

## Get the data

In [None]:
import wandb
import pandas as pd

# Start a W&B run so this notebook is tracked: it is filed under the
# "nyc_airbnb" project in the "eda" group, and save_code=True asks W&B to
# save the code along with the run.
run = wandb.init(project="nyc_airbnb", group="eda", save_code=True)
# Declare the latest version of the "sample.csv" artifact as an input of this
# run and get the path of its file on the local disk.
local_path = wandb.use_artifact("sample.csv:latest").file()
# Load the sample into a DataFrame; every later cell works on `df`.
df = pd.read_csv(local_path)

## Perform EDA with pandas_profiling

In [None]:
import pandas_profiling

# Build a profiling report for the whole DataFrame.
profile = pandas_profiling.ProfileReport(df)
# Render the report inside the notebook as interactive widgets.
profile.to_widgets()

### Notes
- There are missing values in a number of columns
- 'last_review' should be converted to datetime
- Price column contains outliers which may impact the model (e.g. 0s or extremely large values)

## Clean the data

In [None]:
# Remove price outliers: keep only the rows whose price lies between
# min_price and max_price (Series.between includes both bounds by default).
min_price, max_price = 10, 350
idx = df['price'].between(min_price, max_price)
# .copy() gives an independent frame, so the column assignment below does
# not write into a slice of the original data.
df = df.loc[idx].copy()

# Parse the 'last_review' column into datetime values
df['last_review'] = pd.to_datetime(df['last_review'])

In [None]:
# Check that everything looks as expected after cleaning: print the summary
# of the DataFrame (columns, dtypes and non-null counts).
df.info()

## End the W&B run

In [None]:
# Mark the W&B run started at the top of the notebook as finished.
run.finish()