In [1]:
# =============================================================================
# MODULES
# =============================================================================
import wandb
import pandas as pd
import seaborn as sns
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

  from .autonotebook import tqdm as notebook_tqdm


### Start run and read W&B artifact

In [2]:
run = wandb.init(project='nyc_airbnb',
                group='eda',
                save_code=True)

[34m[1mwandb[0m: Currently logged in as: [33mchristonikos[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
local_path = wandb.use_artifact('sample.csv:latest').file()
df = pd.read_csv(local_path) 



### Dataset structure and general information

In [4]:
print(df.head(5))

         id                                               name    host_id  \
0   9138664                Private Lg Room 15 min to Manhattan   47594947   
1  31444015  TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...    8523790   
2   8741020  Voted #1 Location Quintessential 1BR W Village...   45854238   
3  34602077  Spacious 1 bedroom apartment 15min from Manhattan  261055465   
4  23203149   Big beautiful bedroom in huge Bushwick apartment     143460   

  host_name neighbourhood_group   neighbourhood  latitude  longitude  \
0      Iris              Queens       Sunnyside  40.74271  -73.92493   
1    Johlex           Manhattan  Hell's Kitchen  40.76682  -73.98878   
2      John           Manhattan    West Village  40.73631  -74.00611   
3     Regan              Queens         Astoria  40.76424  -73.92351   
4     Megan            Brooklyn        Bushwick  40.69839  -73.92044   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private room     74   

In [5]:
data = df.copy()
data.pop('price')
# extract number of features 
n_features = data.columns.shape[0]
# identify numerical and categorical features

# use the selector to get the categorical data
categorical_selector = selector(dtype_include =object)
numerical_selector = selector(dtype_exclude =object)

# select the data based on the selector
categorical_data = data[categorical_selector(data)]
numerical_data = data[numerical_selector(data)]

print(f'''
The dataset contains {n_features} features.
Numeric: {numerical_data.shape[1]} 
Categorical: {categorical_data.shape[1]}.
''')


The dataset contains 15 features.
Numeric: 9 
Categorical: 6.



# Feature & target distribution overview

In [6]:
_ = df.hist(figsize=(20, 14))

AttributeError: 'AxesSubplot' object has no attribute 'is_first_col'

The latitude and longitude features are the ones that seem to follow a Gaussian distribution. The other features as well as the target are skewed. Importantly, the ranges of the features can differ by orders of magnitude. This implies that scaling of the data is essential. 

In [None]:
# overview of categorical data
[categorical_data[i].value_counts() for i in categorical_data.columns] 

We observe a similar behavior for the categorical features (heavily skewded data distributions)

In [None]:
nan_perc = df.isnull().mean() * 100
nan_perc.plot(kind='bar')

We observe that the features "last_review" and "reviews_per_month" have around 20% of missing data. This is still within the percentage region that allows for data imputation. We therefore, decide to keep these features in the feature matrix. 

### Profile Report 

In [None]:
#!pip install ipywidgets

In [None]:
profile = pandas_profiling.ProfileReport(df)

In [None]:
profile

#### Comments
* Our initial observations are validated by the report.
* The room_type "Shared room" is the minority class in this category
* The availability_365 feature has a high percentage of zero-values which skew the distribution to the right.
* host_id and id features are highly correlated, therefore, one feature must be dropped
* Reviews and 'number_of_reviews| are also highly correlated

### Finish run

In [None]:
run.finish()