In [2]:
import json

import numpy as np
import pandas as pd

# Listing

In [3]:
# Load the raw listings export (gzip-compressed CSV) into a DataFrame.
listings_path = 'data/listings.csv.gz'
listing = pd.read_csv(listings_path)

# Print the schema overview: column names, non-null counts and dtypes.
listing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4936 entries, 0 to 4935
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            4936 non-null   int64  
 1   listing_url                                   4936 non-null   object 
 2   scrape_id                                     4936 non-null   int64  
 3   last_scraped                                  4936 non-null   object 
 4   source                                        4936 non-null   object 
 5   name                                          4936 non-null   object 
 6   description                                   4880 non-null   object 
 7   neighborhood_overview                         2776 non-null   object 
 8   picture_url                                   4936 non-null   object 
 9   host_id                                       4936 non-null   i

In [20]:
# Preview the raw superhost flag as a one-column DataFrame (values are 't' / 'f').
listing.loc[:, ['host_is_superhost']]

Unnamed: 0,host_is_superhost
0,t
1,f
2,f
3,t
4,f
...,...
4931,f
4932,f
4933,f
4934,f


## Dimension: Listing

In [175]:
# Columns kept for the listing dimension. The order of this list is the
# column order of the resulting frame (and of the exported CSV).
dim_listing_columns = [
    # keys
    'id', 'host_id',
    # descriptive attributes
    'listing_url', 'name', 'picture_url', 'description', 'instant_bookable',
    # offer
    'price', 'property_type', 'room_type',
    # capacity
    'accommodates', 'beds', 'bedrooms', 'bathrooms', 'bathrooms_text',
    # location
    'latitude', 'longitude', 'neighbourhood_cleansed',
    # review metrics
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

# Take an independent copy so later cleaning never writes back into `listing`.
dim_listing = listing.loc[:, dim_listing_columns].copy()

dim_listing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4936 entries, 0 to 4935
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           4936 non-null   int64  
 1   host_id                      4936 non-null   int64  
 2   listing_url                  4936 non-null   object 
 3   name                         4936 non-null   object 
 4   picture_url                  4936 non-null   object 
 5   description                  4880 non-null   object 
 6   instant_bookable             4936 non-null   object 
 7   price                        4925 non-null   object 
 8   property_type                4936 non-null   object 
 9   room_type                    4936 non-null   object 
 10  accommodates                 4936 non-null   int64  
 11  beds                         4918 non-null   float64
 12  bedrooms                     4933 non-null   float64
 13  bathrooms         

In [176]:
# Clean the listing dimension in place and export it as CSV.
#
# The flag and price conversions below also accept values that are already
# converted, so re-running this cell on the cleaned frame neither wipes the
# flags nor raises on the numeric price column.

# Empty strings and the literal "N/A" are placeholders, not data: null them
# out in every text column.
str_column = dim_listing.select_dtypes('object').columns

for col in str_column:
    dim_listing[col] = dim_listing[col].mask(dim_listing[col].isin(["", "N/A"]), None)

# 't' / 'f' -> True / False. The identity entries keep values that are already
# booleans intact; anything else becomes missing.
flag_values = {'t': True, 'f': False, True: True, False: False}
dim_listing['instant_bookable'] = dim_listing['instant_bookable'].map(flag_values)

# Price arrives as text such as "$1,234.00": drop the currency sign and the
# thousands separators, then parse explicitly. Unparseable text raises a
# ValueError instead of being silently dropped.
listing_price = dim_listing['price']
if not pd.api.types.is_numeric_dtype(listing_price):
    listing_price = pd.to_numeric(
        listing_price.str.replace(r'\$|[,]', '', regex=True)
    )
# Nullable Float64 keeps missing prices as <NA>.
dim_listing['price'] = listing_price.astype('Float64')

dim_listing.to_csv('clean_data/dim_listing.csv', index=False)

## Dimension: Host

In [21]:
# Build the host dimension: one row per distinct combination of host
# attributes found in the listings, cleaned and exported as CSV.
host_columns = [
    'host_id',
    'host_name',
    'host_picture_url',
    'host_identity_verified',
    'host_is_superhost',
    'host_neighbourhood',
    'host_response_time',
    'host_response_rate',
    'host_since',
    'host_about',
    'host_url',
]

# Listings repeat their host's attributes, so fully identical rows are dropped.
# The trailing copy() detaches the result from `listing` before it is modified.
dim_host = listing[host_columns].drop_duplicates().copy()

# Empty strings and the literal "N/A" are placeholders, not data: null them
# out in every text column.
# NOTE(review): pd.read_csv's default NA markers already include '' and 'N/A',
# so this pass is likely a no-op for a frame loaded with default settings; it
# is kept as a safeguard.
str_column = dim_host.select_dtypes('object').columns

for col in str_column:
    dim_host[col] = dim_host[col].mask(dim_host[col].isin(["", "N/A"]), None)

dim_host['host_since'] = pd.to_datetime(dim_host['host_since'])

# "95%" -> 95. The percent sign is removed as a literal (not a regex) and the
# text is parsed explicitly; nullable Int64 keeps missing rates as <NA>.
response_rate_text = dim_host['host_response_rate'].str.replace('%', '', regex=False)
dim_host['host_response_rate'] = pd.to_numeric(response_rate_text).astype('Int64')

# 't' / 'f' -> True / False; any other value (including missing) stays missing.
dim_host['host_identity_verified'] = dim_host['host_identity_verified'].map({'t': True, 'f': False})

# A host counts as superhost only when the flag is exactly 't'; a missing or
# unexpected flag is treated as "not a superhost". This yields a plain boolean
# column with no missing values.
dim_host['host_is_superhost'] = dim_host['host_is_superhost'].eq('t')

dim_host.to_csv('clean_data/dim_host.csv', index=False)

In [22]:
# Sanity check: list hosts whose superhost flag is still missing (expected: none).
missing_superhost = dim_host['host_is_superhost'].isna()
dim_host.loc[missing_superhost]

Unnamed: 0,host_id,host_name,host_picture_url,host_identity_verified,host_is_superhost,host_neighbourhood,host_response_time,host_response_rate,host_since,host_about,host_url


## Dimension: Amenities

In [182]:
# Build the listing -> amenity table: one row per (listing id, amenity name).
#
# The raw `amenities` cell is treated as a JSON array of strings. Decoding it
# with the JSON parser (instead of splitting on ',') keeps amenity names that
# contain commas in one piece, resolves unicode escapes, and leaves real
# non-ASCII characters untouched.

def _parse_amenities(raw):
    """Return the amenity names held in one raw ``amenities`` cell.

    Args:
        raw: Cell value; expected to be a JSON array of strings.

    Returns:
        List of whitespace-stripped amenity names. Empty when the cell is
        missing (not a string).
    """
    if not isinstance(raw, str):
        return []
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Not valid JSON: fall back to a plain comma split with the list
        # punctuation removed, so the row is still kept on a best-effort basis.
        decoded = [
            part.replace('[', '').replace(']', '').replace('"', '')
            for part in raw.split(',')
        ]
    if not isinstance(decoded, list):
        decoded = [decoded]
    return [str(name).strip() for name in decoded]


dim_amenities = (
    listing[['id', 'amenities']]
    .assign(amenities=lambda frame: frame['amenities'].map(_parse_amenities))
    .explode('amenities')
    # Listings without any amenity explode to a missing value: drop those rows
    # rather than exporting a placeholder.
    .dropna(subset=['amenities'])
    .drop_duplicates()
    .loc[lambda frame: ~frame['amenities'].isin(['', 'N/A'])]
)

dim_amenities.to_csv('clean_data/dim_amenities.csv', index=False)

In [190]:
# Spot-check a single row by position. The position is hard-coded, so this
# raises IndexError if the frame has fewer rows than that.
spot_check_position = 105034
dim_amenities.iloc[spot_check_position]

id           919450216112985303
amenities               Hangers
Name: 3030, dtype: object

# Calendar

In [180]:
# Build the calendar fact table: typed availability flag and date, without
# the two price columns.
calendar = (
    pd.read_csv('data/calendar.csv.gz')
    .assign(
        # 't' / 'f' -> True / False; any other value becomes missing.
        available=lambda frame: frame['available'].map({'f': False, 't': True}),
        date=lambda frame: pd.to_datetime(frame['date']),
    )
    .drop(columns=['adjusted_price', 'price'])
)

calendar.to_csv('clean_data/fact_calendar.csv', index=False)