## Prepare datasets for streamlit app
1. explore listing related dataframes
2. explore review related dataframes
3. prepare final listing dataframe for streamlit app.

In [None]:
import pandas as pd

#### 1. explore listing related dataframes

In [None]:
# Load the listing-level dataframes saved by the earlier pipeline steps.
# NOTE: pd.read_pickle unpickles arbitrary objects -- only load files produced by this project.
listing_df = pd.read_pickle(
    '../data/data_cleaned/cleaned_listing.zip'
)
listing_with_polarity_df = pd.read_pickle(
    '../data/data_cleaned/cleaned_listing_and_review_with_polarity.zip'
)
listing_with_polarity_and_text_content_df = pd.read_pickle(
    '../data/data_cleaned/cleaned_listing_and_review_with_polarity_and_text_content.zip'
)
listing_with_cluster_similarity_polarity_df = pd.read_pickle(
    '../data/data_cleaned/listings_with_cluster_similarity_polarity.zip'
)

# Print the (rows, columns) shape of each frame, one per line, in load order.
print(
    *(
        loaded.shape
        for loaded in (
            listing_df,
            listing_with_polarity_df,
            listing_with_polarity_and_text_content_df,
            listing_with_cluster_similarity_polarity_df,
        )
    ),
    sep='\n',
)

(4933, 71)
(4933, 73)
(4933, 75)
(4933, 47)


In [None]:
# Columns present in listing_with_polarity_and_text_content_df but absent from listing_df.
# Boolean mask over the column labels: True where the label is NOT a column of listing_df.
text_content_cols = listing_with_polarity_and_text_content_df.columns
text_content_cols[~text_content_cols.isin(listing_df.columns)]

Index(['comments', 'polarity', 'content', 'cleaned_content'], dtype='object')

In [None]:
# Columns present in listing_with_cluster_similarity_polarity_df but absent from
# listing_with_polarity_and_text_content_df.
cluster_frame_cols = listing_with_cluster_similarity_polarity_df.columns
cluster_frame_cols[~cluster_frame_cols.isin(listing_with_polarity_and_text_content_df.columns)]

Index(['cluster', 'similarity'], dtype='object')

##### Findings:

6 new columns ['comments', 'polarity', 'content', 'cleaned_content', 'cluster', 'similarity'] have been added compared with the cleaned_listing.zip file.

#### 2. explore review related dataframes

In [None]:
# Load the review-level dataframes saved by the earlier pipeline steps.
# NOTE: pd.read_pickle unpickles arbitrary objects -- only load files produced by this project.
review_df = pd.read_pickle(
    '../data/data_cleaned/cleaned_review.zip'
)
review_with_polarity_df = pd.read_pickle(
    '../data/data_cleaned/cleaned_review_with_polarity.zip'
)
review_for_wordcloud_df = pd.read_pickle(
    '../data/data_cleaned/cleaned_review_for_review_wordcloud.zip'
)
review_with_polarity_topic_df = pd.read_pickle(
    '../data/data_cleaned/cleaned_review_with_polarity_and_topic.zip'
)

# Print the (rows, columns) shape of each frame, one per line, in load order.
print(
    *(
        loaded.shape
        for loaded in (
            review_df,
            review_with_polarity_df,
            review_for_wordcloud_df,
            review_with_polarity_topic_df,
        )
    ),
    sep='\n',
)


(375173, 4)
(375173, 5)
(5091, 4)
(375173, 9)


In [None]:
# List the column names of each review dataframe, one list per line, in load order.
print(
    *(
        loaded.columns.tolist()
        for loaded in (
            review_df,
            review_with_polarity_df,
            review_for_wordcloud_df,
            review_with_polarity_topic_df,
        )
    ),
    sep='\n',
)

['listing_id', 'date', 'reviewer_id', 'comments']
['listing_id', 'date', 'reviewer_id', 'comments', 'polarity']
['listing_id', 'comments', 'cleaned_comments', 'comments_nouns_adjs']
['listing_id', 'date', 'reviewer_id', 'comments', 'polarity', 'cleaned_comments', 'comments_nouns_adjs', 'review_topic', 'review_topic_interpreted']


##### Findings:
5 new columns ['polarity', 'cleaned_comments', 'comments_nouns_adjs', 'review_topic', 'review_topic_interpreted'] have been added compared with the cleaned_review.zip file.


#### 3. prepare final listing dataframe for streamlit app.

In [None]:
# Append the 'cluster' and 'similarity' columns to the listing frame.
# DataFrame.join aligns on the row index here (no `on=` key is given).
# NOTE(review): this assumes both frames share the same index labels -- confirm.
cluster_similarity_cols = listing_with_cluster_similarity_polarity_df[['cluster', 'similarity']]
cleaned_listing_finalized_for_streamlit_df = (
    listing_with_polarity_and_text_content_df.join(cluster_similarity_cols)
)
cleaned_listing_finalized_for_streamlit_df.shape

(4933, 77)

In [None]:
# add 'cleaned_comments' and 'comments_nouns_adjs' cols
# Left-merge on 'listing_id' so every listing row is kept; listings without a match get NaN.
# This cell reassigns the frame it reads from, so drop any copies of the two columns left by
# a previous run first -- otherwise re-running the cell would merge again and produce
# suffixed duplicates ('cleaned_comments_x' / 'cleaned_comments_y', ...).
review_text_cols = ['cleaned_comments', 'comments_nouns_adjs']
cleaned_listing_finalized_for_streamlit_df = (
    cleaned_listing_finalized_for_streamlit_df
    .drop(columns=review_text_cols, errors='ignore')
    .merge(
        review_for_wordcloud_df.loc[:, ['listing_id'] + review_text_cols],
        on='listing_id',
        how='left',
    )
)
cleaned_listing_finalized_for_streamlit_df.shape

(4933, 79)

In [None]:
# cleaned_listing_finalized_for_streamlit_df.columns

In [None]:
# The finalized frame carries 8 columns beyond cleaned_listing.zip
# ('comments' through 'comments_nouns_adjs').
# Show the last 10 labels: the 2 columns that precede them plus the 8 added ones.
trailing_columns = cleaned_listing_finalized_for_streamlit_df.columns[-10:]
trailing_columns

Index(['host_response_time_encoded', 'host_operate_years', 'comments',
       'polarity', 'content', 'cleaned_content', 'cluster', 'similarity',
       'cleaned_comments', 'comments_nouns_adjs'],
      dtype='object')

In [None]:
# save the prepared cleaned_listing_finalized_for_streamlit_df to file
# Write alongside the other cleaned datasets under '../data/data_cleaned/' -- the same
# location this notebook reads the finalized file back from when checking it.
# (The '.zip' extension makes pandas compress the pickle.)
cleaned_listing_finalized_for_streamlit_df.to_pickle('../data/data_cleaned/cleaned_listing_finalized_for_streamlit.zip')

#### Summary
1. **Use the cleaned_listing_finalized_for_streamlit.zip file (4933, 79) for the streamlit app**. Except for the interpreted cluster column, cleaned_listing_finalized_for_streamlit_df contains all the features we added to the original cleaned_listing.zip data.
2. **use cleaned_review_with_polarity_and_topic.zip file (375173, 9) for review sentiment trends and review topics plotting sections**. It has review date, polarity, and topic information so we can plot the time series to see the trends and the review topic distribution for a given listing.

In [None]:
# Reload the saved file from disk and confirm its (rows, columns) shape.
finalized_listing_path = '../data/data_cleaned/cleaned_listing_finalized_for_streamlit.zip'
df = pd.read_pickle(finalized_listing_path)
df.shape

(4933, 79)

In [None]:
# Preview the first two rows of the reloaded frame.
df.head(n=2)

Unnamed: 0,listing_id,listing_url,last_scraped,listing_name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,...,host_response_time_encoded,host_operate_years,comments,polarity,content,cleaned_content,cluster,similarity,cleaned_comments,comments_nouns_adjs
0,49113826,https://www.airbnb.com/rooms/49113826,2022-12-24,Private Seattle Getaway Home Near Lake Washing...,A Seattle vacation destination for those from ...,SeaTac International Airport: 14 min drive<br ...,https://a0.muscache.com/pictures/miso/Hosting-...,188538325,https://www.airbnb.com/users/show/188538325,Xiao,...,1,4,Unexpectedly the place was like new home since...,0.817365,Private Seattle Getaway Home Near Lake Washing...,private seattle getaway home near lake washing...,0,1.0,unexpectedly place like new home got recently ...,place new home xiao reponsive easy communicate...
1,7455832,https://www.airbnb.com/rooms/7455832,2022-12-24,Classic remodeled in Georgetown,Enjoy your Seattle stay in the vibrant & artis...,Georgetown is a cool neighborhood about three ...,https://a0.muscache.com/pictures/miso/Hosting-...,2144954,https://www.airbnb.com/users/show/2144954,Ryan,...,1,10,The house is so comfortable and clean. It has ...,0.913648,Classic remodeled in Georgetown Enjoy your Sea...,classic remodeled georgetown enjoy seattle via...,2,0.799051,house comfortable clean everything need make f...,house comfortable clean everything home floor ...


In [None]:
# Display the column labels of the reloaded finalized frame.
df.columns

Index(['listing_id', 'listing_url', 'last_scraped', 'listing_name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bedrooms',
       'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights',
       'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'av

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f2a50dc6-ff6a-45ff-9dbe-d7a35bd1e393' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>