In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

import langid

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
from scipy import stats
from dateutil import parser

import seaborn as sns

import shapely
from shapely.geometry import Point
import geopandas as gpd

In [None]:
%matplotlib inline

# Read In Data
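* **TokyoAirbnbListings2023.csv**: one row per listing, including the listing description, transit information, whether the host is a superhost, and other listing attributes.
* **TokyoAirbnbReviews2023.csv**: one row per review, including the listing id, a unique id for each reviewer, the review comment, and the review date.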

In [None]:
# Load every listing; for the reviews keep only the last 5000 rows of the
# file and renumber them so the index starts from 0.
listings = pd.read_csv('/kaggle/input/tokyo-airbnb-neighborhoods/TokyoAirbnbListings2023.csv')
reviews = (
    pd.read_csv("/kaggle/input/tokyo-airbnb-neighborhoods/TokyoAirbnbReviews2023.csv")
    .tail(5000)
    .reset_index(drop=True)
)


In [None]:
# Preview the first three rows of the listings table.
listings.head(3)

In [None]:
# Preview the first three rows of the (truncated) reviews table.
reviews.head(3)

In [None]:
# Show the full text of the review stored under index label 1.
reviews.loc[1, 'comments']

# Preprocessing Reviews for Sentiment Analysis

In [None]:
# Number of missing values in each column of the reviews table.
reviews.isna().sum()

In [None]:
# Count the comments that start with the host-cancellation message.
canceled_mask = reviews['comments'].str.match('The host canceled this reservation')
canceled_mask.sum()

In [None]:
# Remove reviews without usable text: rows with a missing comment first,
# then comments starting with 'The host canceled this reservation', then
# comments starting with '-'.
# NOTE(review): str.match only anchors at the start of the string, so every
# comment that begins with '-' is dropped, not just a lone dash -- confirm
# that this is intended.
reviews = reviews.dropna(subset=['comments'])

canceled_mask = reviews['comments'].str.match('The host canceled this reservation')
index_canceled = reviews.loc[canceled_mask].index
reviews = reviews.drop(index=index_canceled)

dash_mask = reviews['comments'].str.match('-')
index_dash = reviews.loc[dash_mask].index
reviews = reviews.drop(index=index_dash)

def alphanumeric(x):
    """Replace every run of word characters that contains a digit with one space.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        `x` with each digit-bearing token replaced by a single space.
    """
    # Raw string: '\w' and '\d' are not valid Python string escapes, so the
    # non-raw literal triggers an invalid-escape warning on current Python.
    return re.sub(r'\w*\d\w*', ' ', x)
# Run the `alphanumeric` cleaner over every comment and store the result back.
reviews['comments'] = reviews['comments'].map(alphanumeric)

In [None]:
# Renumber the rows from 0 after the drops above.
reviews = reviews.reset_index(drop=True)

In [None]:
# Dry run of the language detector: classify every comment once, purely to
# find the rows langid cannot handle. The index labels of failing rows are
# collected in `err`, and `num_errors` counts them.
num_errors = 0
err = []

# Iterate over the comments Series directly; iterrows() would build a whole
# row object for every review just to read one field.
for index, comment in reviews['comments'].items():
    try:
        # The classification result is not needed here, only whether it raises.
        langid.classify(comment)
    except Exception as e:
        num_errors += 1
        err.append(index)
        print(f"This row throws an error: {comment}")
        print(f"Error message: {str(e)}")

In [None]:
# Share of comments (in percent) for which language identification raised.
error_share = num_errors / reviews['comments'].shape[0] * 100
print(f'{error_share:.2f}% of entries for language identification throw errors')

In [None]:
# Remove the rows whose comments made the language detector raise.
reviews = reviews.drop(index=err)

In [None]:
# Renumber the rows from 0 after removing the failing rows.
reviews = reviews.reset_index(drop=True)

In [None]:
def detect_language(comment):
    """Return the language code that langid assigns to `comment`.

    If classification raises, the failure is printed and None is returned
    instead of propagating the exception.
    """
    try:
        # langid.classify yields a pair; only its first element is returned.
        code, _score = langid.classify(comment)
    except Exception as e:
        print(f"Error processing comment: {comment}, Error: {e}")
        return None
    return code

# Detect the language of every comment and keep the code in a new 'language' column
reviews['language'] = reviews['comments'].map(detect_language)

In [None]:
# Preview the reviews, now including the 'language' column.
reviews.head()

In [None]:
# Bar chart of how many reviews were assigned to each detected language.
language_counts = reviews['language'].value_counts()
language_counts.plot(kind='bar');

In [None]:
# Non-English rows are dropped rather than translated; translating them
# instead remains a possible later improvement.

# Isolate all non-English entries. na=False makes rows with a missing language
# (detect_language returns None on failure) count as "no match", so they are
# treated as non-English instead of breaking the boolean negation.
is_english = reviews['language'].str.match('en', na=False)
index_nonen = reviews[~is_english].index

# Report the share as a real percentage (fraction * 100); guard against an
# empty table to avoid dividing by zero.
share_nonen = 100 * len(index_nonen) / reviews.shape[0] if reviews.shape[0] else 0.0
print('{:.2f}% of all entries are not in English'.format(share_nonen))


In [None]:
# Drop the non-English reviews and renumber the remaining rows from 0, as is
# done after the earlier drops, so that later label-based lookups keep working.
reviews = reviews.drop(index=index_nonen).reset_index(drop=True)

# Estimating Polarity

https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f

**VADER** (Valence Aware Dictionary and sEntiment Reasoner) is a sentiment analysis tool designed with a focus on social media content, employing a lexicon and rule-based approach. It utilizes a sentiment lexicon, which is essentially a collection of words each tagged with their semantic orientation as positive or negative.

This tool has proven to be highly effective for analyzing texts from social media, as well as content from NY Times editorials, movie reviews, and product reviews. VADER excels by not just providing scores for positivity and negativity, but also by quantifying the degree of sentiment expressed.

VADER enhances analysis through several unique features: it adjusts scores for words based on capitalization, punctuation (for example, increasing the compound score with the addition of exclamation points), the use of degree modifiers (comparing phrases like "very good" versus "good"), and the presence of emojis. Additionally, it effectively manages shifts in sentiment polarity when conjunctions are used, such as in "but" phrases, to indicate a change in sentiment direction.

In [None]:
# Fetch the 'vader_lexicon' resource through nltk's downloader.
# NOTE(review): presumably required by SentimentIntensityAnalyzer below -- confirm.
import nltk
nltk.download('vader_lexicon')

In [None]:
# Single shared VADER analyzer instance, reused by the scoring helpers below.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# test it out on first comment

def sentiment_analyzer_scores(comment):
    """Return the analyzer's polarity scores for `comment` as a DataFrame.

    Each key of the score dictionary becomes one row label of the result.
    """
    scores = analyzer.polarity_scores(comment)
    scores_frame = pd.DataFrame.from_dict(scores, orient='index')
    return scores_frame

# Pick the comment by position rather than by index label: a label-based
# lookup fails with a KeyError if the row carrying that label was dropped
# during the filtering steps.
sentiment_analyzer_scores(reviews['comments'].iloc[0])

In [None]:
# One overall polarity value per review is enough, so only the 'compound' score is kept.
def polarity_compound(s):
    """Return the 'compound' entry of the analyzer's polarity scores for `s`."""
    return analyzer.polarity_scores(s)['compound']

In [None]:
# Score every comment and store the compound polarity in a new column.
reviews['polarity'] = reviews['comments'].map(polarity_compound)

In [None]:
# First five reviews, with an in-cell bar drawn for the polarity column.
first_reviews = reviews.head(5)
first_reviews.style.bar(subset=['polarity'], align='mid', color=['#d65f5f', '#5fba7d'])

In [None]:
# First five reviews with a negative polarity, rendered with the same bar styling.
negative_reviews = reviews.loc[reviews['polarity'] < 0].head(5)
negative_reviews.style.bar(subset=['polarity'], align='mid', color=['#d65f5f', '#5fba7d'])

In [None]:
# sns.distplot is deprecated in seaborn; histplot with a density-normalised
# histogram plus a KDE overlay is its documented replacement.
sns.histplot(reviews['polarity'], stat='density', kde=True, color='g')
plt.title('Distribution of sentiment polarity');

In [None]:
# Persist the scored reviews to 'reviews_polarity.csv', without the index column.
reviews.to_csv('reviews_polarity.csv', index=False)

# Investigate the polarity variable and its relationship with other attributes, such as various scores. 

From TokyoAirbnbListings2023.csv, select the columns holding the per-aspect review scores (accuracy, cleanliness, check-in, communication, location, and value), the aggregate rating score, and the number of reviews. Also keep the `id` column, which is needed to join with the other DataFrames, and the `neighbourhood_cleansed` column for subsequent analysis.

In [None]:
# Keep the listing id, the review count, the overall and per-aspect review
# scores, and the neighbourhood; from the reviews keep ids, text and polarity.
# .copy() turns each selection into an independent frame, so later column
# assignments neither raise SettingWithCopyWarning nor touch the source frames.
sel_listings = listings[['id', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 
                         'review_scores_cleanliness', 'review_scores_checkin', 
                         'review_scores_communication', 'review_scores_location', 'review_scores_value',
                         'neighbourhood_cleansed']].copy()
sel_reviews = reviews[['listing_id', 'reviewer_id', 'comments', 'polarity']].copy()

In [None]:
# Preview the first three rows of the selected review columns.
sel_reviews.head(3)

In [None]:
# Preview the first three rows of the selected listing columns.
sel_listings.head(3)

In [None]:
# Make sure 'polarity' is numeric; values that cannot be parsed become NaN.
# assign() builds a new frame instead of writing into a column selection of
# `reviews`, which avoids SettingWithCopyWarning.
sel_reviews = sel_reviews.assign(
    polarity=pd.to_numeric(sel_reviews['polarity'], errors='coerce')
)

# Mean polarity per listing (one row per 'listing_id')
list_pol = sel_reviews.groupby('listing_id', as_index=False)['polarity'].mean()

# Left merge on listings.id == reviews.listing_id: every listing is kept, and
# listings without a review in `sel_reviews` get a NaN polarity.
full = pd.merge(sel_listings, list_pol, left_on='id', right_on='listing_id', how='left')

# Check the first few rows to confirm the merge is correct
full.head()

In [None]:
# Columns to correlate: the seven listing review scores plus the mean polarity.
review_scores = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'polarity',
]
print(review_scores)

# Pairwise correlation matrix of those columns, drawn as an annotated heatmap.
corr = full.loc[:, review_scores].corr()

sns.heatmap(corr, annot=True)
plt.title('Pearson correlation between score features and polarity');

In [None]:
# Histogram of the overall rating per listing, labelled so the figure can be
# read on its own.
ax = full['review_scores_rating'].hist()
ax.set(title='Distribution of review_scores_rating',
       xlabel='review_scores_rating', ylabel='Number of listings');


In [None]:
# Histogram of the mean review polarity per listing, labelled so the figure
# can be read on its own.
ax = full['polarity'].hist()
ax.set(title='Distribution of mean polarity per listing',
       xlabel='polarity', ylabel='Number of listings');

# Set up dataframe for dashboard


In [None]:
# Keep only the listings that have a polarity value.
full = full.dropna(subset=['polarity'])

full

In [None]:
# Number of reviews against mean polarity, one point per listing; the title
# makes the figure self-explanatory and the trailing ';' hides the Axes repr.
ax = sns.scatterplot(x='polarity', y='number_of_reviews', data=full)
ax.set_title('Number of reviews vs. mean review polarity');

In [None]:
# List every column available in the listings table.
print(listings.columns)

In [None]:
# Listing attributes needed by the dashboard. .copy() gives an independent
# frame, so the column assignments below do not write into a view of `listings`.
dashboard_df = listings[['id', 'price', 'minimum_nights', 'maximum_nights', 'host_is_superhost']].copy()

# Turn the price text into a number: strip the currency sign and the thousands
# separators, then parse. regex=False matters because '$' is the end-of-string
# anchor when interpreted as a regular expression (the default before
# pandas 2.0), in which case nothing would be removed.
# Missing or unparseable prices become 0.
cols = ['price']
for col in cols:
    cleaned = (
        dashboard_df[col]
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which is not guaranteed to modify the frame.
    dashboard_df[col] = pd.to_numeric(cleaned, errors='coerce').fillna(0)

# Transform host_is_superhost from 't'/'f' flags to booleans; missing values
# (and any value other than 't'/'f') end up as NaN.
dashboard_df['host_is_superhost'] = dashboard_df['host_is_superhost'].map({'f': False, 't': True})

dashboard_df.head(3)

In [None]:
# Attach the dashboard columns to the scored listings, matching rows on 'id'.
dashboard_df = full.merge(dashboard_df, on='id', how='left')


In [None]:
# Display the merged dashboard table.
dashboard_df


In [None]:
# Write the dashboard table to 'dashboard_df.csv', without the index column.
dashboard_df.to_csv('dashboard_df.csv', index=False)
