# LDA Topic Modeling on Airbnb Listing Reviews
2023-10-28<br>
Evangeline Chang

In [1]:
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import re
import warnings
warnings.filterwarnings("ignore")

import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk import pos_tag

# NLTK resources fetched for this notebook, each requested exactly once
# (the original list requested 'wordnet' twice).
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/evangeline/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/evangeline/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/evangeline/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/evangeline/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/evangeline/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the reviews (first CSV column is the saved index) and renumber the rows from 0.
reviews = pd.read_csv('data/reviews_check.csv', index_col=0).reset_index(drop=True)

In [3]:
# take a look at when the reviews were posted
def get_year(in_file):
    """Add an integer 'Year' column derived from the 'date' column, in place.

    Parameters
    ----------
    in_file : pandas.DataFrame
        Must contain a 'date' column of strings formatted as YYYY-MM-DD.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        If a date string does not match the "%Y-%m-%d" format.
    """
    # Assigning the whole column at once (positionally) keeps the years as
    # integers, is correct even when index labels repeat, and still creates
    # the column when the frame has no rows.
    in_file['Year'] = [
        datetime.strptime(date_str, "%Y-%m-%d").year
        for date_str in in_file['date']
    ]

get_year(reviews)

# number of reviews posted in each year (the count column is named 'Years')
review_years = reviews.groupby('Year').size().rename('Years').reset_index()
review_years

Unnamed: 0,Year,Years
0,2017.0,8
1,2018.0,242
2,2019.0,268
3,2020.0,350
4,2021.0,382
5,2022.0,381
6,2023.0,163


In [4]:
listings = pd.read_csv('data/listings_check.csv', index_col=0)

# sanity check: True only if every review points at a listing id present in the listings file
unique_ids = listings['id'].unique()
bool(reviews['listing_id'].isin(unique_ids).all())

True

In [5]:
# keep the reviews posted in 2020 or later and drop the reviewer identity columns
review_after2020 = reviews.loc[reviews['Year'] >= 2020].drop(columns=['reviewer_id', 'reviewer_name'])

In [6]:
# trim surrounding whitespace, then discard comments that are now empty strings
review_after2020 = review_after2020.assign(comments=review_after2020['comments'].str.strip())
review_after2020 = review_after2020.loc[review_after2020['comments'].ne('')]

# drop rows that have a missing value in any column
review_after2020 = review_after2020.dropna()

# keep only the comments with at least 5 characters, then renumber the rows
mask = review_after2020['comments'].str.len().ge(5)
review_after2020 = review_after2020.loc[mask].reset_index(drop=True)

In [7]:
# remove <br/>
def empty_lines(text):
    """Replace HTML line-break tags in ``text`` with a single space.

    Handles ``<br/>``, ``<br>`` and ``<br />`` in any letter case. A space is
    substituted (rather than nothing) so the words on either side of the tag
    are not fused into one token, e.g. 'great.<br/>Loved' -> 'great. Loved'.

    Parameters
    ----------
    text : str
        Raw review comment.

    Returns
    -------
    str
        The comment with every line-break tag replaced by one space.
    """
    return re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    
# Run empty_lines over every comment so the '<br/>' markup does not reach the later text cleaning.
review_after2020['comments'] = review_after2020['comments'].apply(empty_lines)

In [8]:
# remove host names from the comments
def english_only(text):
    """Keep only ASCII letters and spaces from ``text``.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str`` first.
        Missing values (``None`` or a float NaN) are treated as empty text.

    Returns
    -------
    str
        ``text`` with every character other than A-Z, a-z and space removed;
        '' for missing values.
    """
    # str(None) / str(float('nan')) would otherwise produce the literal words
    # 'None' / 'nan', which callers would then treat as a real name or comment.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return re.sub(r'[^A-Za-z ]+', '', str(text))

# normalise the host names: letters and spaces only, and the exact name 'Chas' becomes 'Charles'
listings['host_name'] = listings['host_name'].apply(english_only).replace('Chas', 'Charles')

# attach each review's host name by matching the review's listing_id to the listing id
merged_df = review_after2020.merge(
    listings[['id', 'host_name']],
    how='left',
    left_on='listing_id',
    right_on='id',
)

tqdm.pandas()
for index, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Processing rows"):
    name = row['host_name']
    if name == '':
        # nothing to remove for listings whose cleaned host name is empty
        continue
    # whole-word, case-insensitive removal of the full host name
    pattern = r'\b' + re.escape(name) + r'\b'
    merged_df.at[index, 'comments'] = re.sub(pattern, '', row['comments'], flags=re.IGNORECASE)

# the two 'id' columns produced by the merge are no longer needed
review_after2020 = merged_df.drop(columns=['id_x', 'id_y'])

Processing rows: 100%|██████████| 1270/1270 [00:00<00:00, 20828.51it/s]


In [9]:
# keep only English letters and spaces in the comments (digits, punctuation and other characters are dropped)
tqdm.pandas()
review_after2020 = review_after2020.assign(
    comments=review_after2020['comments'].progress_apply(english_only)
)

100%|██████████| 1270/1270 [00:00<00:00, 96168.37it/s]


In [10]:
# the removals above can leave comments blank or very short, so trim and filter again
review_after2020 = review_after2020.assign(comments=review_after2020['comments'].str.strip())
review_after2020 = review_after2020.loc[review_after2020['comments'].ne('')]

# keep only the comments with at least 5 characters, then renumber the rows
mask = review_after2020['comments'].str.len().ge(5)
review_after2020 = review_after2020.loc[mask].reset_index(drop=True)

In [11]:
# number of remaining reviews per listing, shown with the most-reviewed listing first
review_counts = review_after2020.groupby('listing_id').size().rename('review_count').reset_index()
review_counts.sort_values('review_count', ascending=False)

Unnamed: 0,listing_id,review_count
2,39471761,613
0,22215734,579
1,23395506,73


In [12]:
# keep the listings with at least 100 reviews; only these go through the LDA model
review_counts_100 = review_counts[review_counts['review_count'] >= 100]
print(len(review_counts_100))
count_100_ids = review_counts_100['listing_id'].unique()
# .copy() makes this subset an independent frame. Later cells overwrite and add
# columns on it; doing that on a bare slice of review_after2020 raises pandas'
# SettingWithCopyWarning (hidden in this notebook by the global warnings filter).
review_after2020_100 = review_after2020[review_after2020['listing_id'].isin(count_100_ids)].copy()

2


In [13]:
# for those listings with host names more than one word, e.g. Jeff and Shelley, use another method to remove them
# Drop only the standalone connector " and" / " And". The trailing \b stops the
# pattern from eating the start of a longer word (a plain str.replace(' And', '')
# would turn "Mary Anderson" into "Maryerson").
review_after2020_100['host_name'] = review_after2020_100['host_name'].apply(
    lambda s: re.sub(r' (?:and|And)\b', '', s)
)

tqdm.pandas()

def remove_host(reviews):
    """Remove each part of a multi-word host name from that row's comment, in place.

    For every row whose 'host_name' contains a space, the name is split on
    spaces and parts of one character or less are ignored. If more than one
    part remains, each part is deleted from 'comments' as a whole word,
    ignoring case. Rows that do not qualify are left untouched.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Frame with string columns 'host_name' and 'comments'.

    Returns
    -------
    None
        ``reviews`` is modified in place.
    """
    progress = tqdm(reviews.iterrows(), total=len(reviews), desc="Processing rows")
    for index, row in progress:
        names = row['host_name']
        comment = row['comments']
        if ' ' not in names:
            continue

        name_parts = [part for part in names.split(' ') if len(part) > 1]
        if len(name_parts) <= 1:
            continue

        for part in name_parts:
            whole_word = r'\b' + re.escape(part) + r'\b'
            comment = re.sub(whole_word, '', comment, flags=re.IGNORECASE)
        # one write per row, after every name part has been removed
        reviews.at[index, 'comments'] = comment

# Run the multi-word host-name removal on the subset kept for LDA; the frame is edited in place.
remove_host(review_after2020_100)

Processing rows: 100%|██████████| 1192/1192 [00:00<00:00, 26456.32it/s]


In [14]:
# NLTK's English stop words, de-duplicated and kept as a list
stop_words = list(set(stopwords.words('english')))

# these words basically don't mean anything, remove them
more = [
    'stay', 'place', 'would', 'Airbnb', 'airbnb', 'youre', 'time', 'area', 'street',
    'bnb', 'BNB', 'B&B', 'b&b', 'host', 'wife', 'husband', 'absolutely', 'really',
    'proximity', 'probably', 'definitely',
]
stop_words += more

def clean_text(headline):
    """Tokenize, filter and lemmatize one review comment.

    Tokens are lower-cased, then dropped when they are 3 characters or
    shorter or appear in the module-level ``stop_words`` list. The survivors
    are lemmatized with WordNet, and any lemma that is itself a stop word
    (e.g. 'stays' -> 'stay') is dropped as well.

    Parameters
    ----------
    headline : object
        The comment text. Anything that is not a ``str`` yields ''.

    Returns
    -------
    str
        The kept lemmas joined by single spaces ('' when nothing is kept).
    """
    if not isinstance(headline, str):
        return ''

    le = WordNetLemmatizer()
    # Compare in lower case so capitalised words ("This", "Really", "Stay")
    # are filtered just like their lower-case forms.
    stop_lookup = {word.lower() for word in stop_words}

    tokens = []
    for raw_token in word_tokenize(headline):
        token = raw_token.lower()
        if len(token) <= 3 or token in stop_lookup:
            continue
        # NOTE(review): WordNet lookups appear to be case-sensitive, which is
        # why the token is lower-cased before lemmatizing -- confirm.
        lemma = le.lemmatize(token)
        if lemma not in stop_lookup:
            tokens.append(lemma)
    return " ".join(tokens)

# Register tqdm's progress_apply on pandas, then store the cleaned form of every comment in 'cleaned_text'.
tqdm.pandas()
review_after2020_100['cleaned_text'] = review_after2020_100['comments'].progress_apply(clean_text)

100%|██████████| 1192/1192 [00:01<00:00, 1041.05it/s]


In [15]:
def lda(review, n_top_words=10):
    """Return the highest-weighted words of a single-topic LDA fit on ``review``.

    Parameters
    ----------
    review : pandas.DataFrame
        Reviews to model; the text is read from the 'cleaned_text' column.
    n_top_words : int, default 10
        How many words to return.

    Returns
    -------
    list of str
        Up to ``n_top_words`` vocabulary words, ordered from the largest
        topic weight to the smallest.
    """
    # TF-IDF features, limited to the 1000 highest-ranked terms and filtered
    # with the module-level stop_words list.
    # NOTE(review): scikit-learn's LDA examples use raw term counts
    # (CountVectorizer); TF-IDF is kept here so existing results are unchanged.
    vect = TfidfVectorizer(stop_words=stop_words, max_features=1000)
    vect_text = vect.fit_transform(review['cleaned_text'])

    # One topic, one online pass, fixed seed. Only the fitted components are
    # needed, so the per-document topic distribution is not computed.
    lda_model = LatentDirichletAllocation(n_components=1, learning_method='online', random_state=42, max_iter=1)
    lda_model.fit(vect_text)

    vocab = vect.get_feature_names_out()

    # With n_components=1 there is exactly one row of word weights.
    topic_weights = lda_model.components_[0]
    ranked = sorted(zip(vocab, topic_weights), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:n_top_words]]

In [16]:
# run lda model on the listings with at least 100 reviews
unique_ids = list(review_counts_100['listing_id'].unique())
data = {'id': unique_ids, 'topic': [None] * len(unique_ids)}
listing_topic = pd.DataFrame(data)

# 'listing_id' is used as the loop variable so the builtin id() is not shadowed
for listing_id in tqdm(list(count_100_ids)):
    listing_reviews = review_after2020_100[review_after2020_100['listing_id'] == listing_id]
    # dict.fromkeys removes duplicates while keeping lda()'s ranking; going
    # through set() would shuffle the words, so the "top 5" taken from this
    # string later would be an arbitrary subset of the ten.
    topic_list = list(dict.fromkeys(lda(listing_reviews)))
    topic_string = ', '.join(topic_list)
    listing_topic.loc[listing_topic['id'] == listing_id, 'topic'] = topic_string

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 11.79it/s]


In [17]:
# get the top five topics of each listing
def top5(text):
    """Return the first five entries of a ', '-separated topic string.

    Parameters
    ----------
    text : object
        Topic words joined by ', '. Any non-string value (``None``, NaN, ...)
        is treated as "no topics".

    Returns
    -------
    list of str or None
        Up to five words in their original order; ``[]`` for an empty string;
        ``None`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None
    # ''.split(', ') yields [''], so empty pieces are filtered out to keep a
    # blank topic string from producing a blank "topic".
    words = [word for word in text.split(', ') if word]
    return words[:5]

listing_topic['top5'] = listing_topic['topic'].apply(top5)

# one column per rank; None when the listing has fewer topics than that rank
rank_columns = ['first', 'second', 'third', 'fourth', 'fifth']
for rank, column in enumerate(rank_columns):
    listing_topic[column] = listing_topic['top5'].apply(
        lambda words, rank=rank: words[rank] if words is not None and len(words) > rank else None
    )

# add ', ' so that it looks better on Tableau (the last column gets no separator)
for column in rank_columns[:-1]:
    listing_topic[column] = listing_topic[column].apply(
        lambda word: word + ', ' if word is not None else None
    )

In [18]:
# add another column; if safe is one of the topics (no matter which one), mark it
# The marker is ', safe' when the exact word 'safe' is among the listing's
# topics but not already among its top five; otherwise None. The topic string
# is split into words first so that 'safety' or 'unsafe' do not count, and
# rows without a topic string get None instead of raising.
listing_topic['safe'] = listing_topic.apply(
    lambda row: ', safe'
    if isinstance(row['topic'], str)
    and 'safe' in row['topic'].split(', ')
    and 'safe' not in (row['top5'] or [])
    else None,
    axis=1,
)

listing_topic = listing_topic.drop('top5', axis=1)

In [19]:
# left-join the topic columns onto the listings and start every row with an empty tooltip title
listing_topic_joined = listings.merge(listing_topic, how='left', on='id').assign(topic_title='')

# this column will be used on the tooltip of the map; only rows that have topics get the title text
rows_with_topic = listing_topic_joined['topic'].notna()
listing_topic_joined.loc[rows_with_topic, 'topic_title'] = 'Top Topics of Comments: '

In [20]:
# Save the listings joined with their topic columns as a CSV file under data/.
listing_topic_joined.to_csv('data/Airbnb_BasicInfo_check.csv')