In [23]:
import pandas as pd
from tqdm import tqdm
import re
import regex
import warnings
warnings.filterwarnings("ignore")

In [24]:
# Load the review table (its first CSV column becomes the index) and the
# table of listings that already have extracted keywords.
review_after2020 = pd.read_csv(filepath_or_buffer='reviews_2020_check_safe.csv', index_col=0)
keywords = pd.read_csv(filepath_or_buffer='keywords.csv')

In [25]:
# Concatenate every review of a listing into one space-separated string.
# Missing comments (NaN) are skipped and remaining values are cast to str,
# because str.join raises TypeError on non-string items; a listing whose
# comments are all missing ends up with an empty string.
safe_df = (
    review_after2020
    .groupby('listing_id')['comments']
    .apply(lambda x: " ".join(x.dropna().astype(str)))
    .reset_index()
)

In [26]:
# Lower-case terms that mark a review as talking about safety.
safety_keywords = [
    "safe",
    "security",
    "danger",
    "unsafe",
    "safety",
    "dangerous",
]

In [27]:
# see if the comments contain safety-related content
def is_safety_related(review_text, keywords=None):
    """Return True if any keyword pattern occurs anywhere in ``review_text``.

    Matching is case-insensitive and substring-based (``re.search``), so the
    pattern "safe" also matches "unsafe" or "safely".

    Parameters
    ----------
    review_text : str
        Text to scan. Any non-string value (None, NaN, numbers) is treated as
        containing no safety content instead of raising ``TypeError``.
    keywords : iterable of str, optional
        Regular-expression patterns to look for. Defaults to the module-level
        ``safety_keywords`` list.

    Returns
    -------
    bool
    """
    if not isinstance(review_text, str):
        return False
    if keywords is None:
        # resolved at call time so later edits to the global list are honoured
        keywords = safety_keywords
    return any(
        re.search(keyword, review_text, flags=re.IGNORECASE) is not None
        for keyword in keywords
    )

# Register tqdm's progress_apply on pandas objects, then flag each listing
# whose combined comments mention a safety keyword.
tqdm.pandas()
safe_df = safe_df.assign(
    Safety_Related=safe_df['comments'].progress_apply(is_safety_related)
)

100%|██████████| 4/4 [00:00<00:00, 10958.34it/s]


In [28]:
# exclude rows that have gone through keyword extraction
# Keep only listings flagged as safety related whose id is absent from the
# `keywords` table, then discard the helper flag column.
listing_ids_to_remove = keywords['listing_id'].unique()

is_flagged = safe_df['Safety_Related'].eq(True)
already_processed = safe_df['listing_id'].isin(listing_ids_to_remove)

# A non-inplace drop returns a fresh DataFrame, so later column assignments
# do not write into a filtered slice of the original frame
# (the chained-assignment pattern behind SettingWithCopyWarning).
safe_df = safe_df.loc[is_flagged & ~already_processed].drop(columns=['Safety_Related'])
print(len(safe_df))

1


In [29]:
# extract the words around the keyword
def extract_context(text, window=3, keywords=None):
    """Return a window of words around every safety keyword found in ``text``.

    The text is tokenised with ``\\b\\w+\\b`` (punctuation is dropped, so
    "isn't" becomes "isn" and "t"). A token matches when its lower-cased form
    equals one of the keywords exactly.

    Parameters
    ----------
    text : str
        Text to scan. Non-string values (None, NaN) yield an empty list
        instead of raising ``TypeError``.
    window : int, optional
        Number of words kept on each side of the keyword (default 3). The
        window is clipped at the start and end of the text.
    keywords : iterable of str, optional
        Words to look for. Defaults to the module-level ``safety_keywords``.

    Returns
    -------
    list of str
        One space-joined snippet per keyword occurrence, in order of
        appearance; empty when nothing matches.
    """
    if not isinstance(text, str):
        return []
    if keywords is None:
        # resolved at call time so later edits to the global list are honoured
        keywords = safety_keywords
    targets = {keyword.lower() for keyword in keywords}

    words = re.findall(r'\b\w+\b', text)
    context_lists = []

    for i, word in enumerate(words):
        if word.lower() in targets:
            start = max(0, i - window)
            end = min(i + window + 1, len(words))
            context_lists.append(" ".join(words[start:end]))

    return context_lists

# Attach, per listing, the list of keyword-centred snippets found in its comments.
safe_df = safe_df.assign(
    Safety_Context=safe_df['comments'].apply(extract_context)
)

In [30]:
# determine whether it's unsafe or not
# A listing is labelled 'Not Safe' only when it has at least one safety
# snippet and EVERY snippet reads as negative; otherwise the label stays None.
#
# Matching is done on whole words. Substring checks would treat "another" or
# "nothing" as the negation "not", and "unsafe"/"safety" as the word "safe",
# which mislabels e.g. "not unsafe" or "another safe area" as 'Not Safe'.
_NEGATION_WORDS = {'not', 'never'}
_UNSAFE_WORDS = {'unsafe', 'dangerous'}


def _context_is_unsafe(context):
    """Return True when one snippet describes the place as unsafe.

    * contains "unsafe"/"dangerous" and no negation  -> unsafe
    * contains "unsafe"/"dangerous" with a negation  -> not unsafe
    * contains "safe" together with a negation       -> unsafe
    * anything else                                  -> not unsafe
    """
    tokens = set(re.findall(r'\b\w+\b', context.lower()))
    negated = bool(tokens & _NEGATION_WORDS)
    if tokens & _UNSAFE_WORDS:
        return not negated
    return 'safe' in tokens and negated


def _label_unsafe(contexts):
    """Return 'Not Safe' if ``contexts`` is non-empty and all snippets are unsafe, else None."""
    if len(contexts) > 0 and all(_context_is_unsafe(item) for item in contexts):
        return 'Not Safe'
    return None


# The same rule covers one or many snippets, so single- and multi-snippet
# listings are judged consistently (including the "dangerous" keyword).
safe_df['not_safe_all'] = safe_df['Safety_Context'].apply(_label_unsafe)

In [31]:
# join back to the listing file
# Left join keeps every listing; listings without a safety label get a
# missing value in `not_safe_all`. The duplicate key column from the right
# side is removed afterwards.
listings = pd.read_csv('data/Airbnb_keywords.csv', index_col=0)
listing_safety_joined = listings.merge(
    safe_df[['listing_id', 'not_safe_all']],
    how='left',
    left_on='id',
    right_on='listing_id',
).drop(columns=['listing_id'])

In [32]:
# Merge the two safety-related columns into `safe_all` (used by the visualization).
# Rows whose `not_safe_all` is 'Not Safe' or 'Safe' overwrite `safe_all`;
# every other row keeps whatever `safe_all` already held.
if 'safe_all' in listing_safety_joined.columns:
    # object dtype so string labels can be written even if the column was
    # read from CSV as all-NaN float64
    current_safe_all = listing_safety_joined['safe_all'].astype(object)
else:
    # always create the column so the output schema does not depend on
    # whether any listing happened to receive a label
    current_safe_all = pd.Series(index=listing_safety_joined.index, dtype=object)

new_labels = listing_safety_joined['not_safe_all']
has_label = new_labels.isin(['Not Safe', 'Safe'])

# Series.mask replaces values where the condition is True
listing_safety_joined['safe_all'] = current_safe_all.mask(has_label, new_labels)
listing_safety_joined = listing_safety_joined.drop(columns=['not_safe_all'])

In [33]:
# Write the merged listings to disk, then show the frame as the cell output.
listing_safety_joined.to_csv(path_or_buf='data/Airbnb_final.csv')
listing_safety_joined

Unnamed: 0,id,listing_url,name,host_id,host_name,latitude,longitude,property_type,room_type,price,...,one,two,three,four,five,six,seven,safe,safe_all,keywords_title
0,7095631,https://www.airbnb.com/rooms/7095631,Condo in Torrance · ★4.70 · 1 bedroom · 1 bed ...,6205097,Irina,33.82065,-118.30587,Private room in condo,Private Room,$35.00,...,,,,,,,,,Not Safe,
1,22215734,https://www.airbnb.com/rooms/22215734,Guest suite in Culver City · ★4.94 · Studio · ...,162330741,Isaac,33.98326,-118.38873,Entire guest suite,Entire Home/Apartment,$119.00,...,"great,","clean,","great place,","great stay,","great location,","nice,",airport,,,Top keywords of Comments:
2,23395506,https://www.airbnb.com/rooms/23395506,Guest suite in Los Angeles · ★4.99 · 1 bedroom...,174427526,Jeff and Shelley,34.04528,-118.42078,Entire guest suite,Entire Home/Apartment,$119.00,...,"hosts,","great,","beautiful,","garden,","amazing,","great place,",amazing hosts,,,Top keywords of Comments:
3,39471761,https://www.airbnb.com/rooms/39471761,Guest suite in Hawthorne · ★4.93 · Studio · 1 ...,153462014,Grant,33.92186,-118.36916,Entire guest suite,Entire Home/Apartment,$142.00,...,"great place,","great,","clean,","great stay,","great location,","nice,",nice place,,,Top keywords of Comments:
