In [1]:
import pandas as pd

# Load the two CSV exports this notebook works from.
# reviews_df supplies the per-listing "group" label used in the merge below;
# all_reviews supplies the "listing_id" / "comments" columns that get grouped.
reviews_df = pd.read_csv(filepath_or_buffer="reviews_df_export.csv")
all_reviews = pd.read_csv(filepath_or_buffer="all_reviews_export.csv")


# top_reviews = pd.read_csv("top_reviews_export.csv")
# bottom_reviews = pd.read_csv("bottom_reviews_export.csv")


In [5]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
# Build one row per listing holding every review comment joined into a single
# space-separated string.
# Missing comments (NaN) are dropped before joining: passing them through
# str() would inject the literal word "nan" into the text and, downstream,
# into the keyword counts. A listing whose comments are all missing keeps its
# row with an empty string.
all_grouped = (
    all_reviews.groupby("listing_id")["comments"]
    .apply(lambda comments: " ".join(map(str, comments.dropna())))
    .reset_index(name="combined_reviews")
)


In [7]:
# Attach each listing's "group" label to its combined review text.
# A left join keeps every listing in all_grouped; listings that have no match
# in reviews_df end up with a missing group.
merged = pd.merge(
    left=all_grouped,
    right=reviews_df.loc[:, ["listing_id", "group"]],
    how="left",
    on="listing_id",
)


In [8]:
import spacy
# Load the spaCy "en_core_web_sm" pipeline once; spacy_tokens and extract_adjs
# below both call this shared `nlp` object.
# NOTE(review): the recorded keyword output contains French words ("de", "la",
# "très"), so some reviews are not English and pass through this English
# pipeline unfiltered — confirm whether they should be excluded upstream.
nlp = spacy.load("en_core_web_sm")
def spacy_tokens(text):
    """Turn a piece of text into a list of content-word lemmas.

    The text is lower-cased and run through the shared spaCy pipeline `nlp`.
    Only purely alphabetic tokens that are not stop words are kept, and each
    kept token contributes its lemma.

    Parameters
    ----------
    text : object
        Text to tokenize. Anything that is not a `str` yields an empty list.

    Returns
    -------
    list of str
        Lemmas in document order.
    """
    lemmas = []
    if isinstance(text, str):
        for tok in nlp(text.lower()):
            # Skip stop words and anything containing digits/punctuation.
            if tok.is_stop or not tok.is_alpha:
                continue
            lemmas.append(tok.lemma_)
    return lemmas

# Tokenize every listing's combined review text (one list of lemmas per row).
merged["tokens"] = merged["combined_reviews"].map(spacy_tokens)


In [9]:
# Token lists for the listings labelled "Top 5" and "Bottom 5" respectively.
top_tokens = merged.loc[merged["group"].eq("Top 5"), "tokens"]
bottom_tokens = merged.loc[merged["group"].eq("Bottom 5"), "tokens"]


In [10]:
import pandas as pd

# Flatten the per-listing token lists of each group into one long Series and
# keep the 20 most frequent lemmas.
top_words = pd.Series([t for tokens in top_tokens for t in tokens])
top_freq = top_words.value_counts().head(20)
# sep="\n" starts the table on its own line. The previous "...:\n", obj form
# let print's default space separator follow the newline, which shifted the
# first row one column to the right of the others.
print("Top Keywords:", top_freq, sep="\n")
bottom_words = pd.Series([t for tokens in bottom_tokens for t in tokens])
bottom_freq = bottom_words.value_counts().head(20)
print("Bottom Keywords:", bottom_freq, sep="\n")


Top Keywords:
 stay           134
place           94
host            57
love            44
great           42
lisa            39
beautiful       39
space           37
need            36
recommend       32
feel            30
clean           30
definitely      29
comfortable     28
airbnb          27
peaceful        27
perfect         27
location        27
time            26
jenny           26
Name: count, dtype: int64
Bottom Keywords:
 stay        18
de          17
la          15
host        15
airbnb      11
place       11
night       11
le           7
location     7
bathroom     7
friendly     6
très         6
room         6
shower       6
parking      6
et           6
à            5
pay          5
day          5
bed          5
Name: count, dtype: int64


In [12]:
def extract_adjs(text):
    """Return the lemmas of every adjective spaCy tags in `text`.

    The text is lower-cased and run through the shared spaCy pipeline `nlp`;
    each token whose coarse part-of-speech tag is "ADJ" contributes its lemma.

    Parameters
    ----------
    text : object
        Text to analyse. Anything that is not a `str` (for example NaN or
        None) yields an empty list instead of raising on `.lower()`, matching
        the guard in `spacy_tokens`.

    Returns
    -------
    list of str
        Adjective lemmas in document order.
    """
    if not isinstance(text, str):
        return []
    doc = nlp(text.lower())
    return [token.lemma_ for token in doc if token.pos_ == "ADJ"]

# Extract the adjective lemmas of every listing's combined review text.
merged["adjectives"] = merged["combined_reviews"].map(extract_adjs)


In [13]:
# Pool the adjective lemmas of every "Top 5" listing and keep the 20 most
# frequent ones.
top_adj_list = [a for row in merged[merged["group"] == "Top 5"]["adjectives"] for a in row]
top_adj_series = pd.Series(top_adj_list)
top_adj_freq = top_adj_series.value_counts().head(20)

# sep="\n" starts the table on its own line; with "...:\n", obj the default
# space separator indented the first row by one column.
print("Top adjectives:", top_adj_freq, sep="\n")


Top adjectives:
 great          42
beautiful      38
clean          30
comfortable    28
perfect        27
peaceful       27
amazing        24
good           24
wonderful      23
friendly       21
quiet          20
responsive     19
private        17
helpful        17
easy           16
lovely         16
many           15
more           13
little         12
short          11
Name: count, dtype: int64


In [14]:
# Pool the adjective lemmas of every "Bottom 5" listing and keep the 20 most
# frequent ones.
bottom_adj_list = [a for row in merged[merged["group"] == "Bottom 5"]["adjectives"] for a in row]
bottom_adj_series = pd.Series(bottom_adj_list)
bottom_adj_freq = bottom_adj_series.value_counts().head(20)

# sep="\n" starts the table on its own line; with "...:\n", obj the default
# space separator indented the first row by one column.
print("Bottom adjectives:", bottom_adj_freq, sep="\n")


Bottom adjectives:
 friendly         6
basic            4
nous             4
dirty            3
old              3
hot              3
airbnb           3
overnight        3
responsive       3
safe             3
nice             3
great            3
clean            3
quiet            3
dead             2
beautiful        2
other            2
busy             2
bad              2
communicative    2
Name: count, dtype: int64


In [16]:
# Keyword lists per review theme; matched against lemmas by theme_counts.
theme_words = {
    "cleanliness": ["clean", "dirty", "dust", "mold", "spotless"],
    "noise": ["noise", "noisy", "quiet", "sound", "loud"],
    "communication": ["communication", "reply", "responsive", "friendly"],
    "accuracy": ["accurate", "misleading", "photo", "description"],
    "comfort": ["comfortable", "bed", "cozy", "warm", "spacious"],
}


def theme_counts(tokens):
    """Count how many tokens fall under each theme in `theme_words`.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to classify; repeated tokens are counted each time they occur.

    Returns
    -------
    dict
        Maps every theme name in `theme_words` (in definition order) to the
        number of tokens found in that theme's keyword list.
    """
    counts = {}
    for theme, keywords in theme_words.items():
        hits = 0
        for tok in tokens:
            if tok in keywords:
                hits += 1
        counts[theme] = hits
    return counts

# Per-listing theme hit counts (one dict per row) derived from the lemma lists.
merged["themes"] = merged["tokens"].map(theme_counts)


In [17]:
# Expand each group's per-listing theme dicts into a frame with one column per
# theme, then report the mean hit count per theme.
top_theme_df = pd.DataFrame(list(merged[merged["group"]=="Top 5"]["themes"]))
bottom_theme_df = pd.DataFrame(list(merged[merged["group"]=="Bottom 5"]["themes"]))

# sep="\n" starts each table on its own line; with "...:\n", obj the default
# space separator indented the first row by one column.
print("Top Theme Averages:", top_theme_df.mean(), sep="\n")
print("Bottom Theme Averages:", bottom_theme_df.mean(), sep="\n")


Top Theme Averages:
 cleanliness       6.8
noise             4.6
communication     9.6
accuracy          0.4
comfort          12.4
dtype: float64
Bottom Theme Averages:
 cleanliness      1.8
noise            1.4
communication    2.0
accuracy         0.6
comfort          1.2
dtype: float64
