# Read Dataset & Create subset for demo

In [None]:
# Read the dataset into a pandas dataframe

import pandas as pd

filepath = '/Users/hakukazuho/Desktop/Luxurious-Hotel-Review/cleaned-hotel-review-for-ba820/hotel_reviews_cleaned.csv'
df = pd.read_csv(filepath)
df.head()

In [None]:
# Create a new dataframe with only the reviews for the randomly selected hotels

import random
hotel_name_list = list(df['Hotel_Name'].unique())
random.seed(0)
n = 20
random_hotel_names = random.sample(hotel_name_list, n)
subset = df[df['Hotel_Name'].isin(random_hotel_names)]
subset.info()

In [27]:
import nltk

# Function to extract keywords/tags from a positive review
def extract_keywords(review):
    tokens = nltk.word_tokenize(review)
    tagged_tokens = nltk.pos_tag(tokens)
    
    keywords = []
    noun = None
    for word, pos in tagged_tokens:
        if pos.startswith('NN'):  # Look for nouns
            noun = word
        elif pos.startswith('JJ') and noun:  # Look for adjectives describing nouns
            keywords.append(f"{word} {noun}")
            noun = None
    
    return keywords

subset['Keywords'] = subset['Positive_Review'].apply(extract_keywords)

table = subset[['Hotel_Name', 'Positive_Review', 'Keywords']]
table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['Keywords'] = subset['Positive_Review'].apply(extract_keywords)


Unnamed: 0,Hotel_Name,Positive_Review,Keywords
34311,Le Marcel,Spacious room comfortable bed nice view compl...,"[comfortable room, complimentary view, alcohol..."
34312,Le Marcel,Handy for the Gare de l Est and Gare de Nord ...,[quiet Comfortable]
34313,Le Marcel,Staff and location generally,[]
34314,Le Marcel,Good soundproofing and comfortable beds The b...,"[comfortable soundproofing, good bedrooms]"
34315,Le Marcel,Right across from a metro stop with excellent...,"[excellent stop, central connections, comforta..."
...,...,...,...
440640,Abba Garden,Nice place Comfortable beds polite staff,[polite Comfortable]
440641,Abba Garden,Clean Well equipped Close to Camp Nou and met...,[Good Nou]
440642,Abba Garden,Breakfast was nice,[nice Breakfast]
440643,Abba Garden,Room and distance from airport,[]


In [28]:
# Function to extract the noun from a keyword combo of adjective + noun
def extract_noun(keyword):
    tokens = nltk.word_tokenize(keyword)
    tagged_tokens = nltk.pos_tag(tokens)
    nouns = [word for word, pos in tagged_tokens if pos.startswith('NN')]
    return nouns[0] if nouns else None


table['Tags'] = table['Keywords'].apply(lambda keywords: list(set([extract_noun(keyword) for keyword in keywords])))
table['Tags'] = table['Tags'].apply(lambda tags: [tag.capitalize() if tag is not None else None for tag in tags])

tags_by_hotel = table.groupby('Hotel_Name')['Tags'].sum().reset_index()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table['Tags'] = table['Keywords'].apply(lambda keywords: list(set([extract_noun(keyword) for keyword in keywords])))


Unnamed: 0,Hotel_Name,Positive_Review,Keywords,Tags
34311,Le Marcel,Spacious room comfortable bed nice view compl...,"[comfortable room, complimentary view, alcohol...","[drinks, view, room, non]"
34312,Le Marcel,Handy for the Gare de l Est and Gare de Nord ...,[quiet Comfortable],[None]
34313,Le Marcel,Staff and location generally,[],[]
34314,Le Marcel,Good soundproofing and comfortable beds The b...,"[comfortable soundproofing, good bedrooms]","[bedrooms, soundproofing]"
34315,Le Marcel,Right across from a metro stop with excellent...,"[excellent stop, central connections, comforta...","[stop, Room, connections]"
...,...,...,...,...
440640,Abba Garden,Nice place Comfortable beds polite staff,[polite Comfortable],[polite]
440641,Abba Garden,Clean Well equipped Close to Camp Nou and met...,[Good Nou],[Nou]
440642,Abba Garden,Breakfast was nice,[nice Breakfast],[Breakfast]
440643,Abba Garden,Room and distance from airport,[],[]


In [40]:
import nltk

# Function to filter out non-valid nouns from the Tags column
def filter_valid_nouns(tags):
    valid_nouns = []
    for tag in tags:
        if isinstance(tag, str) and nltk.pos_tag([tag])[0][1].startswith('NN'):
            valid_nouns.append(tag)
    return valid_nouns

# Apply the filter_valid_nouns function to the Tags column and store the result in the Valid_Tags column
tags_by_hotel['Valid_Tags'] = tags_by_hotel['Tags'].apply(filter_valid_nouns)

# Display the updated tags_by_hotel dataframe
tags_by_hotel


Unnamed: 0,Hotel_Name,Tags,Valid_Tags
0,Abba Garden,"[People, Superb, Pool, Restaurant, Size, Probl...","[People, Superb, Pool, Restaurant, Size, Probl..."
1,Amba Hotel Marble Arch,"[Beds, Times, Hotel, Helpful, Restaurants, Roo...","[Beds, Times, Hotel, Restaurants, Room, Locati..."
2,Banke H tel,"[Shower, Room, Hotel, Building, Staff, Locatio...","[Shower, Room, Hotel, Building, Staff, Locatio..."
3,Best Western Aulivia Op ra,"[Bathrooms, Paris, Hotel, Rooms, Restaurants, ...","[Bathrooms, Paris, Hotel, Rooms, Restaurants, ..."
4,Courthouse Hotel Shoreditch,"[Extra, Bed, Comfy, Convenient, Staff, Upgrade...","[Bed, Comfy, Convenient, Staff, Upgrade, Bar, ..."
5,Crowne Plaza Barcelona Fira Center,"[Wine, Day, Daughter, Room, Property, Walk, Ro...","[Wine, Day, Daughter, Room, Property, Walk, Ro..."
6,H tel De Sers Champs Elys es Paris,"[Staff, Breakfast, Location, Way, Clean, Staff...","[Staff, Breakfast, Location, Way, Staffs, Loca..."
7,Hotel Schani Wien,"[Lines, Closeness, Tram, Convenient, Hotel, Co...","[Lines, Closeness, Tram, Convenient, Hotel, Co..."
8,Hotel Serhs Rivoli Rambla,"[Groups, Nobody, Hotels, Work, Quiet, Restaura...","[Groups, Nobody, Hotels, Work, Restaurant, Nig..."
9,Hotel du Louvre in the Unbound Collection by H...,"[City, Staff, Days, Clean, Beat, Helpful, Food...","[City, Staff, Days, Beat, Food, Trip, Member, ..."


In [41]:
from matplotlib import pyplot as plt
from ipywidgets import interact, Dropdown
from wordcloud import WordCloud

def generate_word_cloud():
    # Create a dropdown menu to select the hotel name
    hotel_dropdown = Dropdown(options=tags_by_hotel['Hotel_Name'].unique(), description='Hotel Name:')

    # Define a function to generate the word cloud
    def generate_cloud(hotel_name):
        # Filter the dataframe based on the hotel name
        hotel_df = tags_by_hotel[tags_by_hotel['Hotel_Name'] == hotel_name]

        # Combine all the positive tags into a single string
        tags = hotel_df['Valid_Tags'].sum()
        tags_text = ' '.join(tag for tag in tags if tag is not None)

        # Create a word cloud object
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(tags_text)

        # Generate the title for the word cloud
        title = f"The Top Tags of {hotel_name}"

        # Display the word cloud with the title
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(title)
        plt.axis('off')
        plt.show()

    # Interact the dropdown menu with the generate_cloud function
    interact(generate_cloud, hotel_name=hotel_dropdown)


In [42]:
generate_word_cloud()

interactive(children=(Dropdown(description='Hotel Name:', options=('Abba Garden', 'Amba Hotel Marble Arch', 'B…