# Recommendation System 1.0 (Current status)

## Downloading & Cleaning Data

### Downloading Original Dataset from Kaggle

In [1]:
!pip install opendatasets



In [5]:
# Eric's Kaggle API key
# "username":"ericthedataguy",
# "key":"875c0de0faea2fefa081c78eb470a347"
import opendatasets as od
import pandas as pd

od.download(
    "https://www.kaggle.com/datasets/jiashenliu/515k-hotel-reviews-data-in-europe")

df = pd.read_csv("515k-hotel-reviews-data-in-europe/Hotel_Reviews.csv")

Skipping, found downloaded files in "./515k-hotel-reviews-data-in-europe" (use force=True to force download)


### Fill in geographical data using Google API

In [7]:
from geopy.geocoders import GoogleV3
import pandas as pd
missing_geo_hotel = list(df[df['lat'].isna()]['Hotel_Name'].value_counts().reset_index()['Hotel_Name'])
missing_geo_address = list(df[df['lat'].isna()]['Hotel_Address'].value_counts().reset_index()['Hotel_Address'])

# Create a geocoder object
geolocator = GoogleV3(api_key='AIzaSyCo0MJ4SypoxliSIn-yyNG4F_eCFncRXoU')

# Define a function to get the latitude and longitude of an address
def get_coordinates(address):
    location = geolocator.geocode(address)
    if location:
        return location.latitude, location.longitude
    else:
        return None

# Get the coordinates for each address in missing_geo_address
coordinates = [get_coordinates(address) for address in missing_geo_address]

# Create a new dataframe with hotel name, longitude, and latitude
new_df = pd.DataFrame({'Hotel_Name': missing_geo_hotel, 'Longitude': [coord[1] if coord else None for coord in coordinates], 'Latitude': [coord[0] if coord else None for coord in coordinates]})

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Hotel_Address                               515738 non-null  object 
 1   Additional_Number_of_Scoring                515738 non-null  int64  
 2   Review_Date                                 515738 non-null  object 
 3   Average_Score                               515738 non-null  float64
 4   Hotel_Name                                  515738 non-null  object 
 5   Reviewer_Nationality                        515738 non-null  object 
 6   Negative_Review                             515738 non-null  object 
 7   Review_Total_Negative_Word_Counts           515738 non-null  int64  
 8   Total_Number_of_Reviews                     515738 non-null  int64  
 9   Positive_Review                             515738 non-null  object 
 

In [12]:
# Ensure both key columns are of the same data type, here assuming 'Hotel_Name' column exists in `df`
df['Hotel_Name'] = df['Hotel_Name'].astype(str)  # Adjust if 'Hotel_Name' is the actual column you want to merge on in `df`
new_df['Hotel_Name'] = new_df['Hotel_Name'].astype(str)

# Merge using columns instead of trying to merge an index with a column
df_filled = df.merge(new_df, left_on='Hotel_Name', right_on='Hotel_Name', how='left')
df_filled['Latitude'] = df_filled['Latitude'].fillna(df_filled['lat'])
df_filled['Longitude'] = df_filled['Longitude'].fillna(df_filled['lng'])
df_filled = df_filled.drop(columns=['lat', 'lng'])

## Preprocessing Data for Recommendation System

### Subsetting random 10k rows (1729)

In [15]:
subset = df_filled.sample(n=10000, random_state=1729)
subset.to_csv('10k_subset.csv', index=False)
subset = pd.read_csv('10k_subset.csv')
subset.describe()

Unnamed: 0,Additional_Number_of_Scoring,Average_Score,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Longitude,Latitude
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,497.1283,8.40858,18.4914,2717.1875,17.7066,7.2255,8.38638,2.899397,49.450665
std,509.107355,0.545186,30.42837,2330.16349,22.248506,11.131291,1.644268,4.691857,3.442442
min,6.0,6.4,0.0,54.0,0.0,1.0,2.5,-0.369758,41.328376
25%,168.0,8.1,0.0,1145.0,5.0,1.0,7.5,-0.142745,48.214662
50%,337.0,8.5,9.0,2061.0,11.0,3.0,8.8,0.019886,51.499981
75%,639.0,8.8,23.0,3598.0,22.0,8.0,9.6,4.834975,51.516048
max,2682.0,9.6,386.0,16670.0,367.0,355.0,10.0,16.445799,52.400181


### Extract all tags from Positive_Review for each review

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to extract keywords/tags from a positive review
def extract_keywords(review):
    tokens = nltk.word_tokenize(review)
    tagged_tokens = nltk.pos_tag(tokens)
    keywords = []
    noun = None
    for word, pos in tagged_tokens:
        if pos.startswith('NN'):  # Look for nouns
            noun = word
        elif pos.startswith('JJ') and noun:  # Look for adjectives describing nouns
            keywords.append(f"{word} {noun}")
            noun = None
    return keywords

subset['Keywords'] = subset['Positive_Review'].apply(extract_keywords)

### Pool tags for each hotel & add hotel name (groupby hotel name)

In [None]:
# Function to extract the noun from a keyword combo of adjective + noun
def extract_noun(keyword):
    tokens = nltk.word_tokenize(keyword)
    tagged_tokens = nltk.pos_tag(tokens)
    nouns = [word for word, pos in tagged_tokens if pos.startswith('NN')]
    return nouns[0] if nouns else None


subset['text_tags'] = subset['Keywords'].apply(lambda keywords: list(set([extract_noun(keyword) for keyword in keywords])))
subset['text_tags'] = subset['text_tags'].apply(lambda tags: [tag.capitalize() if tag is not None else None for tag in tags])

subset['transactions'] = subset.apply(lambda row: [row['Hotel_Name']] + row['text_tags'], axis=1)

### Filter & keep the most popular 10 tags for each hotel

In [None]:
# Find the top 10 tags for each hotel
from collections import Counter

table1 = subset.groupby('Hotel_Name')['transactions'].sum().reset_index()

hotel_lists = {}

for hotel_name, transactions in table1.groupby('Hotel_Name')['transactions']:
    transaction_list = transactions.tolist()
    flattened_list = [item for sublist in transaction_list for item in sublist]
    counter = Counter(flattened_list)
    top_50_elements = [element for element, count in counter.most_common(10)]
    hotel_lists[hotel_name] = top_50_elements

In [None]:
top_tags = pd.DataFrame({
    'Hotel_Name': list(hotel_lists.keys()),
    'Top_Tags': list(hotel_lists.values())
})

merged_table = subset.merge(top_tags, on='Hotel_Name')

# Check each item in the list in transactions and add to a new list if it's in the list in Top_Tags
merged_table['new_transactions'] = merged_table.apply(lambda row: [item for item in row['transactions'] if item in row['Top_Tags']], axis=1)
merged_table.head()

## Recommendation System

### Conduct Market Basket Analysis 

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


# apply transactionencoder
te = TransactionEncoder()
merged_table['new_transactions'] = merged_table['new_transactions'].apply(lambda x: [item for item in x if item is not None])
te_ary = te.fit(merged_table['new_transactions']).transform(merged_table['new_transactions'])
df_te = pd.DataFrame(te_ary, columns=te.columns_)

# create a unique list of hotel name
antecedents_hotel = merged_table['Hotel_Name'].unique()

# apply Apriori algorithm
frequent_itemsets = apriori(df_te, min_support=0.0000001, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)
rules['antecedents'] = rules['antecedents'].apply(lambda x: set(x))
rules['consequents'] = rules['consequents'].apply(lambda x: set(x))

# filter rules
single_antecedent_consequent_rules = rules[(rules['antecedents'].map(len) == 1) &
                                           (rules['consequents'].map(len) == 1) &
                                           (rules['antecedents'].apply(lambda x: list(x)[0]).isin(antecedents_hotel))]

rules_table = single_antecedent_consequent_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

### Function that returns most popular tags for given hotel name

In [None]:
# Function that finds the most popular tags of chosen hotel

def popular_tags(hotel_name):
    print(rules_table[rules_table['antecedents'] == {hotel_name}]['consequents'])

popular_tags('Hotel Arena')

# Recommendation System 2.0 (Improvement)

### Plan for improvement

**Better text-to-tags results**

1. Try n-grams

2. Try TF-IDF

3. Try Sentiment Analysis

**Better user experiences**

1. Word cloud with dropdown

2. Input tags -> output hotel name

**Other issues:**

1. Subset using random seed

2. Adjust rules

3. Choose top tags first, then add hotel name

### Better text-tags results

#### Current version

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to extract keywords/tags from a positive review
def extract_keywords(review):
    tokens = nltk.word_tokenize(review)
    tagged_tokens = nltk.pos_tag(tokens)
    keywords = []
    noun = None
    for word, pos in tagged_tokens:
        if pos.startswith('NN'):  # Look for nouns
            noun = word
        elif pos.startswith('JJ') and noun:  # Look for adjectives describing nouns
            keywords.append(f"{word} {noun}")
            noun = None
    return keywords

subset['Keywords'] = subset['Positive_Review'].apply(extract_keywords)

In [19]:
from nltk import word_tokenize, pos_tag, download
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Ensure required NLTK resources are downloaded
download('punkt')
download('averaged_perceptron_tagger')
download('wordnet')
download('stopwords')

# Review text
review_text = 

# Preprocessing: Lowercasing
review_text = review_text.lower()

# Tokenization
tokens = word_tokenize(review_text)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

# POS Tagging
tagged_tokens = pos_tag(lemmatized_tokens)

# Extracting Nouns and Adjectives
nouns_and_adjectives = [token for token, tag in tagged_tokens if tag.startswith('NN') or tag.startswith('JJ')]

# Applying TF-IDF on extracted nouns and adjectives
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform([' '.join(nouns_and_adjectives)])
feature_names = vectorizer.get_feature_names_out()

# Converting TF-IDF result to a readable format
tfidf_scores = tfidf_matrix.toarray().flatten()
tfidf_scores_dict = dict(zip(feature_names, tfidf_scores))

# Sorting words by their TF-IDF scores
sorted_tfidf = sorted(tfidf_scores_dict.items(), key=lambda item: item[1], reverse=True)

sorted_tfidf


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hakukazuho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hakukazuho/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hakukazuho/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hakukazuho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('staff', 0.5773502691896258),
 ('chatty', 0.2886751345948129),
 ('friendly', 0.2886751345948129),
 ('funny', 0.2886751345948129),
 ('group', 0.2886751345948129),
 ('helpful', 0.2886751345948129),
 ('hostile', 0.2886751345948129),
 ('hotel', 0.2886751345948129),
 ('young', 0.2886751345948129)]

#### Try n-grams

#### Try TF-IDF

#### Try Sentiment Analysis