# Diverse Selection Methods
When combining outputs from different recommendation models, we provide two methods: weighted average ranking and diverse selection. Weighted average ranking computes each restaurant's average rank across the models (with the MF model given higher priority because of its better performance), whereas diverse selection is an option for users who want to see a wider variety of recommended restaurants.

For the diverse selection method, we use restaurant information (description, categories, etc.) and TF-IDF scores to calculate the similarity among the recommended restaurants. We first select the restaurant with the highest average similarity to all other restaurants (i.e., the one most representative of the restaurants recommended by the different models) as the starting point, and then iteratively pick the restaurant that is least similar to the one(s) already selected.

In [1]:
import pandas as pd
import numpy as np
import ast
import re
import random

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages requested by this notebook, in the same order
# as before. The captured output shows that packages already present are
# reported as up to date.
for nltk_package in (
    'punkt_tab',
    'averaged_perceptron_tagger',
    'wordnet',
    'omw-1.4',
):
    nltk.download(nltk_package)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/liusiyi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/liusiyi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Load the restaurant table from a CSV file; the path is relative, so it is
# resolved against the current working directory.
restaurant_w_train_ave_stars_df = pd.read_csv('restaurant_w_train_ave_stars.csv')

In [3]:
def convert_list_str(text):
    """Flatten a list-like cell value into one space-separated string.

    Args:
        text: A list, a NumPy array, a string (optionally the repr of a
            Python literal such as ``"['a', 'b']"``), a missing value
            (``None``/``NaN``), or any other scalar.

    Returns:
        str: The elements joined by single spaces for lists/arrays and for
        strings that parse to a list; ``str(value)`` for strings that parse
        to another literal; the string unchanged when it is not a Python
        literal; ``""`` for missing values and empty lists; ``str(text)``
        for anything else.
    """
    # Containers must be handled BEFORE the missing-value check: pd.isna()
    # returns an element-wise array for lists/arrays, and using that array
    # in a boolean context raises ValueError.
    if isinstance(text, np.ndarray):
        text = text.tolist()
    if isinstance(text, list):
        return ' '.join(map(str, text))

    if isinstance(text, str):
        try:
            items = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            # Not a Python literal (plain text, empty string, ...): keep as is.
            return text
        if isinstance(items, list):
            return ' '.join(map(str, items))
        return str(items)

    # Only non-list, non-string values reach this point.
    if pd.isna(text):
        return ""
    return str(text)

def create_full_description(df, text_features, list_features):
    """Build a single free-text ``description`` column from feature columns.

    The frame is modified in place (feature columns are cleaned and a
    ``description`` column is added) and is also returned.

    Args:
        df: DataFrame that contains every column named in ``text_features``
            and ``list_features``, plus ``categories`` and
            ``editorial_summary``. A ``Music`` column is cleaned when present.
        text_features: Names of columns holding plain text.
        list_features: Names of columns holding lists or string
            representations of lists; each value is flattened with
            ``convert_list_str``.

    Returns:
        The same DataFrame, with ``description`` set to the feature columns
        joined by spaces (text features first, then list features), with
        runs of whitespace collapsed and the ends stripped.
    """
    features = text_features + list_features
    df[features] = df[features].fillna('')

    # Commas are treated as separators, not content, so they become spaces.
    df['categories'] = df['categories'].str.replace(',', ' ', regex=False)

    # A blank (empty or whitespace-only) editorial summary falls back to the
    # restaurant's categories.
    blank_summary = df['editorial_summary'].str.strip() == ''
    df.loc[blank_summary, 'editorial_summary'] = df['categories']

    # Strip the literal words 'False'/'True' from Music. The column is
    # optional, so a frame without it no longer raises KeyError.
    if 'Music' in df.columns:
        df['Music'] = (
            df['Music']
            .str.replace('False', '', regex=False)
            .str.replace('True', '', regex=False)
        )

    # Convert list-formatted columns to plain text.
    for col in list_features:
        df[col] = df[col].apply(convert_list_str)

    # Remove commas from every feature. regex=False keeps this a literal
    # replacement regardless of the pandas version's default.
    for col in features:
        df[col] = df[col].str.replace(',', ' ', regex=False)

    # Combine everything into one description string per row.
    df['description'] = (
        df[features]
        .agg(' '.join, axis=1)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    return df

In [4]:
# Columns that feed the description: list-formatted ones are flattened by
# create_full_description, the rest are used as plain text.
list_features = ['GoodForMeal', 'BusinessParking']
text_features = [
    'categories',
    'editorial_summary',
    'RestaurantsAttire',
    'Ambience',
    'NoiseLevel',
    'Music',
    'city',
]

# create_full_description modifies its input, so hand it a copy and leave
# the loaded frame untouched.
item_df = create_full_description(
    restaurant_w_train_ave_stars_df.copy(),
    text_features,
    list_features,
)

# Lift the column-width limit so the previewed descriptions are not truncated.
pd.set_option('display.max_colwidth', None)
item_df[['business_id', 'description']].head(5)

Unnamed: 0,business_id,description
0,--epgcb7xHGuJ-4PUeSLAw,Restaurants Food Bagels Sandwiches Breakfast & Brunch Counter-serve chain specializing in NYC-style bagels & coffee plus salads soups & sandwiches. Willow Grove breakfast lot
1,-0FX23yAacC4bbLaGPvyxw,American (Traditional) Restaurants American (Traditional) Restaurants classy average Newtown dinner dessert lunch lot valet
2,-0TffRSXXIlBYVbb5AwfTg,Cocktail Bars Food Delivery Services Nightlife Breakfast & Brunch Food Bars Event Planning & Services Caterers Restaurants Indian Modern & traditional Indian fare plus a tasting menu vegetarian eats & brunch served in sleek digs. casual classy average Philadelphia dinner lunch brunch garage street validated
3,-1B9pP_CrRBJYPICE5WbRA,Asian Fusion Restaurants American (New) Thai Szechuan Chinese Noodles fried rice & other classic Asian mains offered in a streamlined restaurant with a bar. casual trendy classy average Philadelphia lunch dinner street
4,-3725FZiIIYdwQtM4MKEIA,Pizza Sandwiches Chicken Wings Restaurants Delivery/carryout chain offering a wide range of pizzas & a variety of other dishes & sides. casual average Glen Mills


In [5]:
# Tokens made only of ASCII letters; everything else is discarded.
_ALPHABETIC_TOKEN = re.compile('^[a-zA-Z]+$')

# Words kept even when they appear in the stop-word list. Entries containing
# an apostrophe can never match, because non-alphabetic tokens are dropped
# before this set is consulted; they are retained from the original list.
_NEGATION_WORDS = frozenset([
    'not', 'no', 'never', "n't", "isn't", "aren't", "wasn't", "weren't",
    "hasn't", "haven't", "doesn't", "don't", "didn't", "won't", "wouldn't",
    "shan't", "shouldn't", "mustn't", "can't", "cannot",
])

# Filled on first use by _load_text_resources().
_TEXT_RESOURCES = None


def _load_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only once."""
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        try:
            stop_words = set(stopwords.words('english'))
        except LookupError:
            # Corpus not installed yet: download it once, then retry.
            nltk.download('stopwords')
            stop_words = set(stopwords.words('english'))
        _TEXT_RESOURCES = (stop_words, WordNetLemmatizer())
    return _TEXT_RESOURCES


def process_text(text):
    """Normalise a description for TF-IDF vectorisation.

    The text is lower-cased and tokenised, non-alphabetic tokens are dropped,
    stop words are removed (except the negation words listed in
    ``_NEGATION_WORDS``), and the remaining tokens are lemmatised.

    The stop-word corpus and the lemmatizer are loaded once and reused, so
    calling this function per row no longer triggers a download attempt and
    a resource rebuild on every call.

    Args:
        text: The raw description string.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    stop_words, lemmatizer = _load_text_resources()

    tokens = word_tokenize(text.lower())
    kept_tokens = [
        word for word in tokens
        if _ALPHABETIC_TOKEN.match(word)
        and (word not in stop_words or word in _NEGATION_WORDS)
    ]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_tokens)

In [6]:
# Replace each raw description with its processed form (see process_text).
item_df['description'] = item_df['description'].apply(process_text)

# Vectorizer with default settings.
tfidf_vectorizer = TfidfVectorizer()

item_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liusiyi/nltk_data...
[nltk_data]   Package stopwords is already up-t

Unnamed: 0,business_id,city,postal_code,categories,yelp_rating,name,RestaurantsTakeOut,BusinessAcceptsCreditCards,RestaurantsDelivery,price_level,...,good_for_watching_sports,menu_for_children,parking_options,editorial_summary_language,weighted_ave_stars,latitude,longitude,review_count,hours,description
0,--epgcb7xHGuJ-4PUeSLAw,Willow Grove,19090,Restaurants Food Bagels Sandwiches Breakfast & Brunch,3.0,Manhattan Bagel,True,True,True,1.0,...,False,False,"['freeParkingLot', 'freeStreetParking']",en,2.230112,40.145054,-75.116293,34,"{'Monday': '7:0-14:0', 'Tuesday': '7:0-14:0', 'Wednesday': '7:0-14:0', 'Thursday': '7:0-14:0', 'Friday': '7:0-14:0', 'Saturday': '7:0-14:0', 'Sunday': '7:0-14:0'}",restaurant food bagel sandwich breakfast brunch chain specializing bagel coffee plus salad soup sandwich willow grove breakfast lot
1,-0FX23yAacC4bbLaGPvyxw,Newtown,18940,American (Traditional) Restaurants,3.5,The Grey Stone Fine Food and Spirits,True,True,True,3.0,...,False,True,"['freeParkingLot', 'freeStreetParking', 'valetParking']",,3.212927,40.256082,-74.916976,155,"{'Tuesday': '12:0-20:0', 'Wednesday': '12:0-20:0', 'Thursday': '12:0-20:0', 'Friday': '12:0-21:0', 'Saturday': '12:0-21:0', 'Sunday': '12:0-20:0'}",american traditional restaurant american traditional restaurant classy average newtown dinner dessert lunch lot valet
2,-0TffRSXXIlBYVbb5AwfTg,Philadelphia,19107,Cocktail Bars Food Delivery Services Nightlife Breakfast & Brunch Food Bars Event Planning & Services Caterers Restaurants Indian,4.5,IndeBlue Modern Indian Food & Spirits,True,True,True,2.0,...,False,False,"['freeParkingLot', 'freeStreetParking', 'no freeGarageParking']",en,3.563777,39.948508,-75.161969,1097,"{'Monday': '0:0-0:0', 'Tuesday': '16:0-22:0', 'Wednesday': '16:0-22:0', 'Thursday': '16:0-22:0', 'Friday': '16:0-23:0', 'Saturday': '16:0-23:0', 'Sunday': '16:0-22:0'}",cocktail bar food delivery service nightlife breakfast brunch food bar event planning service caterer restaurant indian modern traditional indian fare plus tasting menu vegetarian eats brunch served sleek dig casual classy average philadelphia dinner lunch brunch garage street validated
3,-1B9pP_CrRBJYPICE5WbRA,Philadelphia,19107,Asian Fusion Restaurants American (New) Thai Szechuan Chinese,4.0,Spice 28,True,True,True,2.0,...,False,False,[],en,2.885079,39.950352,-75.161583,822,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:30', 'Wednesday': '11:0-22:30', 'Thursday': '11:0-22:30', 'Friday': '11:0-23:30', 'Saturday': '12:0-23:30', 'Sunday': '12:0-22:0'}",asian fusion restaurant american new thai szechuan chinese noodle fried rice classic asian main offered streamlined restaurant bar casual trendy classy average philadelphia lunch dinner street
4,-3725FZiIIYdwQtM4MKEIA,Glen Mills,19342,Pizza Sandwiches Chicken Wings Restaurants,1.5,Domino's Pizza,True,True,True,1.0,...,False,False,"['freeParkingLot', 'freeStreetParking']",en,1.348674,39.883915,-75.536518,20,"{'Monday': '10:30-0:0', 'Tuesday': '10:30-0:0', 'Wednesday': '10:30-0:0', 'Thursday': '10:30-0:0', 'Friday': '10:30-1:0', 'Saturday': '10:30-1:0', 'Sunday': '10:30-0:0'}",pizza sandwich chicken wing restaurant chain offering wide range pizza variety dish side casual average glen mill
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8064,zwd4dyQ5ovnjVojWfAuhMw,Philadelphia,19106,Belgian Food Specialty Food Restaurants Ethnic Food Sandwiches,4.0,European Republic,True,True,True,1.0,...,False,False,"['paidStreetParking', 'no valetParking']",en,2.911927,39.948649,-75.144631,239,"{'Monday': '11:30-22:0', 'Tuesday': '11:30-22:0', 'Wednesday': '11:30-22:0', 'Thursday': '11:30-22:0', 'Friday': '11:30-23:0', 'Saturday': '12:0-23:0', 'Sunday': '12:0-21:0'}",belgian food specialty food restaurant ethnic food sandwich casual spot sandwich wrap belgian fry served choice dipping sauce casual casual average philadelphia lunch dinner street
8065,zxRmQ_FWVowh8rlzLCSURQ,Philadelphia,19111,Pizza Restaurants,3.0,Verree Express Pizza,True,True,True,2.0,...,False,False,['freeStreetParking'],,2.626504,40.072409,-75.076047,24,,pizza restaurant pizza restaurant casual philadelphia
8066,zxY4DgtXsVHihSUpsmwamg,Philadelphia,19148,Food Do-It-Yourself Food Restaurants Italian,4.5,P & S Ravioli,True,,False,2.0,...,False,False,['freeStreetParking'],,3.308876,39.928473,-75.161983,6,,food food restaurant italian food food restaurant italian casual average philadelphia
8067,zzXRdzrVhfNWPHD2MeyWeA,Lansdale,19446,Restaurants Coffee & Tea Food Chicken Wings Gas Stations Automotive,2.0,Royal Farms,True,True,True,,...,False,False,"['freeParkingLot', 'freeStreetParking']",en,1.716347,40.245754,-75.342402,14,,restaurant coffee tea food chicken wing gas station automotive chain serving breakfast sandwich fried chicken deli item lansdale lot


In [7]:
# NOTE(review): `tfidf` is configured with English stop-word removal but is
# never used in the code shown here. The matrix below is produced by
# `tfidf_vectorizer` (default settings, created in an earlier cell), so
# stop_words='english' has no effect on `tfidf_scores`. Confirm which
# vectorizer is intended before changing either one.
tfidf = TfidfVectorizer(stop_words='english')
# Fit on the processed descriptions and transform them in one step.
tfidf_scores = tfidf_vectorizer.fit_transform(item_df['description'])
print('tf-idf shape:', tfidf_scores.shape)

tf-idf shape: (8069, 2619)


In [8]:
from scipy import sparse

# Write the TF-IDF matrix to "ds_matrix.npz"; the path is relative, so the
# file lands in the current working directory.
sparse.save_npz("ds_matrix.npz", tfidf_scores)