In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, MWETokenizer

import re
import string

In [2]:
df = pd.read_csv('hotels2.csv', index_col=0)

In [3]:
df.drop(df.index[576], inplace=True)
df.reset_index(drop=True, inplace=True)

In [4]:
# Make a list of obervations where address is 'South San Francisco'
south_sf = []
for i in df.address:
    if 'South San Francisco' in i:
        south_sf.append(df[df.address == i].index.values)

In [5]:
# Delete the obervations where address is 'South San Francisco'
# in order to only keep address with 'San Francisco'
for i in south_sf:
    df.iloc[i, 1] = np.nan
    
df.dropna(inplace=True)

df.reset_index(drop=True, inplace=True)

In [6]:
# make all text lowercase
df['area_star_processed'] = df.area_star.apply(lambda x: x.lower())

# remove punctuation
df.area_star_processed = (df.area_star_processed
                     .apply(
                         lambda x: 
                         re.sub('[%s]' %re.escape(string.punctuation),'', x)))

# remove numbers
df.area_star_processed = (df.area_star_processed
                     .apply(lambda x: re.sub('\w*\d\w*', '', x)))

# remove '\n' and '–'
df.area_star_processed = (df.area_star_processed
                     .apply(lambda x: re.sub('[\n–]', '', x)))


# Remove terms that appear in many of the combined description
# because it is interfering with the topic modeling. They are
# showing up as topics rather than terms that are more 
# meaningful
def remove_terms(terms):
    for term in terms:
        df.area_star_processed = (df.area_star_processed
                                  .apply(lambda x: 
                                         x.replace(
                                             term, '')))


         
terms = ['hikingbiking trails', 'cultural highlights', 'area', 'worth', 
         'attractions', 'neighborhood', 'event', 'game', 'located', 'include',
         'looking', 'natural', 'beauty', 'enjoy', 'town', 'visit', 'agenda', 
         'hotels', 'wishing', 'near', 'nearby', 'whats', 'checking', 'kayaking',
         'shopping', 'consider', 'adventure', 'adventures', 'seek', 'activity',
         'activities', 'hotel', 'public', 'transportation', 'guests', 'night',
         'love', 'foot', 'location', 'discover', 'city', 'great', 'club', 'clubs',
         'going', 'golf', 'landmarks', 'explore', 'water', 'local', 'traveler', 
         'want', 'shop', 'minutes', 'inn', 'spend', 'appreciate', 'notable', 'happen', 
         'convenience', 'experience', 'stop', 'good', 'seen', 'metro', 'station',
         'music', 'beach', 'maritime', 'pacific', 'shrine', 'auditorium', 'theater',
         'bell', 'national', 'science', 'hollywood', 'county', 'california', 'center',
         'one star', 'two stars', 'three stars', 'four stars', 'five stars']

remove_terms(terms)

In [7]:
# get corpus of combined star rating and area description
area_star_corpus = []
for text in df.area_star_processed:
    area_star_corpus.append(text)

In [8]:
# tokenize compound attraction terms
from nltk.tokenize import word_tokenize, MWETokenizer

mwe_tokenizer = MWETokenizer([('new', 'york'), ('los', 'angeles'), 
('san', 'francisco'), ('oracle', 'park'), ('chase', 'center'),('millennium', 'park'),
('bay', 'ferry', 'building'), ('berkeley', 'marina'), ('westfield', 'san', 'francisco', 'centre'),
('ferry', 'building', 'marketplace'), ('one', 'star'), ('two', 'stars'), ('three', 'stars'), ('four', 'stars'),
('five', 'stars'), ('ghirardelli', 'square'), ('san', 'franciso', 'maritime', 'national', 'historical', 'park'),
('cable', 'car', 'museum'), ('fort', 'mason'), ('alcatraz', 'island'),
('aquarium', 'of', 'the', 'bay'), ('embarcadero', 'center'),
('muir', 'woods', 'national', 'monument'), ('financial', 'district'),
('lombard', 'street'), ('golden', 'gate', 'park'), ('hillsdale', 'shopping', 'center'),
('coyote', 'point', 'park'), ('half', 'moon', 'bay', 'parkside', 'aquatic', 'park'),
('casanova', 'park'), ('grace', 'cathedral'), ('painted', 'ladies'),
('burlingame', 'museum'), ('san', 'franciso', 'museum', 'of', 'modern', 'art'),
('california', 'academy', 'of', 'sciences'), ('santa', 'monica', 'pier'),
('hollywood', 'walk', 'of', 'fame'), ('the', 'grove'), ('universal', 'citywalk'),
('universal', 'studios'), ('venice', 'beach'), ('airport', 'proximity'),
('hermosa', 'beach', 'pier'), ('muscle', 'beach'),
('topanga', 'state', 'park'), ('dockweiler', 'state', 'beach'),
('petersen', 'automotive', 'museum'), ('el', 'corazon'),
('los', 'angeles', 'county', 'museum', 'of', 'art'), ('farmers', 'market'),
('melrose', 'avenue'), ('city', 'center'),
('nethercutt', 'museum'), ('rancho', 'camulos'), ('six', 'flags'),
('gibbon', 'conservation', 'center'), ('los', 'angeles', 'international', 'airport'),
('el', 'capitan', 'theatre'), ('wilson', 'park'), ('south', 'botanic', 'garden'),
('redondo', 'beach', 'pier'), ('del', 'amo', 'fashion', 'center'), 
('toyota', 'sports', 'center'), ('knots', 'berry', 'farm'), ('disneyland'),
('old', 'town', 'pasadena'), ('griffith', 'observatory'), ('rose', 'bowl', 'stadium'),
('warner', 'brothers', 'studio'), ('pantages', 'theatre'), ('hollywood', 'and', 'vine'),
('pershing', 'square'), ('los', 'angeles', 'state', 'historic', 'park'), 
('grammy', 'museum'), ('natural', 'history', 'museum'), ('pasadena', 'museum'),
('azusa', 'greens', 'coutnry', 'club'), ('los', 'angeles', 'equestrian', 'center'), 
('entertainment', 'district'), ('paramount', 'studios'), 
('hollywood', 'wax', 'museum'), ('city', 'center'), ('lumen', 'field'), 
('climate', 'pledge', 'arena'), ('showbox', 'sodo'), ('family', 'fun', 'center'),
('starfire', 'sports', 'complex'), ('renton', 'memorial', 'stadium'), 
('showare', 'center'), ('seattle', 'paramount', 'theatre'), ('avenue', 'theater'),
('seattle', 'great', 'wheel'), ('climate', 'pledge', 'arena'), ('hydroplane', 'and', 'raceboat', 'museum'),
('great', 'american', 'casino'), ('silver', 'dollar', 'casino'), ('central', 'park'),
('museum', 'of', 'modern', 'are'), ('radio', 'city', 'music', 'hall'), ('bryant', 'park'),
('central', 'park', 'zoo'), ('american', 'museum', 'of', 'natural', 'history'),
('washington', 'square', 'park'), ('battery', 'park'), ('barclays', 'center'),
('empire', 'state', 'building'), ('time', 'square'), ('herald', 'square'),
('madison', 'square', 'garden'), ('penn', 'station'), ('chelsea', 'market'),
('flatiorn', 'building'), ('meadowlands', 'sports', 'complex'), ('st', 'james', 'theatre'),
('river', 'north'), ('lake', 'michigan'), ('lakefront', 'trail'), ('skydeck', 'ledge'),
('navy', 'pier'), ('the', 'loop'), ('business', 'district'), ('chicago', 'childrens', 'museum'),
('state', 'street'), ('michigan', 'avenue'), ('field', 'museum', 'of', 'natural', 'history'),
('chicago', 'riverwalk'), ('harris', 'theater'), ('chicago', 'cultural', 'center'),
('frank', 'lloyd', 'wright', 'historic', 'district'), ('frank', 'lloyd', 'wright', 'home'),
('rainbow', 'falls', 'waterpark'), ('parkway', 'bank', 'park', 'entertainment', 'district'),
('kohl', 'childrens', 'museum'), ('fashion', 'outlets', 'of', 'chicago'),
('harlem', 'irving', 'plaza'), ('lincoln', 'park', 'conservatory'),
('peggy', 'notebaert', 'nature', 'museum'), ('promenade', 'bolingbrook'),
('sea', 'lion', 'aquatic', 'park'), ('lake', 'katherine', 'nature', 'center'),
('arboretum', 'of', 'south', 'barrington'), ('woodfield', 'mall'), ('santas', 'village'),
('schaumburg', 'medieval', 'times'), ('legoland', 'discovery', 'center'),
('raupp', 'memorial', 'museum'), ('grove', 'national', 'historic', 'landmark'),
('northbrook', 'sports', 'complex'), ('wagner', 'farm'), ('lilacia', 'park'),
('naper', 'settlement', 'museum'), ('cosley', 'zoo'), ('edge', 'ice', 'arena'),
('seatgeek', 'stadium'), ('beach', 'navy', 'pier'), ('cloud', 'gate'),
('grant', 'park'), ('millennium', 'park'), ('soldier', 'field'), ('seattle', 'aquarium'),
('tmobile', 'park'), ('moore', 'theater'), ('gum', 'wall'), ('lincoln', 'square'),
('marymoor', 'park'), ('husky', 'stadium'), ('bellevue', 'square'), ('coulon', 'memorial', 'park'),
('lakeridge', 'park'), ('seattle', 'art', 'museum'), ('original', 'starbucks'),
('climate', 'pledge', 'arena'), ('seatle', 'wheel'), ('washington', 'park', 'arboretum'),
('woodland', 'park', 'zoo'), ('renton', 'memorial', 'statium'), ('sofi', 'stadium'),
('pike', 'place', 'market'), ('space', 'needle'), ('staples', 'center')])

mwe_corpus = []
for i in area_star_corpus:
    mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(i))
    mwe_corpus.append(mwe_tokens)

In [9]:
df['area_star_tokenized'] = [i for i in mwe_corpus]

In [10]:
# remove punctuation
df.area_star_tokenized = (df.area_star_tokenized
                          .apply(lambda x: ' '.join(x)))

In [11]:
# get tokenized corpus
area_star_corpus_tokenized = []
for text in df.area_star_tokenized:
    area_star_corpus_tokenized.append(text)

In [12]:
# Tfidf Vectorizer to create a sparse matrix
area_star_cv = TfidfVectorizer(stop_words='english')
area_star_X = area_star_cv.fit_transform(area_star_corpus_tokenized).toarray()
area_star_cv_df = pd.DataFrame(area_star_X, columns=area_star_cv.get_feature_names())
area_star_cv_df

Unnamed: 0,abbot,abri,ac,academy,ace,acme,adagio,addamsmedill,addition,adler,...,youre,zachary,zelos,zephyr,zeppelin,zetta,ziplining,zoe,zoo,ändra
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236161,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295512,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.275828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.296071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [13]:
area_star_doc_word = area_star_cv.fit_transform(area_star_corpus_tokenized)

area_star_lsa = TruncatedSVD(n_components=5)
area_star_doc_topic = area_star_lsa.fit_transform(area_star_doc_word) 
area_star_lsa.explained_variance_ratio_

array([0.01887646, 0.04498003, 0.03229632, 0.02729887, 0.02242692])

In [14]:
area_star_topic_word = pd.DataFrame(area_star_lsa.components_.round(3),
               index = ['topic_0','topic_1','topic_2','topic_3','topic_4'],
             columns = area_star_cv.get_feature_names())
area_star_topic_word

Unnamed: 0,abbot,abri,ac,academy,ace,acme,adagio,addamsmedill,addition,adler,...,youre,zachary,zelos,zephyr,zeppelin,zetta,ziplining,zoe,zoo,ändra
topic_0,0.002,0.001,0.006,0.01,0.004,0.0,0.001,0.002,0.004,0.003,...,0.015,0.0,0.001,0.001,0.001,0.001,0.0,0.001,0.01,0.001
topic_1,0.002,0.003,-0.003,0.028,-0.0,0.0,0.003,0.002,0.011,0.002,...,0.006,0.0,0.002,0.003,0.003,0.004,0.0,0.003,0.013,0.001
topic_2,0.005,-0.001,0.004,-0.007,0.006,0.002,-0.001,0.009,-0.003,0.017,...,0.01,0.004,-0.001,-0.0,-0.0,-0.002,0.002,-0.001,0.034,0.002
topic_3,0.005,0.001,-0.001,-0.002,0.005,-0.001,-0.0,-0.002,-0.001,-0.006,...,-0.007,-0.002,-0.0,-0.0,-0.0,-0.001,-0.001,-0.0,-0.008,0.012
topic_4,0.027,0.001,0.004,0.002,-0.0,-0.0,-0.0,-0.002,0.001,-0.002,...,0.003,-0.002,-0.0,-0.0,0.0,-0.001,-0.0,-0.001,0.004,-0.003


In [15]:
# get top 10 topic words
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [16]:
display_topics(area_star_lsa, area_star_cv.get_feature_names(), 10)


Topic  0
new_york, st, san_francisco, mid, square, times, madison_square_garden, st_james_theatre, park, ing

Topic  1
san_francisco, oracle_park, historical, pier, ferry, muir, monument, woods, exploratorium, chase

Topic  2
chicago, pavilion, uic, wrigley, soldier_field, seattle, field, navy_pier, tower, the_loop

Topic  3
seattle, lumen_field, lake, union, climate_pledge_arena, el_corazon, pike_place_market, tmobile_park, wheel, westlake

Topic  4
los_angeles, staples, museum, santa_monica_pier, venice, la, pantages_theatre, belasco, wax, paramount_studios


In [17]:
Vt = pd.DataFrame(area_star_doc_topic.round(5),
             index = df.index,
             columns = ['topic_0','topic_1','topic_2','topic_3','topic_4'])

In [18]:
VT = area_star_doc_topic.round(5)

VT

array([[ 0.13923,  0.06427,  0.27214, -0.0541 , -0.03183],
       [ 0.12166,  0.06458,  0.34204, -0.06145, -0.01632],
       [ 0.12348,  0.04936,  0.39403, -0.11169, -0.01508],
       ...,
       [ 0.22245,  0.52312, -0.12618, -0.03792, -0.03477],
       [ 0.15057,  0.32544, -0.03065,  0.01772, -0.02837],
       [ 0.25872,  0.4439 , -0.1038 ,  0.00312,  0.02489]])

In [19]:
VT.shape

(995, 5)

In [20]:
# get top 5 recommendations
def get_recommendation(VT, hotelID, num_recom):
    VT_df = pd.DataFrame(VT)
    VT_df['city'] = df.city
    
    rec_list = []
    for hotel in range(VT.shape[0]):
        if VT_df.iloc[hotelID, VT_df.shape[1]-1] == VT_df.iloc[hotel, VT_df.shape[1]-1]:
            if hotel != hotelID:
                rec_list.append([hotel, np.dot(VT[hotelID], VT[hotel])])
    top_rec = [i[0] for i in sorted(rec_list, key=lambda x: x[1], reverse=True)]
    final_rec = top_rec[:num_recom]
    return df.iloc[final_rec][['hotel', 'address', 'star']]

In [21]:
get_recommendation(VT, 220, 5)

Unnamed: 0,hotel,address,star
223,Hyatt Place Seattle Downtown,"110 6th Ave N, Seattle, WA, 98109",three stars
217,Quality Inn and Suites Seattle Center Downtown,"618 John Street, Seattle, WA, 98109",three stars
197,"Staypineapple, Hotel FIVE, Downtown Seattle","2200 Fifth Avenue, Seattle, WA, 98121",three stars
222,Hilton Seattle,"1301 6th Avenue, Seattle, WA, 98101",three stars
202,"Holiday Inn Seattle Downtown, an IHG Hotel","211 Dexter Ave N, Seattle, WA, 98109",three stars


In [22]:
df.iloc[220, :3]

hotel                       Oakwood at Via 6
address    2121 6th Ave., Seattle, WA, 98121
star                             three stars
Name: 220, dtype: object

The above are great

In [25]:
get_recommendation(VT, 830, 5)

Unnamed: 0,hotel,address,star
909,Inn At Union Square,"440 Post St, San Francisco, CA, 94102",three stars
991,"The Ritz-Carlton, San Francisco","600 Stockton St, San Francisco, CA, 94108",five stars
897,Courtyard by Marriott San Francisco Union Square,"761 Post Street, San Francisco, CA, 94109",three stars
832,Union Square Plaza Hotel,"432 Geary St, San Francisco, CA, 94102",two stars
957,Le Meridien San Francisco,"333 Battery St, San Francisco, CA, 94111",four stars


In [26]:
df.iloc[830, :3]

hotel                                 Powell Place
address    730 Powell St, San Francisco, CA, 94108
star                                     two stars
Name: 830, dtype: object

These are bad reccommendations

Recommendation system with TruncatedSVD gave extreme results; one was excellent, and the other was way off.