In [13]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD, NMF
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize, MWETokenizer



import re
import string

In [2]:
df = pd.read_csv('hotels2.csv', index_col=0)

In [3]:
df.drop(df.index[576], inplace=True)
df.reset_index(drop=True, inplace=True)

In [4]:
south_sf = []
for i in df.address:
    if 'South San Francisco' in i:
        south_sf.append(df[df.address == i].index.values)

In [5]:
for i in south_sf:
    df.iloc[i, 1] = np.nan
    
df.dropna(inplace=True)

df.reset_index(drop=True, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   hotel           995 non-null    object
 1   address         995 non-null    object
 2   star            995 non-null    object
 3   text            995 non-null    object
 4   city            995 non-null    object
 5   area_star       995 non-null    object
 6   total_descript  995 non-null    object
dtypes: object(7)
memory usage: 54.5+ KB


In [9]:
# make all text lowercase
df['area_star_processed'] = df.area_star.apply(lambda x: x.lower())

# remove punctuation
df.area_star_processed = (df.area_star_processed
                     .apply(
                         lambda x: 
                         re.sub('[%s]' %re.escape(string.punctuation),'', x)))

# remove numbers
df.area_star_processed = (df.area_star_processed
                     .apply(lambda x: re.sub('\w*\d\w*', '', x)))

# remove '\n' and '–'
df.area_star_processed = (df.area_star_processed
                     .apply(lambda x: re.sub('[\n–]', '', x)))

In [10]:
# get corpus
area_star_corpus = []
for text in df.area_star_processed:
    area_star_corpus.append(text)

area_star_corpus

['the st clair hotel  magnificent mile is located in near north side a neighborhood in chicago and is near a metro station and near the beach navy pier and cloud gate are notable landmarks and the areas natural beauty can be seen at grant park and millennium park check out an event or a game at soldier field and consider making time for lincoln park zoo a top attraction not to be missed guests appreciate the hotels central location  two stars',
 'best western grant park hotel is located in the loop a neighborhood in chicago and is near a metro station willis tower and navy pier are notable landmarks and the areas natural beauty can be seen at grant park and millennium park field museum of natural history and john g shedd aquarium are also worth visiting kayaking windsurfing and sailing offer great chances to get out on the surrounding water or you can seek out an adventure with hikingbiking trails nearby guests love the hotels location for the sightseeing its also convenient to public 

In [11]:
# tfidf vectorizer
area_star_cv = TfidfVectorizer(stop_words='english')
area_star_X = area_star_cv.fit_transform(area_star_corpus).toarray()
area_star_cv_df = pd.DataFrame(area_star_X, columns=area_star_cv.get_feature_names())
area_star_cv_df

Unnamed: 0,abbot,abri,ac,academy,ace,acme,activities,activity,adagio,addamsmedill,...,youre,zachary,zelos,zephyr,zeppelin,zetta,ziplining,zoe,zoo,ändra
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122569,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.098755,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.140430,0.0,0.0,...,0.227726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
991,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
992,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.237455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
993,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [15]:
# nmf
area_star_doc_word = area_star_cv.fit_transform(area_star_corpus)

nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(area_star_doc_word) 
doc_topic.shape



(995, 5)

In [16]:
area_star_topic_word = pd.DataFrame(nmf_model.components_.round(3),
               index = ['topic_0','topic_1','topic_2','topic_3','topic_4'],
             columns = area_star_cv.get_feature_names())
area_star_topic_word

Unnamed: 0,abbot,abri,ac,academy,ace,acme,activities,activity,adagio,addamsmedill,...,youre,zachary,zelos,zephyr,zeppelin,zetta,ziplining,zoe,zoo,ändra
topic_0,0.0,0.0,0.012,0.0,0.006,0.0,0.15,0.153,0.0,0.0,...,0.024,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.305,0.0
topic_1,0.0,0.007,0.0,0.055,0.0,0.0,0.104,0.089,0.007,0.0,...,0.016,0.0,0.005,0.006,0.007,0.008,0.0,0.006,0.0,0.0
topic_2,0.0,0.0,0.008,0.0,0.008,0.006,0.053,0.0,0.001,0.017,...,0.027,0.008,0.0,0.001,0.0,0.0,0.006,0.002,0.039,0.0
topic_3,0.04,0.0,0.009,0.007,0.007,0.0,0.122,0.047,0.0,0.0,...,0.006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0
topic_4,0.0,0.0,0.0,0.0,0.01,0.0,0.134,0.094,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.073,0.017


In [17]:
# get top n topic words
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [18]:
display_topics(nmf_model, area_star_cv.get_feature_names(), 10)


Topic  0
st, square, new, york, station, midtown, minutes, madison, central, park

Topic  1
san, francisco, national, downtown, oracle, park, historical, maritime, great, ferry

Topic  2
chicago, field, beach, north, water, near, michigan, loop, pavilion, soldier

Topic  3
los, angeles, hollywood, studios, center, museum, monica, santa, staples, beach

Topic  4
seattle, waterfront, lumen, center, lake, union, climate, pledge, place, looking


In [21]:
Vt = pd.DataFrame(doc_topic.round(5),
             index = df.index,
             columns = ['topic_0','topic_1','topic_2','topic_3','topic_4'])

In [22]:
VT = doc_topic.round(5)

In [23]:
VT

array([[0.03103, 0.01319, 0.14421, 0.00686, 0.00027],
       [0.03557, 0.00977, 0.15193, 0.00222, 0.00461],
       [0.00394, 0.00183, 0.17456, 0.0028 , 0.     ],
       ...,
       [0.00345, 0.24512, 0.     , 0.     , 0.     ],
       [0.00123, 0.15946, 0.02004, 0.00526, 0.01443],
       [0.     , 0.26968, 0.     , 0.00069, 0.00857]])

In [24]:
VT.shape

(995, 5)

In [25]:
Vt

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4
0,0.03103,0.01319,0.14421,0.00686,0.00027
1,0.03557,0.00977,0.15193,0.00222,0.00461
2,0.00394,0.00183,0.17456,0.00280,0.00000
3,0.00000,0.00000,0.23217,0.01437,0.00000
4,0.03206,0.00316,0.16644,0.00558,0.00110
...,...,...,...,...,...
990,0.00000,0.24002,0.00000,0.00000,0.00000
991,0.00000,0.31796,0.00000,0.00000,0.00000
992,0.00345,0.24512,0.00000,0.00000,0.00000
993,0.00123,0.15946,0.02004,0.00526,0.01443


In [26]:
def get_recom(VT, hotelID, num_recom):
    VT_df = pd.DataFrame(VT)
    VT_df['city'] = df.city
    
    rec_list = []
    for hotel in range(VT.shape[0]):
        if VT_df.iloc[hotelID, VT_df.shape[1]-1] == VT_df.iloc[hotel, VT_df.shape[1]-1]:
            if hotel != hotelID:
                rec_list.append([hotel, np.dot(VT[hotelID], VT[hotel])])
    top_rec = [i[0] for i in sorted(rec_list, key=lambda x: x[1], reverse=True)]
    final_rec = top_rec[:num_recom]
    return df.iloc[final_rec][['hotel', 'address', 'star']]

## Make some recommendations

In [27]:
get_recom(VT, 220, 5)

Unnamed: 0,hotel,address,star
217,Quality Inn and Suites Seattle Center Downtown,"618 John Street, Seattle, WA, 98109",three stars
223,Hyatt Place Seattle Downtown,"110 6th Ave N, Seattle, WA, 98109",three stars
197,"Staypineapple, Hotel FIVE, Downtown Seattle","2200 Fifth Avenue, Seattle, WA, 98121",three stars
198,"Staypineapple, The Maxwell Hotel, Seattle Cent...","300 Roy St, Seattle, WA, 98109",three stars
233,The Alexis Royal Sonesta Hotel Seattle,"1007 1st Ave, Seattle, WA, 98104",four stars


In [28]:
df.iloc[220, :3]

hotel                       Oakwood at Via 6
address    2121 6th Ave., Seattle, WA, 98121
star                             three stars
Name: 220, dtype: object

The above recommendation is great.  Just a little bit off with the last one on star rating.

In [29]:
get_recom(VT, 750, 5)

Unnamed: 0,hotel,address,star
668,Wilshire Crest Hotel Los Angeles,"6301 Orange St, Los Angeles, CA, 90048",two stars
656,"Holiday Inn Express West Los Angeles, an IHG H...","11250 Santa Monica Blvd, Los Angeles, CA, 90025",two stars
698,Good Nite Inn West Los Angeles-Century City,"10740 Santa Monica Blvd, Los Angeles, CA, 90025",two stars
695,"Antonio Hotel - Downtown Los Angeles, near Hol...","229 North Soto St., Los Angeles, CA, 90033",two stars
789,"Hotel Indigo Los Angeles Downtown, an IHG Hotel","899 Francisco Street, Los Angeles, CA, 90015",four stars


In [30]:
df.iloc[750, :3]

hotel                             Garden Suite Hotel
address    681 S Western Ave, Los Angeles, CA, 90005
star                                     three stars
Name: 750, dtype: object

2 and 5 not good.
1, 3, and 4, so so 

In [31]:
get_recom(VT, 700, 5)

Unnamed: 0,hotel,address,star
668,Wilshire Crest Hotel Los Angeles,"6301 Orange St, Los Angeles, CA, 90048",two stars
656,"Holiday Inn Express West Los Angeles, an IHG H...","11250 Santa Monica Blvd, Los Angeles, CA, 90025",two stars
698,Good Nite Inn West Los Angeles-Century City,"10740 Santa Monica Blvd, Los Angeles, CA, 90025",two stars
695,"Antonio Hotel - Downtown Los Angeles, near Hol...","229 North Soto St., Los Angeles, CA, 90033",two stars
789,"Hotel Indigo Los Angeles Downtown, an IHG Hotel","899 Francisco Street, Los Angeles, CA, 90015",four stars


In [32]:
df.iloc[700, :3]

hotel                                 DTLA Hotel
address    1123 W 7th St, Los Angeles, CA, 90017
star                                   two stars
Name: 700, dtype: object

1, 4, 5 bad.
2, 3 nah

In [45]:
get_recom(VT, 620, 5)

Unnamed: 0,hotel,address,star
495,The James New York - NoMad,"22 East 29 Street, New York, NY, 10016",four stars
496,citizenM New York Times Square,"218 West 50th Street, New York, NY, 10019",four stars
472,W New York - Times Square,"1567 Broadway At 47th St, New York, NY, 10036",four stars
305,Hilton Garden Inn Times Square,"790 Eighth Ave., New York, NY, 10019",three stars
431,Hotel Five44,"544 West 48 St, New York, NY, 10036",three stars


In [46]:
df.iloc[620, :3]

hotel                                   Royalton Park Avenue
address    420 Park Avenue South, 29th Street, New York, ...
star                                              five stars
Name: 620, dtype: object

not good

In [47]:
get_recom(VT, 900, 5)

Unnamed: 0,hotel,address,star
991,"The Ritz-Carlton, San Francisco","600 Stockton St, San Francisco, CA, 94108",five stars
814,The Urban,"507 Bush Street, San Francisco, CA, 94108",two stars
966,"Hotel Zetta San Francisco, a Viceroy Urban Ret...","55 5th St, San Francisco, CA, 94103",four stars
811,Winsor Hotel,"20 6th Street, San Francisco, CA, 94103",one star
978,The Marker San Francisco,"501 Geary St, San Francisco, CA, 94102",four stars


In [48]:
df.iloc[900, :3]

hotel      Holiday Inn Express and Suites Fisherman's Wha...
address             550 N Point St, San Francisco, CA, 94133
star                                             three stars
Name: 900, dtype: object

1, 2, 5 so so. 
4 so so, but star rating is way off.
3 bad

In [49]:
get_recom(VT, 960, 5)

Unnamed: 0,hotel,address,star
991,"The Ritz-Carlton, San Francisco","600 Stockton St, San Francisco, CA, 94108",five stars
814,The Urban,"507 Bush Street, San Francisco, CA, 94108",two stars
966,"Hotel Zetta San Francisco, a Viceroy Urban Ret...","55 5th St, San Francisco, CA, 94103",four stars
811,Winsor Hotel,"20 6th Street, San Francisco, CA, 94103",one star
978,The Marker San Francisco,"501 Geary St, San Francisco, CA, 94102",four stars


In [51]:
df.iloc[960, :3]

hotel      Hotel Zeppelin San Francisco, a Viceroy Urban ...
address                545 Post St, San Francisco, CA, 94102
star                                              four stars
Name: 960, dtype: object

1, 3, 5 Ok.
4 ok, but star rating is way off.
2 so so, but star rating is off

In [61]:
get_recom(VT, 85, 5)

Unnamed: 0,hotel,address,star
109,Omni Chicago Suites - Magnificent Mile,"676 N Michigan Ave, Chicago, IL, 60611",four stars
110,Swissotel - Chicago,"323 E Wacker Dr, Chicago, IL, 60601",four stars
120,Virgin Hotels Chicago,"203 N Wabash Ave, Chicago, IL, 60601",four stars
101,Hyatt Centric Chicago Magnificent Mile,"633 N Saint Clair St, Chicago, IL, 60611",four stars
168,The Peninsula Chicago,"108 East Superior Street, Chicago, IL, 60611",five stars


In [60]:
df.iloc[85, :3]

hotel                                     Pendry Chicago
address    230 North Michigan Avenue, Chicago, IL, 60601
star                                          four stars
Name: 85, dtype: object

2 , 3 ok.
1 , 5 so so. 
4 nah

In [62]:
get_recom(VT, 4, 5)

Unnamed: 0,hotel,address,star
109,Omni Chicago Suites - Magnificent Mile,"676 N Michigan Ave, Chicago, IL, 60611",four stars
110,Swissotel - Chicago,"323 E Wacker Dr, Chicago, IL, 60601",four stars
120,Virgin Hotels Chicago,"203 N Wabash Ave, Chicago, IL, 60601",four stars
101,Hyatt Centric Chicago Magnificent Mile,"633 N Saint Clair St, Chicago, IL, 60611",four stars
168,The Peninsula Chicago,"108 East Superior Street, Chicago, IL, 60611",five stars


In [63]:
df.iloc[4, :3]

hotel         Best Western River North Hotel
address    125 W Ohio St, Chicago, IL, 60654
star                               two stars
Name: 4, dtype: object

1, 2 ok.
3, 4, 5 so so. 
but the star rating is off for all of them.