In [183]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import warnings

In [184]:
# Mute every warning category for the remainder of the kernel session.
warnings.simplefilter('ignore')

In [185]:
# Load the source table from a parquet file resolved relative to the working directory.
data = pd.read_parquet('data.parquet')

In [186]:
data

Unnamed: 0,business name,state,city,latitude,longitude,category,avg_rating,sentiment analysis,year,month,user_id,review_id,business_id,state_id,city_id,category_id
0,Coloma Cottages,California,Coloma,38.799352,-120.885765,Resort hotel,5.0,2,2021,6,1,1,1,1,1,1
1,Coloma Cottages,California,Coloma,38.799352,-120.885765,Hotel,5.0,2,2021,6,1,1,1,1,1,2
2,Coloma Cottages,California,Coloma,38.799352,-120.885765,Indoor lodging,5.0,2,2021,6,1,1,1,1,1,3
3,Coloma Cottages,California,Coloma,38.799352,-120.885765,Inn,5.0,2,2021,6,1,1,1,1,1,4
4,Coloma Cottages,California,Coloma,38.799352,-120.885765,Lodge,5.0,2,2021,6,1,1,1,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84819,Hotel Indigo Nashville,Nevada,Nashville,36.152989,-86.795709,Hotel,3.0,2,2009,4,21122,68896,1571,3,521,2
84820,Hotel Indigo Nashville,Nevada,Nashville,36.152989,-86.795709,Hotel,3.0,0,2010,9,64188,68897,1571,3,521,2
84821,Hotel Indigo Nashville,Nevada,Nashville,36.152989,-86.795709,Hotel,3.0,2,2012,1,64189,68898,1571,3,521,2
84822,Hotel Indigo Nashville,Nevada,Nashville,36.152989,-86.795709,Hotel,3.0,2,2008,10,64190,68899,1571,3,521,2


In [187]:
# Narrow the frame to the business attributes that the later cells rely on.
kept_columns = [
    'business name', 'business_id', 'state', 'city',
    'latitude', 'longitude', 'category', 'avg_rating',
]
data = data.loc[:, kept_columns]

In [188]:
data

Unnamed: 0,business name,business_id,state,city,latitude,longitude,category,avg_rating
0,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel,5.0
1,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Hotel,5.0
2,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Indoor lodging,5.0
3,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Inn,5.0
4,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Lodge,5.0
...,...,...,...,...,...,...,...,...
84819,Hotel Indigo Nashville,1571,Nevada,Nashville,36.152989,-86.795709,Hotel,3.0
84820,Hotel Indigo Nashville,1571,Nevada,Nashville,36.152989,-86.795709,Hotel,3.0
84821,Hotel Indigo Nashville,1571,Nevada,Nashville,36.152989,-86.795709,Hotel,3.0
84822,Hotel Indigo Nashville,1571,Nevada,Nashville,36.152989,-86.795709,Hotel,3.0


In [189]:
# Discard rows whose category is exactly 'Hotel'; every other category is kept.
is_plain_hotel = data['category'].eq('Hotel')
data = data.loc[~is_plain_hotel]

In [190]:
data

Unnamed: 0,business name,business_id,state,city,latitude,longitude,category,avg_rating
0,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel,5.0
2,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Indoor lodging,5.0
3,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Inn,5.0
4,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Lodge,5.0
5,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Motel,5.0
...,...,...,...,...,...,...,...,...
84624,Santa Barbara Tourist Hostel,1567,Florida,Santa Barbara,34.413094,-119.692407,Hostel,2.2
84626,Santa Barbara Tourist Hostel,1567,Florida,Santa Barbara,34.413094,-119.692407,Hostel,2.2
84628,Santa Barbara Tourist Hostel,1567,Florida,Santa Barbara,34.413094,-119.692407,Hostel,2.2
84630,Santa Barbara Tourist Hostel,1567,Florida,Santa Barbara,34.413094,-119.692407,Hostel,2.2


In [191]:
# Renumber the rows from 0 after filtering; drop=True discards the old index
# instead of keeping it as a column.
data = data.reset_index(drop=True)

In [192]:
data

Unnamed: 0,business name,business_id,state,city,latitude,longitude,category,avg_rating
0,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel,5.0
1,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Indoor lodging,5.0
2,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Inn,5.0
3,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Lodge,5.0
4,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Motel,5.0
...,...,...,...,...,...,...,...,...
29709,Santa Barbara Tourist Hostel,1567,Florida,Santa Barbara,34.413094,-119.692407,Hostel,2.2
29710,Santa Barbara Tourist Hostel,1567,Florida,Santa Barbara,34.413094,-119.692407,Hostel,2.2
29711,Santa Barbara Tourist Hostel,1567,Florida,Santa Barbara,34.413094,-119.692407,Hostel,2.2
29712,Santa Barbara Tourist Hostel,1567,Florida,Santa Barbara,34.413094,-119.692407,Hostel,2.2


In [193]:
# Keep one row per (business, category) pair. The key is business_id rather
# than the display name: two different businesses can share a name, and keying
# on the name would silently drop the later one's rows whenever their
# categories overlap.
data = data.drop_duplicates(subset=['business_id', 'category'])

In [194]:
data

Unnamed: 0,business name,business_id,state,city,latitude,longitude,category,avg_rating
0,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel,5.0
1,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Indoor lodging,5.0
2,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Inn,5.0
3,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Lodge,5.0
4,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Motel,5.0
...,...,...,...,...,...,...,...,...
29597,Mira Vista Resort,1525,Florida,Tucson,32.341444,-111.116306,Resort hotel,4.5
29610,Lehmann House Bed & Breakfast,1533,Nevada,Saint Louis,38.618421,-90.215006,Bed & Breakfast,4.4
29619,Vandyke Bed & Beverage,1545,Florida,Nashville,36.177200,-86.749783,Bed & Breakfast,4.0
29648,Lions Inn,1561,Florida,New Orleans,29.963970,-90.052454,Bed & Breakfast,4.5


In [195]:
# Collapse each business's categories into one space-separated string that is
# repeated on all of its rows. Grouping uses business_id so that different
# businesses sharing a name keep separate category lists, and assign() builds
# a new frame instead of writing a column into the frame returned by
# drop_duplicates (which can trigger SettingWithCopyWarning).
data = data.assign(
    category=data.groupby('business_id')['category'].transform(
        lambda categories: ' '.join(categories)
    )
)

In [196]:
data

Unnamed: 0,business name,business_id,state,city,latitude,longitude,category,avg_rating
0,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel Indoor lodging Inn Lodge Motel Mo...,5.0
1,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel Indoor lodging Inn Lodge Motel Mo...,5.0
2,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel Indoor lodging Inn Lodge Motel Mo...,5.0
3,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel Indoor lodging Inn Lodge Motel Mo...,5.0
4,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel Indoor lodging Inn Lodge Motel Mo...,5.0
...,...,...,...,...,...,...,...,...
29597,Mira Vista Resort,1525,Florida,Tucson,32.341444,-111.116306,Resort hotel,4.5
29610,Lehmann House Bed & Breakfast,1533,Nevada,Saint Louis,38.618421,-90.215006,Bed & Breakfast,4.4
29619,Vandyke Bed & Beverage,1545,Florida,Nashville,36.177200,-86.749783,Bed & Breakfast,4.0
29648,Lions Inn,1561,Florida,New Orleans,29.963970,-90.052454,Bed & Breakfast,4.5


In [197]:
# Keep only the first row for each distinct combination of identifying columns.
identity_columns = [
    'business name', 'business_id', 'state', 'city',
    'latitude', 'longitude', 'avg_rating',
]
already_seen = data.duplicated(subset=identity_columns)
data = data.loc[~already_seen]

In [198]:
# Renumber the rows from 0 after deduplication, so a row's label equals its
# position; the old index is discarded.
data = data.reset_index(drop=True)

In [199]:
data

Unnamed: 0,business name,business_id,state,city,latitude,longitude,category,avg_rating
0,Coloma Cottages,1,California,Coloma,38.799352,-120.885765,Resort hotel Indoor lodging Inn Lodge Motel Mo...,5.0
1,Mt. Shasta Nordic Center,2,California,Mt Shasta,41.307792,-122.212527,Ski resort,4.8
2,Surf Inn,3,California,San Clemente,33.399696,-117.594588,Motel,3.6
3,Rio Palace Motor Inn,4,California,Los Angeles,33.941136,-118.277916,Motel,3.7
4,Palm Motel,5,California,Santa Ana,33.717215,-117.867441,Motel,4.0
...,...,...,...,...,...,...,...,...
770,Mira Vista Resort,1525,Florida,Tucson,32.341444,-111.116306,Resort hotel,4.5
771,Lehmann House Bed & Breakfast,1533,Nevada,Saint Louis,38.618421,-90.215006,Bed & Breakfast,4.4
772,Vandyke Bed & Beverage,1545,Florida,Nashville,36.177200,-86.749783,Bed & Breakfast,4.0
773,Lions Inn,1561,Florida,New Orleans,29.963970,-90.052454,Bed & Breakfast,4.5


In [200]:
# Slim lookup table: just the name, id and rating of each business.
lookup_columns = ['business name', 'business_id', 'avg_rating']
ml_model = data.loc[:, lookup_columns]

In [201]:
ml_model

Unnamed: 0,business name,business_id,avg_rating
0,Coloma Cottages,1,5.0
1,Mt. Shasta Nordic Center,2,4.8
2,Surf Inn,3,3.6
3,Rio Palace Motor Inn,4,3.7
4,Palm Motel,5,4.0
...,...,...,...
770,Mira Vista Resort,1525,4.5
771,Lehmann House Bed & Breakfast,1533,4.4
772,Vandyke Bed & Beverage,1545,4.0
773,Lions Inn,1561,4.5


In [202]:
# Write the slim lookup table without its index; recomendacion_hotel() reads
# this file back.
ml_model.to_parquet('ml_model.parquet', index=False)

In [203]:
# Write the full deduplicated business table, also without its index.
data.to_parquet('machine_learning.parquet', index=False)

In [204]:
# TF-IDF vectorizer using scikit-learn's built-in English stop-word list.
# The same instance is refit on each text column in the following cells.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [205]:
# Fit the vectorizer on the business names and compute their TF-IDF matrix.
tfidf_businessname = tfidf_vectorizer.fit_transform(data['business name'])

In [206]:
# Convert the TF-IDF matrix to a dense NumPy array so it can be stacked with
# the other feature blocks.
tfidf_businessname = tfidf_businessname.toarray()

In [207]:
# Refit the shared vectorizer on the state column and densify in one step.
tfidf_state = tfidf_vectorizer.fit_transform(data['state']).toarray()

In [208]:
# Refit the shared vectorizer on the city column and densify in one step.
tfidf_city = tfidf_vectorizer.fit_transform(data['city']).toarray()

In [209]:
# Refit the shared vectorizer on the joined category strings and densify in one step.
tfidf_category = tfidf_vectorizer.fit_transform(data['category']).toarray()

In [210]:
# Place the four dense TF-IDF blocks side by side: one row per business,
# columns are the concatenated vocabularies.
feature_blocks = [tfidf_businessname, tfidf_state, tfidf_city, tfidf_category]
textual_ft = np.concatenate(feature_blocks, axis=1)

In [211]:
textual_ft.shape

(775, 1792)

In [212]:
# Project the stacked TF-IDF features onto 4 latent components; the fixed
# random_state keeps the decomposition reproducible between runs.
svd = TruncatedSVD(n_components=4, random_state=42)
reduced_features = svd.fit_transform(textual_ft)

In [213]:
print(reduced_features.shape)

(775, 4)


In [214]:
# Pairwise cosine similarity between all businesses in the reduced space.
# With a single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(reduced_features)

In [215]:
cosine_sim.shape

(775, 775)

In [216]:
# For every business keep the row positions of its 5 most similar *other*
# businesses, best first. The diagonal is masked before ranking because
# slicing [:, 1:6] only skips the business itself when it happens to sort
# first, which is not guaranteed when several rows tie (for example
# businesses with identical reduced vectors). A stable sort makes the
# tie-breaking order deterministic.
ranking_scores = cosine_sim.copy()
np.fill_diagonal(ranking_scores, -np.inf)
indices = np.argsort(-ranking_scores, axis=1, kind='stable')[:, :5]

In [217]:
# Persist the neighbour table that recomendacion_hotel() loads. Despite the
# file name, the array written here is `indices` (row positions of the most
# similar businesses), not the cosine-similarity matrix itself.
np.save('cos_sim.npy', indices)

In [218]:
def recomendacion_hotel(nombre_hotel: str):
    """Recommend businesses similar to the one named ``nombre_hotel``.

    Reads the business catalogue from ``ml_model.parquet`` and the
    precomputed neighbour table from ``cos_sim.npy`` (one row of positional
    indices per catalogue row), then returns the neighbours' names ordered by
    descending ``avg_rating``.

    Args:
        nombre_hotel: Business name to look up. Matching is case-insensitive
            and ignores leading/trailing whitespace. If several catalogue
            rows share the name, the first one is used.

    Returns:
        ``{'recomendaciones': [name, ...]}`` on success, or
        ``{"Error": "Nombre de negocio incorrecto"}`` when the argument is not
        a string or no catalogue row matches it.
    """
    error_message = {"Error": "Nombre de negocio incorrecto"}
    # A non-string argument can never match a name; report it like any other
    # unknown name instead of failing on .lower().
    if not isinstance(nombre_hotel, str):
        return error_message

    recomendacionhotel = pd.read_parquet('ml_model.parquet')
    wanted = nombre_hotel.strip().lower()
    # Normalise the catalogue names once and reuse the result for both the
    # existence check and the row lookup.
    known_names = recomendacionhotel['business name'].str.strip().str.lower()
    is_match = (known_names == wanted).fillna(False).to_numpy(dtype=bool)
    match_positions = np.flatnonzero(is_match)
    if match_positions.size == 0:
        return error_message

    neighbour_table = np.load('cos_sim.npy')
    # Use the row *position* (not the index label): both the neighbour table
    # and .iloc below are positional.
    row_position = match_positions[0]
    rec_hoteles = recomendacionhotel.iloc[neighbour_table[row_position]]
    rec_hoteles = rec_hoteles.sort_values(by='avg_rating', ascending=False)
    return {'recomendaciones': list(rec_hoteles['business name'])}

In [219]:
pd.read_parquet('ml_model.parquet')

Unnamed: 0,business name,business_id,avg_rating
0,Coloma Cottages,1,5.0
1,Mt. Shasta Nordic Center,2,4.8
2,Surf Inn,3,3.6
3,Rio Palace Motor Inn,4,3.7
4,Palm Motel,5,4.0
...,...,...,...
770,Mira Vista Resort,1525,4.5
771,Lehmann House Bed & Breakfast,1533,4.4
772,Vandyke Bed & Beverage,1545,4.0
773,Lions Inn,1561,4.5


In [220]:
recomendacion_hotel('Mt. Shasta Nordic Center')

{'recomendaciones': ['Campus By the Sea',
  "Edith Palmer's Country Inn",
  'Carriage House of New Hope',
  'Chimney Hill Estate Inn',
  'Americas Best Value Inn & Suites Wine Country']}