### 基于酒店文本描述来推荐相似酒店

In [56]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import random
import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()

In [57]:
# Load the hotel listings from the CSV that sits next to the notebook.
# NOTE(review): latin-1 never fails to decode, so a wrong guess is silent; the "What?s" /
# "they?re" text in the outputs suggests apostrophes were lost somewhere — confirm the
# file's real encoding.
df = pd.read_csv('Seattle_Hotels.csv', encoding="latin-1")
df.head()

Unnamed: 0,name,address,desc
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the..."
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat..."
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ..."
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...


In [58]:
# (rows, columns) of the freshly loaded table.
df.shape

(152, 3)

In [118]:
# Show the second description by POSITION. Plain `[1]` is a label lookup; once the
# frame is indexed by hotel name it only works through a deprecated positional
# fallback (see the FutureWarning recorded for this cell), so use .iloc explicitly.
df['desc'].iloc[1]


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



"Located in the city's vibrant core, the Sheraton Grand Seattle provides a gateway to the diverse sights and sounds of the Pacific Northwest. Step out of our front doors to find gourmet dining and bars, world-class shopping, exciting entertainment, and iconic local attractions including the Pike Place Market, Space Needle and Chihuly Garden & Glass Museum. As one of only seven Sheraton hotels in North America to earn the esteemed Grand designation, guests can book confidently knowing they?re receiving the highest benchmark on product and service offerings available. Experience our recently completed multimillion-dollar transformation featuring all new guest rooms, an expanded Sheraton Club Lounge, and modern meeting & event spaces. Gather in our stylish new lobby and enjoy our private art collection featuring local artists while enjoying your favorite beverage from Starbucks. The Sheraton Grand features several dining options including Loulay Kitchen & Bar by James Beard award winning 

### 看一下酒店介绍中主要描述信息

In [60]:
# Learn the vocabulary and build the document-term count matrix in a single pass.
vec = CountVectorizer()
bag_of_words = vec.fit_transform(df['desc'])

In [61]:
# Peek at the raw counts as a dense array (rows = descriptions, columns = vocabulary terms).
bag_of_words.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [62]:
# (number of descriptions, vocabulary size).
bag_of_words.shape

(152, 3200)

In [63]:
# Sum down the rows: one total per vocabulary column, i.e. the corpus-wide count of each term.
sum_words = bag_of_words.sum(axis=0)
sum_words

matrix([[ 1, 11, 11, ...,  2,  6,  2]], dtype=int64)

In [64]:
# Pair every vocabulary term with its corpus-wide count; `vocabulary_` maps
# term -> column index into `sum_words`.
words_freq = [
    (term, sum_words[0, column])
    for term, column in vec.vocabulary_.items()
]
words_freq

[('located', 108),
 ('on', 129),
 ('the', 1258),
 ('southern', 1),
 ('tip', 1),
 ('of', 536),
 ('lake', 41),
 ('union', 33),
 ('hilton', 12),
 ('garden', 11),
 ('inn', 89),
 ('seattle', 533),
 ('downtown', 133),
 ('hotel', 295),
 ('is', 271),
 ('perfectly', 6),
 ('for', 216),
 ('business', 87),
 ('and', 1062),
 ('leisure', 18),
 ('neighborhood', 35),
 ('home', 57),
 ('to', 471),
 ('numerous', 1),
 ('major', 12),
 ('international', 32),
 ('companies', 6),
 ('including', 47),
 ('amazon', 19),
 ('google', 6),
 ('bill', 4),
 ('melinda', 4),
 ('gates', 5),
 ('foundation', 4),
 ('wealth', 1),
 ('eclectic', 8),
 ('restaurants', 35),
 ('bars', 7),
 ('make', 43),
 ('this', 63),
 ('area', 51),
 ('one', 75),
 ('most', 40),
 ('sought', 1),
 ('out', 23),
 ('by', 71),
 ('locals', 5),
 ('visitors', 4),
 ('our', 359),
 ('proximity', 8),
 ('allows', 3),
 ('take', 31),
 ('in', 449),
 ('some', 22),
 ('pacific', 42),
 ('northwest', 42),
 ('majestic', 4),
 ('scenery', 2),
 ('enjoy', 93),
 ('outdoor', 23),


In [65]:
# Order the (term, count) pairs from most to least frequent.
words_freq.sort(key=lambda pair: pair[1], reverse=True)
words_freq

[('the', 1258),
 ('and', 1062),
 ('of', 536),
 ('seattle', 533),
 ('to', 471),
 ('in', 449),
 ('our', 359),
 ('you', 304),
 ('hotel', 295),
 ('with', 280),
 ('is', 271),
 ('at', 231),
 ('from', 224),
 ('for', 216),
 ('your', 186),
 ('or', 161),
 ('center', 151),
 ('are', 136),
 ('downtown', 133),
 ('on', 129),
 ('we', 128),
 ('free', 123),
 ('as', 117),
 ('located', 108),
 ('rooms', 106),
 ('stay', 105),
 ('place', 102),
 ('all', 100),
 ('airport', 99),
 ('space', 97),
 ('market', 97),
 ('enjoy', 93),
 ('an', 91),
 ('pike', 90),
 ('inn', 89),
 ('business', 87),
 ('just', 82),
 ('city', 79),
 ('room', 77),
 ('one', 75),
 ('by', 71),
 ('breakfast', 68),
 ('needle', 68),
 ('suites', 67),
 ('washington', 67),
 ('that', 65),
 ('re', 64),
 ('this', 63),
 ('complimentary', 62),
 ('also', 62),
 ('amenities', 60),
 ('offer', 59),
 ('attractions', 59),
 ('away', 59),
 ('access', 59),
 ('home', 57),
 ('guest', 57),
 ('can', 55),
 ('it', 55),
 ('guests', 54),
 ('service', 53),
 ('experience', 52),

In [66]:
def get_top_n_words(corpus, n=None, stop_words=None, ngram_range=(1, 1)):
    """Return the ``n`` most frequent terms of ``corpus`` as ``(term, count)`` pairs.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int or None
        Number of pairs to return; ``None`` returns every term.
    stop_words : str, list or None
        Forwarded to ``CountVectorizer``; the default ``None`` keeps all words,
        ``'english'`` drops scikit-learn's built-in English stop words.
    ngram_range : tuple(int, int)
        Forwarded to ``CountVectorizer``; ``(1, 1)`` counts single words,
        ``(2, 2)`` counts bigrams, and so on.

    Returns
    -------
    list of (str, count)
        Sorted by descending count; ties keep the vocabulary's own order.
    """
    vec = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix = corpus-wide count of each term.
    sum_words = bag_of_words.sum(axis=0)
    # vocabulary_ maps term -> column index into sum_words.
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [67]:
# The 20 most frequent terms across all hotel descriptions.
common_words = get_top_n_words(df['desc'], n=20)

In [68]:
# Display the (term, count) pairs just computed.
common_words

[('the', 1258),
 ('and', 1062),
 ('of', 536),
 ('seattle', 533),
 ('to', 471),
 ('in', 449),
 ('our', 359),
 ('you', 304),
 ('hotel', 295),
 ('with', 280),
 ('is', 271),
 ('at', 231),
 ('from', 224),
 ('for', 216),
 ('your', 186),
 ('or', 161),
 ('center', 151),
 ('are', 136),
 ('downtown', 133),
 ('on', 129)]

In [69]:
# Tabulate the (term, count) pairs so they can be plotted; 'desc' holds the term itself.
df1 = pd.DataFrame(common_words,columns=['desc','count'])

In [70]:
# First rows of the term-frequency table.
df1.head()

Unnamed: 0,desc,count
0,the,1258
1,and,1062
2,of,536
3,seattle,533
4,to,471


In [71]:
import plotly.graph_objects as go

# Horizontal bar chart of the term counts held in df1 (stop words still included).
top_words_bar = go.Bar(x=df1['count'], y=df1['desc'], orientation='h')
fig = go.Figure(data=[top_words_bar])
fig.update_layout(
    height=600,
    title='Top 20 Words Before Removing Stopwords',
    xaxis_title='Count',
    yaxis_title='Words',
)
fig.show()


In [72]:
def get_top_n_words(corpus, n=None, stop_words='english', ngram_range=(1, 1)):
    """Return the ``n`` most frequent terms of ``corpus`` as ``(term, count)`` pairs.

    This definition replaces any earlier ``get_top_n_words`` in the kernel; by
    default it drops scikit-learn's built-in English stop words.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int or None
        Number of pairs to return; ``None`` returns every term.
    stop_words : str, list or None
        Forwarded to ``CountVectorizer``; pass ``None`` to keep every word.
    ngram_range : tuple(int, int)
        Forwarded to ``CountVectorizer``; ``(1, 1)`` counts single words.

    Returns
    -------
    list of (str, count)
        Sorted by descending count; ties keep the vocabulary's own order.
    """
    vec = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix = corpus-wide count of each term.
    sum_words = bag_of_words.sum(axis=0)
    # vocabulary_ maps term -> column index into sum_words.
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [73]:
# Top-20 single words once English stop words are excluded.
common_words = get_top_n_words(df['desc'], 20)
df2 = pd.DataFrame(common_words, columns=['desc', 'count'])

# `go` (plotly.graph_objects) is already imported by the first bar-chart cell.
fig = go.Figure(data=[
    go.Bar(
        x=df2['count'],
        y=df2['desc'],
        orientation='h'
    )
])

fig.update_layout(
    # The counts here exclude stop words, so the title must say "After".
    title='Top 20 Words After Removing Stopwords',
    xaxis_title='Count',
    yaxis_title='Words',
    height=600
)

fig.show()

In [74]:
def get_top_n_words(corpus, n=None, stop_words='english', ngram_range=(2, 2)):
    """Return the ``n`` most frequent n-grams of ``corpus`` as ``(term, count)`` pairs.

    This definition replaces any earlier ``get_top_n_words`` in the kernel; by
    default it counts bigrams after dropping scikit-learn's English stop words.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int or None
        Number of pairs to return; ``None`` returns every term.
    stop_words : str, list or None
        Forwarded to ``CountVectorizer``; pass ``None`` to keep every word.
    ngram_range : tuple(int, int)
        Forwarded to ``CountVectorizer``; ``(2, 2)`` counts two-word phrases,
        ``(1, 1)`` single words.

    Returns
    -------
    list of (str, count)
        Sorted by descending count; ties keep the vocabulary's own order.
    """
    vec = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix = corpus-wide count of each term.
    sum_words = bag_of_words.sum(axis=0)
    # vocabulary_ maps term -> column index into sum_words.
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [75]:
# Top-20 two-word phrases (bigrams) once English stop words are excluded.
common_words = get_top_n_words(df['desc'], 20)
df3 = pd.DataFrame(common_words, columns=['desc', 'count'])

# `go` (plotly.graph_objects) is already imported by the first bar-chart cell.
fig = go.Figure(data=[
    go.Bar(
        x=df3['count'],
        y=df3['desc'],
        orientation='h'
    )
])

fig.update_layout(
    # This chart shows bigrams with stop words removed; label it accordingly.
    title='Top 20 Bigrams After Removing Stopwords',
    xaxis_title='Count',
    yaxis_title='Bigrams',
    height=600
)

fig.show()

### 描述的一些统计信息

In [76]:
# Whitespace-separated token count of every description.
df['word_count'] = [len(str(text).split()) for text in df['desc']]

In [77]:
# Confirm the new word_count column is present.
df.head()

Unnamed: 0,name,address,desc,word_count
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the...",184
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat...",152
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ...",147
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...,150
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...,151


In [78]:
# Histogram of description lengths, drawn with plotly.express.
import plotly.express as px

fig = px.histogram(
    df, x='word_count', nbins=50, height=500,
    title='Distribution of Word Count in Hotel Descriptions',
    labels={'word_count': 'Word Count', 'count': 'Frequency'},
)

# Axis titles, plus a small gap between neighbouring bars.
fig.update_layout(xaxis_title='Word Count', yaxis_title='Frequency', bargap=0.1)

fig.show()

### 文本处理

In [82]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

# Pre-compiled patterns used by clean_txt.
# Any run of whitespace (spaces, tabs, line breaks) becomes a single space, so that
# stripping symbols afterwards can never glue two neighbouring words together.
whitespace_run = re.compile(r'\s+')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_' is removed.
sub_replace = re.compile('[^0-9a-z #+_]')

def clean_txt(text):
    """
    Normalise one description for vectorisation.

    Steps:
    1. lowercase the text
    2. collapse every whitespace run (including newlines/tabs) to one space
    3. delete characters other than ``0-9 a-z space # + _``
    4. drop words found in ``ENGLISH_STOP_WORDS``

    Parameters: ``text`` - any value; non-strings are converted with ``str``.
    Returns: the cleaned words joined by single spaces (``''`` for missing input).
    """
    # A missing value (None / float NaN) must not turn into the literal word "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Must run before symbol stripping: '\n' and '\t' are not in the allowed set
    # and would otherwise be deleted, merging the words on either side.
    text = whitespace_run.sub(' ', text)
    text = sub_replace.sub('', text)
    return ' '.join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)

# Store a cleaned copy of every description next to the original.
df['desc_clean'] = df['desc'].map(clean_txt)

# Print the first description before and after cleaning as a sanity check.
first_raw, first_clean = df['desc'].iloc[0], df['desc_clean'].iloc[0]
print("原始文本示例：")
print(first_raw)
print("\n清理后的文本示例：")
print(first_clean)

原始文本示例：
Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. 
The neighborhood is home to numerous major international companies including Amazon, Google and the Bill & Melinda Gates Foundation. A wealth of eclectic restaurants and bars make this area of Seattle one of the most sought out by locals and visitors. Our proximity to Lake Union allows visitors to take in some of the Pacific Northwest's majestic scenery and enjoy outdoor activities like kayaking and sailing. over 2,000 sq. ft. of versatile space and a complimentary business center. State-of-the-art A/V technology and our helpful staff will guarantee your conference, cocktail reception or wedding is a success. Refresh in the sparkling saltwater pool, or energize with the latest equipment in the 24-hour fitness center. Tastefully decorated and flooded with natural light, our guest rooms and suites offer everything you need to relax and stay produ

In [83]:
# Confirm the desc_clean column sits next to the raw text.
df.head()

Unnamed: 0,name,address,desc,word_count,desc_clean
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the...",184,located southern tip lake union hilton garden ...
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat...",152,located citys vibrant core sheraton grand seat...
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ...",147,located heart downtown seattle awardwinning cr...
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...,150,whats near hotel downtown seattle location bet...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...,151,situated amid incredible shopping iconic attra...


In [84]:
# First raw description, selected by position so the cell keeps working even after
# the frame is re-indexed by hotel name.
df['desc'].iloc[0]

"Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. \nThe neighborhood is home to numerous major international companies including Amazon, Google and the Bill & Melinda Gates Foundation. A wealth of eclectic restaurants and bars make this area of Seattle one of the most sought out by locals and visitors. Our proximity to Lake Union allows visitors to take in some of the Pacific Northwest's majestic scenery and enjoy outdoor activities like kayaking and sailing. over 2,000 sq. ft. of versatile space and a complimentary business center. State-of-the-art A/V technology and our helpful staff will guarantee your conference, cocktail reception or wedding is a success. Refresh in the sparkling saltwater pool, or energize with the latest equipment in the 24-hour fitness center. Tastefully decorated and flooded with natural light, our guest rooms and suites offer everything you need to relax and stay productive.

In [85]:
# First cleaned description, selected by position so the cell keeps working even
# after the frame is re-indexed by hotel name.
df['desc_clean'].iloc[0]

'located southern tip lake union hilton garden inn seattle downtown hotel perfectly located business leisure neighborhood home numerous major international companies including amazon google melinda gates foundation wealth eclectic restaurants bars make area seattle sought locals visitors proximity lake union allows visitors pacific northwests majestic scenery enjoy outdoor activities like kayaking sailing 2000 sq ft versatile space complimentary business center stateoftheart av technology helpful staff guarantee conference cocktail reception wedding success refresh sparkling saltwater pool energize latest equipment 24hour fitness center tastefully decorated flooded natural light guest rooms suites offer need relax stay productive unwind bar enjoy american cuisine breakfast lunch dinner restaurant 24hour pavilion pantry stocks variety snacks drinks sundries'

### 相似度计算

In [86]:
# Use the hotel name as the row label. The guard makes the cell safe to re-run:
# once 'name' has become the index it is no longer a column, and calling
# set_index('name') a second time would raise KeyError.
if 'name' in df.columns:
    df = df.set_index('name')

In [87]:
# TF-IDF over word unigrams, bigrams and trigrams, with English stop words dropped.
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english')

In [88]:
# Fit on the cleaned descriptions and vectorise them in one step (one row per hotel).
tfidf_matrix=tf.fit_transform(df['desc_clean'])

In [89]:
# (number of hotels, number of distinct 1- to 3-gram features).
tfidf_matrix.shape

(152, 27001)

In [90]:
# Pairwise dot products between all hotel vectors.
# NOTE(review): this equals cosine similarity only if the rows are L2-normalised, which
# is TfidfVectorizer's documented default (norm='l2') and is not overridden where `tf`
# is built — confirm if that constructor call changes.
cosine_similarity =linear_kernel(tfidf_matrix,tfidf_matrix)

In [91]:
# Square matrix: one row and one column per hotel.
cosine_similarity.shape

(152, 152)

In [108]:
# Similarity of the first hotel to every hotel (its own entry comes first).
cosine_similarity[0]

array([1.        , 0.01153691, 0.02685919, 0.00788776, 0.02968372,
       0.01211759, 0.01892131, 0.01295234, 0.00746124, 0.01937892,
       0.01370375, 0.00892963, 0.01394245, 0.00571003, 0.00651706,
       0.01680824, 0.01133379, 0.03489329, 0.00864453, 0.02312328,
       0.02879236, 0.00918459, 0.00784411, 0.01586572, 0.01472076,
       0.02402029, 0.02906451, 0.00498255, 0.02487683, 0.02009584,
       0.01528377, 0.02923925, 0.01624488, 0.02207616, 0.03437377,
       0.02737745, 0.00682674, 0.01171681, 0.03182895, 0.03245281,
       0.01770666, 0.01065604, 0.01341233, 0.0263306 , 0.04661615,
       0.01105609, 0.03165515, 0.01330748, 0.03209152, 0.01390424,
       0.02306914, 0.01856613, 0.03500288, 0.01345726, 0.02614755,
       0.01354738, 0.02347766, 0.02973355, 0.01149806, 0.02489298,
       0.0219577 , 0.0102064 , 0.03983313, 0.02891626, 0.02471638,
       0.0121596 , 0.02836503, 0.01141957, 0.01682957, 0.00713343,
       0.02004212, 0.01653828, 0.02307208, 0.02166928, 0.02347

In [109]:
# Position -> hotel name lookup, in the same row order as the similarity matrix.
indices = pd.Series(df.index)
indices.head()

0    Hilton Garden Seattle Downtown
1            Sheraton Grand Seattle
2     Crowne Plaza Seattle Downtown
3     Kimpton Hotel Monaco Seattle 
4                The Westin Seattle
Name: name, dtype: object

In [115]:
def recommendations(name, cosine_similarity, top_n=10):
    """Return the names of the hotels most similar to ``name``.

    Parameters
    ----------
    name : str
        Hotel to find neighbours for. Leading/trailing whitespace is ignored on
        both sides of the comparison, because some names in the data carry a
        trailing space.
    cosine_similarity : 2-D array-like
        Square similarity matrix whose row/column order matches the global
        ``indices`` Series (position -> hotel name).
    top_n : int
        How many recommendations to return (default 10).

    Returns
    -------
    list of str
        Up to ``top_n`` hotel names, most similar first, never including the
        query hotel itself.

    Raises
    ------
    IndexError
        If ``name`` does not match any hotel in ``indices``.
    """
    wanted = str(name).strip()
    matches = indices[indices.str.strip() == wanted]
    if matches.empty:
        raise IndexError(f"hotel {name!r} not found in indices")
    idx = matches.index[0]

    # Remove the query hotel explicitly instead of assuming it sorts first:
    # another hotel with an identical description also scores 1.0, and the
    # tie could otherwise put the query hotel itself into the results.
    scores = pd.Series(cosine_similarity[idx]).drop(idx)
    # A stable sort keeps tied hotels in their original row order.
    best = scores.sort_values(ascending=False, kind='mergesort').head(top_n).index
    return indices.loc[best].tolist()

In [117]:
# Example query: hotels whose descriptions are closest to The Westin Seattle's.
recommendations('The Westin Seattle',cosine_similarity)

['Renaissance Seattle Hotel',
 'Hyatt Regency Seattle',
 'Quality Inn & Suites Seattle Center',
 'Staybridge Suites Seattle Downtown - Lake Union',
 'Inn at the Market',
 'Residence Inn by Marriott Seattle Downtown/Lake Union',
 'Hyatt Place Seattle',
 'Gand Hyatt Seattle',
 'Silver Cloud Inn - Seattle Lake Union',
 'Crown Inn Motel']