In [1]:
SHEET_NAME = 'Sheet1'
FILE_NAME = 'hotel_content.xlsx'
import numpy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
import os
path = 'data' + os.sep + FILE_NAME
df_news = pd.read_excel(path, sheet_name=SHEET_NAME)
df_news.head()

Unnamed: 0,url,theme,content
0,https://www.oyorooms.com/id/40374-oyo-rooms-oy...,RoomSpider - Room,Your browser javascript is disabled please ena...
1,http://www.hotelplatinum.co.in,RoomSpider - Room,hotelplatinuminfo gmail com 91 89610 50299 801...
2,http://www.novapatgartents.com,RoomSpider - Room,BOOK YOUR STAY NOW thewhitewonder 65 posts 1 6...
3,https://www.nordlybornholm.dk/rooms---rates,RoomSpider - Room,Home Rooms amp Rates Schools Activities About ...
4,https://www.salt-kisses.com/vacation-rentals-h...,RoomSpider - Room,times You do not have Javascript enabled in yo...


In [3]:
df_news.shape

(256, 3)

##### 查看缺失值

In [4]:
pd.isnull(df_news).values.any()

True

#### 删除缺失值

In [5]:
df_news = df_news.dropna(axis=0,how='any')
df_news.shape

(252, 3)

In [6]:
pd.isnull(df_news).values.any()

False

#### 查看数据统计信息

In [7]:
df_news.describe()

Unnamed: 0,url,theme,content
count,252,252,252
unique,252,1,251
top,https://www.aohostels.com/en/hotel/,RoomSpider - Room,ROOMS
freq,1,252,2


#### 查看数据占用大小信息

In [8]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252 entries, 0 to 255
Data columns (total 3 columns):
url        252 non-null object
theme      252 non-null object
content    252 non-null object
dtypes: object(3)
memory usage: 7.9+ KB


In [9]:
# 将内容转换成 list
content = df_news.content.values.tolist()
# print(content[2])

In [10]:
vectorizer = CountVectorizer()
content_S = []
# content_all_hotel = ' '.join(content)
# print(content_all_hotel)
for line in content:
    vectorizer.fit_transform([str(line)]).todense()
    # 获取词袋模型中的所有词语
    current_segment = vectorizer.get_feature_names()
    # current_segment = jieba.lcut(line)
    if len(current_segment) > 1 and current_segment != '\r\n': #换行符
        content_S.append(current_segment)


In [11]:
# content_S[2]
df_content=pd.DataFrame({'content_S':content_S})
df_content.head()

Unnamed: 0,content_S
0,"[00, 02, 101, 1148, 116, 119, 12, 17, 18, 180,..."
1,"[000, 123, 14, 200, 2019, 24, 26734, 50299, 80..."
2,"[10, 12, 1213, 13, 20, 2019, 26, 35, 370510, 5..."
3,"[about, activities, all, amp, and, beautiful, ..."
4,"[00, 01, 05, 06, 102, 103, 104, 109, 110, 125,..."


In [12]:
stopwords = pd.read_csv("stopwords.txt",index_col=False,sep="\t",quoting=3,names=['stopword'], encoding='utf-8')
stopwords = stopwords.stopword.values.tolist()
contents = df_content.content_S.values.tolist()
print(contents)
# stopwords.head(20)

[['00', '02', '101', '1148', '116', '119', '12', '17', '18', '180', '1854', '187', '1975', '2014', '2019', '537', '89', '919', '92', '94', 'about', 'ac', 'airport15', 'all', 'amenities', 'and', 'any', 'app', 'apple', 'applied', 'are', 'bed', 'belvena', 'besar', 'besbswy', 'blog', 'book', 'browser', 'budget', 'bundaran', 'bus', 'cabana0', 'can', 'cancellation', 'capacious', 'careers', 'category', 'check', 'choose', 'coffee', 'coffee0', 'conditions', 'continue', 'coupon', 'details', 'disabled', 'double', 'download', 'enable', 'excellent', 'exciting', 'f19', 'f20', 'f21', 'f76', 'f77', 'f78', 'fan', 'for', 'free', 'fri', 'get', 'good', 'government', 'gt', 'guest', 'guests', 'hatta', 'hi', 'hotel', 'hotels', 'hurry', 'id', 'in', 'in17', 'incl', 'indonesia', 'intercom', 'international', 'is', 'issued', 'istana', 'istiqlal', 'it', 'jakarta', 'jakarta4', 'javascript', 'juanda', 'kalisma', 'kit', 'kms', 'know', 'kota', 'left', 'limited', 'login', 'lower', 'me', 'merdeka', 'merdeka0', 'metro', 

In [13]:
def drop_stopwords(contents,stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean,all_words
    #print (contents_clean)
contents_clean,all_words = drop_stopwords(contents,stopwords)
print(len(contents_clean))
print(len(all_words))
# print(contents_clean,all_words)

#df_content.content_S.isin(stopwords.stopword)
#df_content=df_content[~df_content.content_S.isin(stopwords.stopword)]
#df_content.head()


250
54777


In [15]:
content_all_hotel = ' '.join(all_words)
print(content_all_hotel)
actual_matrix_result = vectorizer.fit_transform([content_all_hotel]).todense()
# 该类会统计每个词语的tf-idf权值
transformer = TfidfTransformer()
# 第一个fit_transform是计算tf-idf，第二个fit_transform是将文本转为词频矩阵
tfidf = transformer.fit_transform(actual_matrix_result)
# 将tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重
# 获取词袋模型中的所有词语
word = vectorizer.get_feature_names()
weight = tfidf.toarray()

00 101 1148 116 119 180 1854 187 1975 2019 537 919 ac airport15 amenities app apple applied bed belvena besar besbswy blog book browser budget bundaran bus cabana0 cancellation capacious careers category check choose coffee coffee0 conditions continue coupon details disabled double download enable excellent exciting f19 f20 f21 f76 f77 f78 fan free fri good government gt guest guests hatta hotel hotels hurry id in17 incl indonesia intercom international issued istana istiqlal jakarta jakarta4 javascript juanda kalisma kit kms kota left limited login lower merdeka merdeka0 metro millenium mirah modern mokka monas monumen museum nasional nearby night oct oct1 offers official offinclusive ongoogle oravel out18 oyo oyos padang partner pastiadaoyo50 photos platinum play pmcheck pmcouples policies policy policy2018 press previous price privacy private proof property queen railway ratings restaurants restoran reviews room rooms rp200844 rp206268 rp268142 rp281511 rp284345 rp293674 rp33000011 

In [16]:
map = {}
for j in range(len(word)):
    temp = 0
    for i in range(len(weight)):
        temp += weight[i][j]
    map.setdefault(word[j], temp)

list = sorted(map.items(), key=lambda item: item[1], reverse=True)
keys = [key for key, value in list]

In [17]:
print(keys)

['room', 'rooms', 'bed', 'book', 'amp', 'hotel', 'tv', 'view', 'shower', 'double', 'suite', 'amenities', 'contact', 'king', 'check', '2019', 'bathroom', 'home', 'free', 'stay', 'beds', 'guests', 'area', 'private', 'experience', 'offers', 'guest', 'coffee', 'service', 'wi', 'close', 'fi', 'queen', 'children', 'air', 'spacious', 'details', 'night', 'policy', 'living', 'suites', 'adults', 'family', 'access', 'dining', 'bedroom', 'features', 'size', 'city', 'enjoy', 'rates', 'deluxe', 'special', 'privacy', 'website', 'floor', 'full', 'breakfast', 'cookies', 'facilities', 'information', 'bar', 'comfortable', 'internet', 'location', 'space', 'availability', 'gallery', 'bath', 'reservation', 'services', 'offer', 'time', 'day', 'make', 'previous', 'single', 'site', 'desk', 'tea', 'top', 'large', 'comfort', 'content', 'reservations', 'safe', 'located', 'parking', 'sofa', 'conditioning', 'email', 'standard', 'balcony', 'events', 'flat', 'high', 'local', 'main', 'october', 'provide', 'screen', 't

### ROOMCLASS 词频提取

In [None]:
ROOMCLASS = [
    'Club Superior'
    , 'Signature Executive'
    , 'Disability Access'
    , 'Disable Friendly'
    , 'Deluxe Executie'
    , 'Deluxe Executive'
    , 'The Beatles'
    , 'Prima Donna'
    , 'Superior Deluxe'
    , 'Space Needle'
    , 'ultra-Luxury'
    , 'Snow Dove'
    , 'Exclusive'
    , 'Executive'
    , 'Deluxe'
    , 'Gallery'
    , 'Grand'
    , 'Junior'
    , 'Luxury'
    , 'Superior'
    , 'Senior'
    , 'Signature'
    , 'Superia'
    , 'Classic'
    , 'Premium'
    , 'Premier'
    , 'Presidential'
    , 'Lux'
    , 'Economical'
    , 'Economy'
    , 'Adjoining'
    , 'Bridal'
    , 'Prima'
    , 'Aloft'
    , 'Maestro'
    , 'Juliet'
    , 'Basic'
    , 'Cascade'
    , 'Classico'
    , 'Club'
    , 'Comfort'
    , 'Cozy'
    , 'Cabin'
    , 'Design'
    , 'Denny'
    , 'Ensemble'
    , 'Elite'
    , 'Family'
    , 'Families'
    , 'Friendly'
    , 'Guest'
    , 'Graduate'
    , 'Grizzly'
    , 'Goldfinch'
    , 'Honeymoon'
    , 'Harbour'
    , 'Hospitality'
    , 'Hummingbird'
    , 'Mini'
    , 'Lummi'
    , 'Majestic'
    , 'Maestro'
    , 'Master'
    , 'Meadowlark'
    , 'Minnie'
    , 'Monarch'
    , 'Northwoods'
    , 'Panoramic'
    , 'Prince'
    , 'Romantic'
    , 'Royal'
    , 'Shared'
    , 'Sorrento'
    , 'Starling'
    , 'Sandpiper'
    , 'Traditional'
    , 'Urban'
    , 'Business'
    , 'Standard'
    , 'Private'
    , 'Entire'
    , 'Non-view'
    , 'Spa'
             ]