**Load Library**

In [2]:
import os
import spacy
import pandas as pd
import numpy as np
import geopandas as gpd
import re
import math
import string
import unicodedata
import gensim
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
import contextily as ctx
import urllib.request
import ast  # 用于安全地将字符串转换为列表

from scipy.spatial.distance import cdist

from shapely.geometry import Point

from sklearn.preprocessing import OneHotEncoder  # We don't use this but I point out where you *could*
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC



from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams, FreqDist

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.matutils import corpus2dense
from gensim.models import tfidfmodel
from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim.models import KeyedVectors
from gensim.models.ldamodel import LdaModel

from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS

from joblib import dump
from joblib import load

# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')
# English stop words, held in a set.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
stopword_list = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SBH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the listing table that carries the normalised amenities text ('amenities_norm').
Airbnb_Listing = pd.read_csv(os.path.join("..","Data","Airbnb_Listing_norm.csv"))

In [4]:
# Preview the first ten normalised amenity strings.
Airbnb_Listing['amenities_norm'].head(10)

0    . heating  standard cable  wifi  smoke alarm  ...
1    . window guard  bathtub  water kettle  laundro...
2    . bathtub  water kettle  laundromat nearby  pr...
3    . shampoo  luggage dropoff allow  dryer  micro...
4    . window guard  bathtub  water kettle  laundro...
5    . single level home  bathtub  water kettle  la...
6    . bathtub  water kettle  laundromat nearby  pa...
7    . bathtub  free dryer . unit  water kettle  la...
8    . shampoo  luggage dropoff allow  microwave  c...
9    . heating  hair dryer  iron  washer  lock bedr...
Name: amenities_norm, dtype: object

In [5]:
# Normalised amenity strings, one per listing.
# NOTE(review): this name is not referenced again in the visible cells — confirm it is still needed.
texts_word2vec = Airbnb_Listing['amenities_norm']

In [6]:
# Load the normalised, already-tokenised amenities table
# (one row per listing, one token per column, short rows padded with NaN).
amenities_norm_split = pd.read_csv(os.path.join("..","Data","amenities_norm_split.csv"))

# The CSV was written together with its index, which comes back as an
# 'Unnamed: 0' column. DataFrame.drop returns a NEW frame, so the result has
# to be bound to a name; previously it was discarded and the index column
# stayed in the data that is later turned into token lists.
amenities_ast_literal = amenities_norm_split.drop(columns='Unnamed: 0')
amenities_ast_literal

  amenities_norm_split = pd.read_csv(os.path.join("..","Data","amenities_norm_split.csv"))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,208,209,210,211,212,213,214,215,216,217
0,heating,standard,cable,wifi,smoke,alarm,dryer,kitchen,washer,essentials,...,,,,,,,,,,
1,window,guard,bathtub,water,kettle,laundromat,nearby,indoor,fireplace,microwave,...,,,,,,,,,,
2,bathtub,water,kettle,laundromat,nearby,private,patio,balcony,paid,street,...,,,,,,,,,,
3,shampoo,luggage,dropoff,allow,dryer,microwave,coffee,maker,water,iron,...,,,,,,,,,,
4,window,guard,bathtub,water,kettle,laundromat,nearby,free,driveway,park,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87941,water,kettle,laundromat,nearby,shampoo,luggage,dropoff,allow,dryer,coffee,...,,,,,,,,,,
87942,bathtub,free,dryer,unit,water,kettle,private,patio,balcony,fire,...,,,,,,,,,,
87943,window,guard,bathtub,water,kettle,laundromat,nearby,paid,street,park,...,,,,,,,,,,
87944,water,kettle,dryer,microwave,coffee,maker,linens,iron,patio,balcony,...,,,,,,,,,,


In [7]:
# Build one token list per listing for Word2Vec.
# pandas pads short rows with NaN (a float), which is never `None`, so the
# former `item is not None` test kept every padding cell; pd.notna removes them.
# The saved index column ('Unnamed: 0') is not a token either, so it is
# dropped here as well if it is still present.
list_of_lists = (
    amenities_ast_literal
    .drop(columns='Unnamed: 0', errors='ignore')
    .apply(lambda row: [item for item in row if pd.notna(item)], axis=1)
    .tolist()
)



In [8]:
# Sanity check: number of token lists (one per row of the amenities table).
print(len(list_of_lists))

87946


In [10]:

# Train Word2Vec on the tokenised amenities.


# Training hyper-parameters
dims = 500
window = 20

# Train the Word2Vec model on the per-listing token lists.
# NOTE(review): with workers=4 gensim training is not guaranteed to be reproducible
# run-to-run — confirm whether that matters for the downstream classifiers.
model = Word2Vec(sentences=list_of_lists, vector_size=dims, window=window, min_count=3, workers=4)



In [18]:
# Save the trained Word2Vec model; the file name encodes the vector size and window used.
model.save(os.path.join("..","Model",f"word2vec-d{dims}-w{window}.model"))

读取数据源

In [12]:
# Load the original listings export; its price, minimum_nights and
# number_of_reviews_ltm columns feed the income estimate below.
Airbnb_Listing_origin = pd.read_csv(os.path.join("..","Data","Data_InsideAirbnb","listings.csv.gz"))

  Airbnb_Listing_origin = pd.read_csv(os.path.join("..","Data","Data_InsideAirbnb","listings.csv.gz"))


计算所有 listing 的 average income


In [13]:
# Estimate each listing's income and the mean income over all listings.
# When 'price' was read as text, strip the '$' and ',' formatting and convert to float.
# regex=False makes '$' a literal character: on pandas < 2.0 Series.str.replace
# defaults to regex matching, where '$' is the end-of-string anchor, so the dollar
# sign would survive and astype(float) would raise.
if Airbnb_Listing_origin['price'].dtype == 'object':
    Airbnb_Listing_origin['price'] = (
        Airbnb_Listing_origin['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

# NOTE(review): 2.7 is an unexplained multiplier — document where it comes from.
# NOTE(review): the assignment aligns on the row index of the two frames; this assumes
# Airbnb_Listing and Airbnb_Listing_origin describe the same listings in the same
# index positions — confirm.
Airbnb_Listing['sum_income'] = (
    Airbnb_Listing_origin['minimum_nights']
    * 2.7
    * Airbnb_Listing_origin['number_of_reviews_ltm']
    * Airbnb_Listing_origin['price']
)

average_income_forlisting = Airbnb_Listing['sum_income'].mean()
average_income_forlisting

7194.986408705343

In [14]:
# Binary target: 1 when the estimated income is at or above the mean, otherwise 0.
# A NaN sum_income compares False here and therefore also becomes 0.
Airbnb_Listing['profitable'] = (Airbnb_Listing['sum_income'] >= average_income_forlisting).astype(int)

In [15]:
# One token list per listing, used to build the document vectors.
# The loaded frame still contains the saved index column ('Unnamed: 0') and the
# NaN padding of short rows; neither is an amenity token, so both are removed
# instead of being passed on as if they were words.
amenities_norm_split_doc = (
    amenities_norm_split
    .drop(columns='Unnamed: 0', errors='ignore')
    .apply(lambda row: [item for item in row if pd.notna(item)], axis=1)
)


In [16]:
# 转换文本向量
def document_vector(word2vec_model, doc):
    """Average the embeddings of the tokens in ``doc`` that the model knows.

    Parameters
    ----------
    word2vec_model
        Object exposing ``wv.key_to_index``, ``wv[list_of_tokens]`` and ``vector_size``.
    doc
        Iterable of tokens.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or a zero vector of
        length ``vector_size`` when none of the tokens is in the vocabulary.
    """
    vocabulary = word2vec_model.wv.key_to_index
    known_tokens = [token for token in doc if token in vocabulary]
    # Nothing to average: fall back to an all-zero vector.
    if not known_tokens:
        return np.zeros(word2vec_model.vector_size)
    token_vectors = word2vec_model.wv[known_tokens]
    return np.mean(token_vectors, axis=0)

# Compute one vector per document and stack them into a single array.
doc_vectors = np.array([document_vector(model, doc) for doc in amenities_norm_split_doc])

分词处理：您使用的是 text.split("  ")（两个空格）来分词。这意味着您假设文本中的每个单词之间由两个空格分隔。请确保这与您的数据格式一致。如果是普通英文文本，通常单词之间只有一个空格，那么应该使用不带参数的 text.split()，它会按任意数量的空白字符进行切分。

空文档处理：在 document_vector 函数中，如果文档中所有的词都不在模型的词汇表中，那么 word2vec_model.wv[doc] 将是一个空列表，这会导致 np.mean 报错。您需要处理这种情况。（上面的 document_vector 已经处理了这种情况：当过滤后的文档为空时，直接返回长度为 vector_size 的零向量。）

文档向量计算：当您计算文档向量时，您使用的是 np.array([document_vector(model, doc) for doc in texts])。这里 texts 应该是分词后的文本数据。请确保 texts 和 texts_word2vec 是一致的，即 texts 应该是用于训练 Word2Vec 模型的相同数据。

标签和特征数据：确保 labels 是与 doc_vectors 对应的目标变量数组。labels 应该有与 doc_vectors 相同数量的元素。

模型性能评估：在最后，您计算了准确率，这是评估分类模型性能的一个常用指标。根据您的应用情况，可能还需要考虑其他指标，如精确率、召回率和F1分数。

异常和错误处理：在实际应用中，建议添加异常处理和错误检查，确保代码的健壮性。

In [8]:
# Random forest approach

# Use a classifier to predict whether a listing's income is at or above the average
# Split into training and test sets (20% held out, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, Airbnb_Listing['profitable'], test_size=0.2, random_state=42)

# Train the classifier
classifier = RandomForestClassifier(n_estimators=300, random_state=42)
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
# NOTE(review): only accuracy is reported; if 'profitable' is imbalanced, a
# majority-class predictor could score similarly — confirm against the class balance.
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8054007959067652


In [17]:
# SVM approach
# Split into training and test sets (same arguments and random_state as the random forest cell)
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, Airbnb_Listing['profitable'], test_size=0.2, random_state=42)

# Create the SVM classifier with default hyper-parameters
svm_classifier = SVC(random_state=42)

# Train the classifier
svm_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))



Accuracy: 0.7957362137578169


In [15]:
# Hyper-parameter grid to search: 4 C values x 2 gamma settings x 3 kernels = 24 combinations
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale', 'auto'], 
    'kernel': ['linear', 'rbf', 'poly']
}

# Grid search over SVC with 5-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy')

# Fit the grid search on the training split
# NOTE(review): 24 combinations x 5 folds = 120 SVC fits; no output is recorded for
# this cell, so confirm it actually completes in reasonable time on the full data.
grid_search.fit(X_train, y_train)

# Report the best parameter combination
print("Best parameters:", grid_search.best_params_)

# Predict on the test set with the best model found by the search
y_pred = grid_search.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


保存随机森林模型与SVM模型（注意：下面的单元格只保存了这两个模型，并没有保存 Airbnb_Listing_norm_income）

In [11]:
# Persist the fitted random forest and the default-parameter SVC with joblib.
# Both files are written to the current working directory.
# NOTE(review): the tuned grid_search estimator is not saved here — confirm that is intended.
dump(classifier, 'randforest_classfier_model.joblib')
dump(svm_classifier, 'svm_classfier_model.joblib')

['svm_classfier_model.joblib']

词向量可视化

In [None]:
# Load a previously saved Word2Vec model.
# Models are saved under "../Model" by the training section of this notebook, so the
# path is resolved relative to the same parent directory (it used to point at "Model"
# inside the current working directory, which no executed cell writes to).
# NOTE(review): this loads the d400/w10 model, not the d500/w20 one trained above — confirm which is wanted.
word2vec_model = Word2Vec.load(os.path.join("..","Model","word2vec-d400-w10.model"))

In [None]:
# Pull the normalised amenity strings out as a plain Python list.
amenities_norm_split = Airbnb_Listing['amenities_norm'].tolist()
# Show only a short preview: echoing the full list would write one string per
# listing into the notebook output.
amenities_norm_split[:5]

In [None]:
def preprocess_and_tokenize(list_of_strings):
    """Turn normalised amenity strings into lists of tokens.

    Every '.' character is deleted and the remainder is split on whitespace.

    Parameters
    ----------
    list_of_strings
        Iterable of amenity strings. Entries that are not ``str`` (for example
        the NaN pandas produces for an empty CSV cell, or ``None``) are accepted.

    Returns
    -------
    list of list of str
        One token list per input entry, in input order; non-string entries
        yield an empty list instead of raising ``AttributeError``.
    """
    processed_lists = []
    # The loop variable is named `text` so it no longer shadows the `string` module.
    for text in list_of_strings:
        if not isinstance(text, str):
            processed_lists.append([])
            continue
        # str.split() without arguments already collapses any run of whitespace,
        # so no separate double-space replacement is required.
        processed_lists.append(text.replace('.', '').split())
    return processed_lists

In [None]:
# Tokenise every amenities string.
amenities_norm_split = preprocess_and_tokenize(amenities_norm_split)

# Lay the tokens out one per column, one row per listing.
amenities_norm_split = pd.DataFrame(amenities_norm_split)

# Write the table where the loading cell reads it from ("../Data/amenities_norm_split.csv");
# the path used to point at "Data" inside the current working directory instead.
# The index is written as well (to_csv default), which is why it comes back as an
# 'Unnamed: 0' column when the file is loaded.
amenities_norm_split.to_csv(os.path.join("..","Data","amenities_norm_split.csv"))

将训练后model应用于'amenities'列

In [None]:

"""# 预处理函数
def preprocess(text):
    # 这里添加文本清洗逻辑（例如：转换为小写，去除标点等）
    return text.lower()"""

# 向量化函数
def vectorize(text, model):
    """Return the mean embedding of the words in ``text`` that ``model`` knows.

    Parameters
    ----------
    text
        Either an iterable of tokens or a single string. A string is split on
        whitespace first; without that step it would be iterated character by
        character and almost never match a vocabulary entry.
    model
        Object exposing ``wv.key_to_index``, ``wv[word]`` and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known words' vectors, or a zero vector of
        length ``vector_size`` when no word is in the vocabulary.
    """
    if isinstance(text, str):
        text = text.split()
    vocabulary = model.wv.key_to_index
    # Keep only the words the model has an embedding for.
    words = [word for word in text if word in vocabulary]
    # No known word: return the zero vector.
    if not words:
        return np.zeros(model.vector_size)
    # Average the individual word vectors.
    word_vectors = [model.wv[word] for word in words]
    return np.mean(word_vectors, axis=0)

