**Loading Libraries**

In [None]:
import os
import spacy
import pandas as pd
import numpy as np
import geopandas as gpd
import re
import math
import string
import unicodedata
import gensim
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from joblib import dump
from joblib import load
import contextily as ctx
import urllib.request

from scipy.spatial.distance import cdist

from shapely.geometry import Point

from sklearn.preprocessing import OneHotEncoder  # We don't use this but I point out where you *could*
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams, FreqDist

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.matutils import corpus2dense
from gensim.models import tfidfmodel
from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim.models import KeyedVectors
from gensim.models.ldamodel import LdaModel

from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS

# Download the NLTK stop-word corpus, then keep the English entries in a set
# for fast membership checks.
nltk.download('stopwords')
stopword_list = {*stopwords.words('english')}

# Import everything from textual/__init__.py,
# which includes a bunch of tools and functions we can use for NLP
from textual import *

**Latent Dirichlet Allocation**

使用TF-IDF模型进行向量化

In [None]:
# Fit a TF-IDF vectoriser on the corpus and transform it in a single pass.
# The n-gram range and the document-frequency cut-offs are the parameters to
# adjust when trying to improve the model.
tfvectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 4),
    min_df=0.005,
    max_df=0.3,
    use_idf=True,
)
# NOTE: the name `tfidf_corpus` is rebound later in the notebook to a gensim
# TF-IDF corpus; here it holds the scikit-learn document-term matrix.
tfidf_corpus = tfvectorizer.fit_transform(corpus)

# Convert the whole transformed corpus to a dense DataFrame whose columns are
# the vectoriser's feature names (one row per document).
feature_names = tfvectorizer.get_feature_names_out()
tfidf_dataframe = pd.DataFrame(data=tfidf_corpus.toarray(), columns=feature_names)

# Report the number of documents (rows) and features (columns).
print(f"TF/IDF data frame has {tfidf_dataframe.shape[0]:,} rows and {tfidf_dataframe.shape[1]:,} columns.")


构建和训练LDA模型

接着，构建LDA模型并用预处理后的数据进行训练。

In [None]:
# Choose the number of LDA topics by fitting one model per candidate count
# and scoring each with gensim's c_v coherence.

# Prepare the structures gensim needs: whitespace-tokenised documents, a
# dictionary assigning an id to every token, and the bag-of-words corpus.
texts = [doc.split() for doc in corpus]
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

# Re-weight the bag-of-words corpus with TF-IDF.
# NOTE: this rebinds `tfidf_corpus`, which an earlier cell used for the
# scikit-learn document-term matrix.
tfidf_model = TfidfModel(bow_corpus)
tfidf_corpus = [tfidf_model[doc] for doc in bow_corpus]

# Candidate topic counts to evaluate (1 to 40 inclusive). Keeping them in a
# list lets the optimum be looked up directly rather than reconstructed from
# an index with a hand-written offset.
topic_counts = list(range(1, 41))
coherence_values = []

for n_topics in topic_counts:
    LDA = LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=n_topics,
                   random_state=42, iterations=800)
    CM = CoherenceModel(model=LDA, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_values.append(CM.get_coherence())

# Pick the candidate whose model achieved the highest coherence score.
optimal_topics = topic_counts[int(np.argmax(coherence_values))]
print(f"Optimal number of topics: {optimal_topics}")


In [None]:
# Persist the coherence score obtained for each candidate topic count.
# (Author's rule of thumb: coherence scores generally fall between 0 and 1,
# and a score above about 0.5 is usually taken to mean a reasonable fit.)

# One row per candidate: topic counts start at 1, matching the order in which
# the scores were appended to `coherence_values`.
LDA_topic_coherence_data = {
    'Topic_Num': range(1, len(coherence_values) + 1),
    'Coherence_Score': coherence_values,
}
LDA_topic_coherence_frame = pd.DataFrame(LDA_topic_coherence_data)

# Write the table to CSV. The output directory is created first so the save
# does not fail on a fresh checkout, and the path is built with os.path.join
# like the other save cells in this notebook.
os.makedirs("Data", exist_ok=True)
LDA_topic_coherence_frame.to_csv(os.path.join("Data", "coherence_values.csv"), index=False)



将主题分配给文档

对每个文档（房源描述）分配一个主题。

In [None]:
# Fit the final LDA model with the topic count that scored the highest
# coherence in the saved coherence table.
best_row = LDA_topic_coherence_frame['Coherence_Score'].idxmax()
best_topic_count = LDA_topic_coherence_frame.loc[best_row, 'Topic_Num']
LDA = LdaModel(
    corpus=tfidf_corpus,
    id2word=dictionary,
    num_topics=best_topic_count,
    random_state=42,
    iterations=800,
)

# Topic distribution for every document, in corpus order.
doc_topics = []
for bow in tfidf_corpus:
    doc_topics.append(LDA.get_document_topics(bow))

# Main words of each topic (40 words per topic).
topics = LDA.print_topics(num_words=40)

查看主题和关键词

查看模型找出的主题和每个主题的关键词，并保存结果。

In [None]:
# Collect the top 20 words (with weights) of every topic in the fitted model.
# The topic count is taken from the model itself: a fixed number would return
# only a subset of topics whenever the selected model has more topics than
# that number.
topics = LDA.show_topics(num_topics=LDA.num_topics, num_words=20, formatted=False)

# Flatten to one record per (topic, word) pair.
LDA_topics_data = [
    {'Topic': topic_num, 'Word': word, 'Weight': weight}
    for topic_num, topic_words in topics
    for word, weight in topic_words
]

# Convert to a DataFrame.
LDA_topics_frame = pd.DataFrame(LDA_topics_data)

# Save the topics produced by this model.
LDA_topics_frame.to_csv(os.path.join("Data","lda_topics_and_words.csv"), index=False)


将doc_topic转换为dataframe，每个文档（listing）的主题分布及其占比

In [None]:
# Build a (documents x topics) table of topic weights.
#
# Each entry of `doc_topics` is a list of (topic_id, weight) pairs, and the
# lists can differ in length between documents. Feeding them straight into
# pd.DataFrame would produce columns keyed by list position rather than topic
# id, so the matrix is filled explicitly instead: every topic of the model
# gets a column, and topics absent from a document's list stay at 0.0.
n_topics = LDA.num_topics
topic_matrix = np.zeros((len(doc_topics), n_topics), dtype=float)
for doc_idx, topic_weights in enumerate(doc_topics):
    for topic_id, weight in topic_weights:
        topic_matrix[doc_idx, topic_id] = weight

# Rows follow the order of `doc_topics`; columns are the topic ids 0..n_topics-1.
listing_lda_topic = pd.DataFrame(topic_matrix, columns=range(n_topics))


保存lda模型后每个房源的主题分布及占比

In [None]:
# Write the per-listing topic distribution to CSV (row index included).
listing_lda_topic.to_csv(path_or_buf=os.path.join("Data", "listing_lda_topic.csv"))