# similarity analysis

In [None]:
import jieba.posseg as pseg
import codecs
from gensim import corpora, models, similarities

### 构建停用词表

In [None]:
# Path of the stop-word list; every line of the file is one entry.
stop_words = '/Users/yiiyuanliu/Desktop/nlp/demo/stop_words.txt'
# Read inside a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with codecs.open(stop_words, 'r', encoding='utf8') as stop_file:
    # Strip the trailing newline (and any surrounding whitespace) of each line.
    stopwords = [line.strip() for line in stop_file.readlines()]

### 结巴分词后的停用词性 [标点符号、连词、助词、副词、介词、时间词、‘的’、数词、方位词、代词]

In [None]:
# jieba part-of-speech flags whose tokens are discarded during tokenization.
stop_flag = [
    'x',   # punctuation
    'c',   # conjunctions
    'u',   # auxiliary words / particles
    'd',   # adverbs
    'p',   # prepositions
    't',   # time words
    'uj',  # the particle '的'
    'm',   # numerals
    'f',   # locality / direction words
    'r',   # pronouns
]

### 对一篇文章分词、去停用词

In [None]:
def tokenization(filename):
    """Segment one article with jieba and drop stop words.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A list of the tokens, in document order, whose part-of-speech flag is
        not in ``stop_flag`` and whose text is not in ``stopwords``.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8, the same way the stop-word file is read, so
    # the result does not depend on the platform's default encoding.
    with codecs.open(filename, 'r', encoding='utf8') as f:
        text = f.read()

    # Build sets once per call: membership tests become O(1) instead of a
    # linear scan of the stop-word list for every token.
    excluded_flags = frozenset(stop_flag)
    excluded_words = frozenset(stopwords)

    return [
        word
        for word, flag in pseg.cut(text)
        if flag not in excluded_flags and word not in excluded_words
    ]

### 选取文章

In [None]:
# Articles that form the corpus. Judging by the file names, the first two are
# about blood pressure and the third is about iOS.
filenames = ['/Users/yiiyuanliu/Desktop/nlp/demo/articles/13 件小事帮您稳血压.txt',
             '/Users/yiiyuanliu/Desktop/nlp/demo/articles/高血压患者宜喝低脂奶.txt',
             '/Users/yiiyuanliu/Desktop/nlp/demo/articles/ios.txt'
            ]
# One token list per article, in the same order as `filenames`.
corpus = [tokenization(each) for each in filenames]
# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print(len(corpus))

In [None]:
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/1q/5404x10d3k76q2wqys68pzkh0000gn/T/jieba.cache
Loading model cost 0.349 seconds.
Prefix dict has been built succesfully.

### 建立词袋模型

In [None]:
# Assign an integer id to every distinct token found in the corpus.
dictionary = corpora.Dictionary(corpus)
# print() with a single argument works on both Python 2 and Python 3.
print(dictionary)

In [None]:
Dictionary(431 unique tokens: [u'\u627e\u51fa', u'\u804c\u4f4d', u'\u6253\u9f3e', u'\u4eba\u7fa4', u'\u996e\u54c1']...)

In [None]:
# Convert every tokenized article into its bag-of-words vector using the
# dictionary's token ids.
doc_vectors = [dictionary.doc2bow(text) for text in corpus]
# print() with a single argument works on both Python 2 and Python 3.
print(len(doc_vectors))
print(doc_vectors)

### model1_建立TF-IDF模型

In [None]:
# Fit a TF-IDF model on the bag-of-words vectors of the corpus ...
tfidf = models.TfidfModel(doc_vectors)
# ... and apply it to those same vectors to get the TF-IDF weighted corpus.
tfidf_vectors = tfidf[doc_vectors]

In [None]:
# Number of documents in the TF-IDF corpus, then the number of weighted
# entries in the first document's vector.
# print() with a single argument works on both Python 2 and Python 3.
print(len(tfidf_vectors))
print(len(tfidf_vectors[0]))

### 构建一个query文本，是高血压主题的，利用词袋模型的字典将其映射到向量空间

In [None]:
# Tokenize the query article with the same pipeline used for the corpus.
query = tokenization('/Users/yiiyuanliu/Desktop/nlp/demo/articles/关于降压药的五个问题.txt')

In [None]:
# Map the query tokens to a bag-of-words vector using the corpus dictionary.
query_bow = dictionary.doc2bow(query)

In [None]:
# Size of the query's bag-of-words vector, then the vector itself.
# print() with a single argument works on both Python 2 and Python 3.
print(len(query_bow))
print(query_bow)

In [None]:
# Build a dense similarity index over the TF-IDF document vectors.
# num_features is passed explicitly: when it is omitted gensim infers the
# width from the largest term id present in the indexed vectors, which can be
# smaller than the dictionary, and a query containing a higher id would then
# not fit into the index.
index = similarities.MatrixSimilarity(tfidf_vectors, num_features=len(dictionary))

### 用TF-IDF模型计算相似度

In [None]:
# The index holds TF-IDF weighted vectors, so the query must be weighted by
# the same TF-IDF model before it is compared; querying with raw term counts
# would compare vectors from two different spaces.
sims = index[tfidf[query_bow]]
# (document position, similarity) pairs, in corpus order.
# print() with a single argument works on both Python 2 and Python 3.
print(list(enumerate(sims)))

### model2_构建LSI模型，设置主题数为2

In [None]:
# Fit a 2-topic LSI model on the TF-IDF weighted corpus; id2word lets the
# model refer to tokens by their text instead of their integer ids.
lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=2)

In [None]:
# Show the two topics of the fitted LSI model.
lsi.print_topics(2)

In [None]:
# Project every TF-IDF document vector into the LSI topic space.
lsi_vector = lsi[tfidf_vectors]
for vec in lsi_vector:
    # print() with a single argument works on both Python 2 and Python 3.
    print(vec)

### 在LSI向量空间中，所有文本的向量都是二维的

In [None]:
# Rebuild the query's bag-of-words vector (same statements as the earlier
# query cells, repeated here before the LSI projection).
query = tokenization('/Users/yiiyuanliu/Desktop/nlp/demo/articles/关于降压药的五个问题.txt')
query_bow = dictionary.doc2bow(query)
# print() with a single argument works on both Python 2 and Python 3.
print(query_bow)

In [None]:
# The LSI model was trained on TF-IDF weighted vectors, so the query has to
# pass through the same TF-IDF weighting before being projected; projecting
# raw term counts would place it in the topic space on a different scale
# than the indexed documents.
query_lsi = lsi[tfidf[query_bow]]
# print() with a single argument works on both Python 2 and Python 3.
print(query_lsi)

In [None]:
# Index the LSI document vectors and compare the LSI query against them.
index = similarities.MatrixSimilarity(lsi_vector)
sims = index[query_lsi]
# (document position, similarity) pairs, in corpus order.
# print() with a single argument works on both Python 2 and Python 3.
print(list(enumerate(sims)))