#### 初始化

In [225]:
import pandas as pd

titles = pd.DataFrame(columns = ['content'])

titles = pd.DataFrame({'content':["c1: Human machine interface for Lab ABC computer applications",
                                  "c2: A survey of user opinion of computer system response time",
                                  "c3: The EPS user interface management system",
                                  "c4: System and human system engineering testing of EPS",
                                  "c5: Relation of user-perceived response time to error measurement",
                                  "m1: The generation of random, binary, unordered trees",
                                  "m2: The intersection graph of paths in trees",
                                  "m3: Graph minors IV: Widths of trees and well-quasi-ordering",
                                  "m4: Graph minors: A survey"
                                 ]})

def getIndex(text):
    return text.split(":")[0]

def getContent(text):
    return text[text.index(":")+1:]

titles['index'] = titles['content'].apply(getIndex)
titles['content'] = titles['content'].apply(getContent)

titles

Unnamed: 0,content,index
0,Human machine interface for Lab ABC computer ...,c1
1,A survey of user opinion of computer system r...,c2
2,The EPS user interface management system,c3
3,System and human system engineering testing o...,c4
4,Relation of user-perceived response time to e...,c5
5,"The generation of random, binary, unordered t...",m1
6,The intersection graph of paths in trees,m2
7,Graph minors IV: Widths of trees and well-qua...,m3
8,Graph minors: A survey,m4


In [226]:
import nltk
# import pandas as pd
# import numpy as np
import re
# import spacy
# from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk import PorterStemmer
from nltk.corpus import stopwords 
# nltk.set_proxy('http://10.144.1.10:8080')
# nltk.download('stopwords')
# nltk.download('punkt')

def default_clean(text):
    '''
    Removes default bad characters
    '''
    if not (pd.isnull(text)):
    # text = filter(lambda x: x in string.printable, text)
        bad_chars = set(["@", '-', '|', '<', '>', "+", '/', "'", '"', '\\','(',')', '\\n', '?', '#', ',','.', '[',']', '%', '$', '&', ';', '!', ';', ':',"*", "_", "=", "}", "{"])
        for char in bad_chars:
            text = text.replace(char, " ")
        text = re.sub('\d+', "", text)
#         print (text)
    return text
 
def stop_and_stem(text, stem=True, stemmer = PorterStemmer()):
    '''
    Removes stopwords and does stemming
    '''
#     print (text)
    stoplist = stopwords.words('english')
    if stem:
        text_stemmed = [stemmer.stem(word.lower()) for word in word_tokenize(text) if word.lower() not in stoplist]
    else:
        text_stemmed = [word.lower() for word in word_tokenize(text) if word.lower() not in stoplist]
    text = ' '.join(text_stemmed)
#     print (text)
    return text

In [227]:
titles['tmp'] = titles['content'].apply(default_clean)

In [228]:
titles['tmp'] = titles['tmp'].apply(stop_and_stem, stem=False)

In [229]:
print(titles)

                                             content index  \
0   Human machine interface for Lab ABC computer ...    c1   
1   A survey of user opinion of computer system r...    c2   
2           The EPS user interface management system    c3   
3   System and human system engineering testing o...    c4   
4   Relation of user-perceived response time to e...    c5   
5   The generation of random, binary, unordered t...    m1   
6           The intersection graph of paths in trees    m2   
7   Graph minors IV: Widths of trees and well-qua...    m3   
8                             Graph minors: A survey    m4   

                                                 tmp  
0  human machine interface lab abc computer appli...  
1  survey user opinion computer system response time  
2               eps user interface management system  
3        system human system engineering testing eps  
4  relation user perceived response time error me...  
5           generation random binary unordered tr

In [230]:
#词干化 ignore
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
st.stem('stemmed')     #=>'stem'
st.stem('stemming')    #=>'stem

'stem'

In [231]:
# 拆分单词
def tokenize(sentences):
    for sentence in sentences:
        yield (nltk.word_tokenize(sentence))
data = list(titles['tmp'])        
sentences = list(tokenize(data))

In [232]:
list(sentences)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [233]:
from scipy.sparse.csr import csr_matrix
 
docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]
# docs = sentences
indptr = [0]        # 存放的是行偏移量
indices = []        # 存放的是data中元素对应的列编号（列编号可重复）
data = []           # 存放的是非0数据元素
vocabulary = {}     # key是word词汇，value是列编号
for d in docs:      # 遍历每个文档
    for term in d:  # 遍历文档的每个词汇term
        # setdefault如果term不存在，则将新term和他的列
        # 编号len(vocabulary)加入到词典中，返回他的编号；
        # 如果term存在，则不填加，返回已存在的编号
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))
# csr_matrix可以将同一个词汇次数求和
csr_matrix((data, indices, indptr), dtype=int).toarray()

array([[2, 1, 0, 0],
       [0, 1, 1, 1]])

In [234]:
print (vocabulary)
print(indptr)
print(indices)
print(data)

{'hello': 0, 'world': 1, 'goodbye': 2, 'cruel': 3}
[0, 3, 6]
[0, 1, 0, 2, 3, 1]
[1, 1, 1, 1, 1, 1]


#### 构造词汇-文档矩阵 

In [235]:
#!/usr/bin/python

from numpy import zeros
from scipy.linalg import svd
from math import log    # needed for TFIDF
from numpy import asarray, sum


class LSA(object):
    def __init__(self):
        self.wdict = {}
        self.dcount = 0

    def parse(self, docs):
        for d in docs:
            for w in d:
                self.wdict.setdefault(w,[]).append(self.dcount)
#                 if w in self.wdict:
#                     self.wdict[w].append(self.dcount)
#                 else:
#                     self.wdict[w] = [self.dcount]
            self.dcount += 1
    
    
    def build(self):
        # rows -> keywords (occur more than twice), cols -> documentID
        # self.keys = [k for k in self.wdict.keys() if len(self.wdict[k]) > 1]
        self.keys = [k for k in self.wdict.keys()]
        self.keys.sort()
        self.A = zeros([len(self.keys), self.dcount])
        for i, k in enumerate(self.keys):
            for d in self.wdict[k]:
                self.A[i,d] += 1
    
    def calc(self):
        #u大小为(M,M)，s大小为(M,N)，v大小为(N,N)
        #sigma为了节约空间，只返回对角线非0值
        self.U, self.S, self.Vt = svd(self.A)
    
    def TFIDF(self):
        WordsPerDoc = sum(self.A, axis=0)
        print("words")
        print(WordsPerDoc)
        DocsPerWord = sum(asarray(self.A > 0, 'i'), axis=1)
        print("docs")
        print(DocsPerWord)
        rows, cols = self.A.shape
        for i in range(rows):
            for j in range(cols):
                self.A[i,j] = (self.A[i,j] / WordsPerDoc[j]) * log(float(cols) / (DocsPerWord[i]+1))
    
    def printA(self):
        print ('Here is the count matrix')
        print (self.A)
    
    def printSVD(self):
        print ('Here are the singular values')
        print (self.S)
        print ('Here are the first 3 columns of the U matrix')
        print (-1*self.U[:, 0:3])
        print ('Here are the first 3 rows of the Vt matrix')
        print (-1*self.Vt[0:3, :])

#A=U*S*Vt
#A 词*文档
#U 词*语义
#S 语义*主题
#Vt 主题*文档

In [236]:
mylsa =  LSA()

print(sentences)
mylsa.parse(sentences)
print(mylsa.wdict)
print(mylsa.dcount)
mylsa.build()
print(mylsa.keys)
mylsa.printA()
mylsa.TFIDF()
mylsa.printA()
mylsa.calc()
mylsa.printSVD()
print("A ", mylsa.A.shape)
print("U ", mylsa.U.shape)
print("S ", mylsa.S.shape)
print("Vt ", mylsa.Vt.shape)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'], ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'management', 'system'], ['system', 'human', 'system', 'engineering', 'testing', 'eps'], ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'], ['generation', 'random', 'binary', 'unordered', 'trees'], ['intersection', 'graph', 'paths', 'trees'], ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'survey']]
{'human': [0, 3], 'machine': [0], 'interface': [0, 2], 'lab': [0], 'abc': [0], 'computer': [0, 1], 'applications': [0], 'survey': [1, 8], 'user': [1, 2, 4], 'opinion': [1], 'system': [1, 2, 3, 3], 'response': [1, 4], 'time': [1, 4], 'eps': [2, 3], 'management': [2], 'engineering': [3], 'testing': [3], 'relation': [4], 'perceived': [4], 'error': [4], 'measurement': [4], 'generation': [5], 'random': [5], 'binary': [5], 'unordered': [5], 'tre

In [237]:
print(mylsa.S.shape[0])
#选取 K， 使得特征值保留 > 90%
def selectK(S):
    maxNum=S.shape[0]
    totalValue=sum(S)
    print(S)
    for k in range(1,maxNum+1):
        if(sum(S[0:k])/totalValue > 0.9):
            return k

K = selectK(mylsa.S)
K

9
[0.67597555 0.62425526 0.61096102 0.5539611  0.51722333 0.48935126
 0.42746009 0.4055805  0.33556226]


8

In [238]:
print(mylsa.U.shape)
Ureduce = mylsa.U[:,0:K]
print(Ureduce.shape)

(35, 35)
(35, 8)


#### 打印所有word-潜在的K个特征

In [239]:
print(mylsa.keys)

def printRow(text):
     print('%-10.7s' % text, end="")
#      print('%.1f' % text, end="    ")
    
printRow("")
for i in range(Ureduce.shape[1]):
    printRow(f'latent{i}')
print()

for row in range(Ureduce.shape[0]):
    printRow(mylsa.keys[row])
    for col in range(Ureduce.shape[1]):
        printRow(Ureduce[row,col])
    print()

['abc', 'applications', 'binary', 'computer', 'engineering', 'eps', 'error', 'generation', 'graph', 'human', 'interface', 'intersection', 'iv', 'lab', 'machine', 'management', 'measurement', 'minors', 'opinion', 'ordering', 'paths', 'perceived', 'quasi', 'random', 'relation', 'response', 'survey', 'system', 'testing', 'time', 'trees', 'unordered', 'user', 'well', 'widths']
          latent0   latent1   latent2   latent3   latent4   latent5   latent6   latent7   
abc       -0.0111   0.07028   -0.1085   0.03915   -0.0810   0.39132   0.02012   -0.0475   
applica   -0.0111   0.07028   -0.1085   0.03915   -0.0810   0.39132   0.02012   -0.0475   
binary    -0.1669   -0.3491   -0.2510   -0.1436   -0.0170   -0.0003   -0.0393   0.00263   
compute   -0.0415   0.11720   -0.1266   -0.0261   0.02752   0.30325   -0.0683   -0.0470   
enginee   -0.0196   0.13172   -0.2076   0.08283   -0.1220   -0.2045   -0.0112   -0.3745   
eps       -0.0322   0.20744   -0.3217   0.11085   -0.1082   -0.2391   0.03780 

#### 打印所有文章 在K维（基于隐义topic）D_kxd

In [240]:
D_kxd = mylsa.Vt[:K,:]

printRow("")
for i in range(titles['index'].size):
    printRow(titles['index'][i])
print()

for row in range(K):
    printRow(f'topic{row}')
    for col in range(D_kxd.shape[1]):
        printRow(D_kxd[row,col])
    print()
#后面查询就跟 D_kxd进行相似度计算

          c1        c2        c3        c4        c5        m1        m2        m3        m4        
topic0    -0.0351   -0.1437   -0.0551   -0.0530   -0.0479   -0.3750   -0.6235   -0.3196   -0.5815   
topic1    0.20420   0.26197   0.31601   0.32802   0.15708   -0.7246   -0.0675   0.07212   0.35021   
topic2    -0.3085   -0.1845   -0.4728   -0.5061   -0.1624   -0.5099   0.17526   0.11443   0.24663   
topic3    0.10093   -0.1933   0.12694   0.18306   -0.1635   -0.2644   0.72883   -0.1483   -0.5028   
topic4    -0.1950   0.28573   -0.0449   -0.2517   0.84645   -0.0293   0.18611   -0.1567   -0.1958   
topic5    0.89122   0.05431   -0.1997   -0.3992   0.04425   -0.0006   0.01750   -0.0291   -0.0178   
topic6    0.04004   -0.2262   0.08947   -0.0191   0.19343   -0.0559   -0.0936   0.88708   -0.3202   
topic7    -0.0898   -0.0318   0.77281   -0.6060   -0.1511   0.00355   0.00092   -0.0502   0.03213   


#### 查询

In [241]:
import numpy as np

#根据输入 构造1*topic 的 Dq = 1*K = Xq(1*wordnum) *  Ureduce（wordnum*k） *  S-1（K*K 的逆矩阵）
query="a system, user"
Xq = zeros([1, len(mylsa.keys)])

text=default_clean(query)
text=stop_and_stem(text, False)
qwords = nltk.word_tokenize(text)
print(qwords)

for i in range(len(mylsa.keys)):
    Xq[0,i] += qwords.count(mylsa.keys[i])

# print(Dq.transpose())
#  np.linalg.inv

# S=mylsa.S[0:K]
# print(S)

# rebuild Sigma
S=zeros([K, K]);
for i in range(K):
    S[i, i]=mylsa.S[i]

Dq = np.dot(np.dot(Xq, Ureduce), np.linalg.inv(S))

print(f'Dq = {Dq}')

['system', 'user']
Dq = [[-0.15556651  0.69303423 -0.94230915  0.08771285  0.30516352 -0.64736982
  -0.03370896  0.37675818]]


In [242]:
nums=cosine_similarity(Dq, D_kxd)
print(nums)

[-0.15774672  0.52233922  0.76282757  0.47834595  0.31702675  0.0056792
 -0.00145672 -0.07051911  0.02284328]


In [243]:
highIndexs = nums.argsort()[-3:][::-1]

print("your search: " + query)

# print(highIndexs)
for i in highIndexs:
    printRow(titles["index"][i])
    print(titles["content"][i])

your search: a system, user
c3         The EPS user interface management system
c2         A survey of user opinion of computer system response time
c4         System and human system engineering testing of EPS
