# 初始化 poems & words

In [None]:
import numpy as np
import re
import math

# Load the poem corpus into `poems`: {author: [poem dict, ...]}.
# Every line of poems.txt is split on single spaces; the third field is a tag:
#   '-100' -> title line (starts a new poem)
#   '-10'  -> note line
#   '-1'   -> author line
#   k > 0  -> k-th content line of the current poem
# The text of a line is everything after the third field, joined without
# spaces, with "(...)" spans removed and surrounding whitespace stripped.
poems = {}
_PAREN_SPAN = re.compile(r'\([^)]*\)')


def _store_poem(poem):
    """Append `poem` to its author's list; poems with no content lines are dropped."""
    if poem['content']:
        poems.setdefault(poem['author'], []).append(poem)


# The encoding is explicit so the Chinese text is decoded the same way on every
# platform instead of depending on the locale default.
# NOTE(review): assumes poems.txt is UTF-8 encoded - confirm.
with open('poems.txt', 'r', encoding='utf-8') as f:
    poem = {'content': []}
    for raw in f:
        l = raw.split(' ')
        line = _PAREN_SPAN.sub('', ''.join(l[3:])).strip()
        tag = l[2]
        if tag == '-100':
            # A new title closes the poem collected so far.
            _store_poem(poem)
            poem = {'title': line, 'content': []}
        elif tag == '-10':
            assert 'title' in poem
            poem['note'] = line
        elif tag == '-1':
            assert 'title' in poem
            poem['author'] = line
        elif int(tag) > 0:
            assert 'title' in poem
            poem['content'].append(line)
            # Content tags must count up 1, 2, 3, ... within one poem.
            assert len(poem['content']) == int(tag)
    # The last poem has no following title line to flush it.
    _store_poem(poem)

# Flat list of every poem, stored next to the per-author lists under 'all'.
poems['all'] = [x for y in poems.values() for x in y]

# Candidate vocabulary: one word per line.
# NOTE(review): assumes wordlist.txt is UTF-8 encoded - confirm.
with open('wordlist.txt', 'r', encoding='utf-8') as f:
    words = [w.strip() for w in f]

# 计算 idf

In [None]:
# Document frequency: cnt[w] counts the poems in which vocabulary word w occurs
# at least once as a 1- to 3-character substring of a content line.
idf = {}
cnt = dict.fromkeys(words, 0)
total_poems = len(poems['all'])

for poem in poems['all']:
    # NOTE(review): range(len(line) - size) never yields the substring that ends
    # on the last character of the line - presumably to skip trailing
    # punctuation; confirm.
    grams = {
        line[start:start + size]
        for line in poem['content']
        for size in range(1, 4)
        for start in range(len(line) - size)
    }
    for gram in grams & cnt.keys():
        cnt[gram] += 1

# Only words found in more than 10 poems get an idf value:
# idf = log10(number of poems / (1 + document frequency)).
idf.update(
    (word, math.log10(total_poems / (1 + df)))
    for word, df in cnt.items()
    if df > 10
)

idf

# 计算 tf-idf 向量

In [None]:
# Build two TF-IDF matrices over the idf vocabulary and the full poem list:
#   tfidf_w[word_row, poem_col] - one row per word
#   tfidf_p[poem_row, word_col] - one row per poem
# idx_w maps each word to its row in tfidf_w (= its column in tfidf_p).
tfidf_w = np.zeros((len(idf), len(poems['all'])), dtype=np.float64)
tfidf_p = np.zeros((len(poems['all']), len(idf)), dtype=np.float64)
idx_w = {word: idx for idx, word in enumerate(idf)}
for idx_p, poem in enumerate(poems['all']):
    for l in poem['content']:
        # Vocabulary hits in this line, repeats included. The term frequency
        # is taken per line: each hit contributes 1/len(hits) * idf.
        # NOTE(review): range(len(l)-i) never yields the substring that ends on
        # the last character of the line - presumably to skip trailing
        # punctuation; confirm.
        hits = [
            l[j:j+i]
            for i in range(1, 4)
            for j in range(len(l)-i)
            if l[j:j+i] in idf
        ]
        for word in hits:
            weight = 1/len(hits)*idf[word]
            tfidf_w[idx_w[word]][idx_p] += weight
            tfidf_p[idx_p][idx_w[word]] += weight
# L2-normalise every row in place. A row that is entirely zero (for example a
# poem with no vocabulary hit) is left as zeros: dividing by its zero norm
# would fill it with NaN.
for matrix in (tfidf_w, tfidf_p):
    for v in matrix:
        norm = np.linalg.norm(v)
        if norm > 0:
            v /= norm

# 对词向量和诗向量降维（PCA）

In [None]:
from sklearn.decomposition import PCA

# Project the word matrix and the poem matrix onto 500 principal components
# each. Both fitted models stay available as model_w / model_p so that new
# vectors can be projected into the same reduced spaces later.
# NOTE(review): no random_state is passed; if scikit-learn selects a randomized
# SVD solver for these shapes the projections may differ between runs - confirm.
model_w = PCA(n_components=500).fit(tfidf_w)
tfidf_w = model_w.transform(tfidf_w)

model_p = PCA(n_components=500).fit(tfidf_p)
tfidf_p = model_p.transform(tfidf_p)

# 输出 tf-idf 信息到文本文件

In [8]:
# Write every word vector, then every poem vector, to tf-idf.txt as
# "<word or title>：<vector>" entries.
# The encoding is explicit because the labels are Chinese text: with the locale
# default the write can fail (or produce a non-portable file) on systems whose
# default encoding is not UTF-8.
with open('tf-idf.txt', 'w', encoding='utf-8') as f:
    for word, row in idx_w.items():
        f.write(word + '：' + str(tfidf_w[row]) + '\n')
    for idx_p, p in enumerate(poems['all']):
        f.write(p['title'] + '：' + str(tfidf_p[idx_p]) + '\n')

# 近义词查找

In [None]:
def getSynonyms(key):
    """Return the 20 vocabulary words whose vectors score highest against `key`.

    Every word in idx_w is scored by the inner product of its tfidf_w row with
    the row of `key`. The result is a list of (relative_score, word) pairs,
    best first, where relative_score is the inner product divided by the
    highest inner product found (normally that of `key` with itself).

    Raises KeyError if `key` is not in idx_w.
    """
    ranked = sorted(
        (
            (np.inner(tfidf_w[idx_w[key]], tfidf_w[row]), candidate)
            for candidate, row in idx_w.items()
        ),
        reverse=True,
    )
    best = ranked[0][0]
    return [(score / best, candidate) for score, candidate in ranked[:20]]

# Precompute the synonym list of every vocabulary word once, so later lookups
# are plain dictionary reads.
syn = {word: getSynonyms(word) for word in idx_w}

syn

# 关键词组匹配诗

In [None]:
# import networkx as nx

# a = []
# for poem in poems:
#     s = set()
#     for l in poem[2:]:
#         for i in range(1, 4):
#             for j in range(len(l)-i):
#                 if l[j:j+i] in tfidf_w:
#                     s.add(l[j:j+i])
#     G = nx.DiGraph()
#     for k in keywords:
#         G.add_edge('s', k+'k', capacity=1, weight=0)
#     for w in s:
#         G.add_edge(w, 't', capacity=1, weight=0)
#     for k in keywords:
#         for w in s:
#             G.add_edge(k+'k', w, capacity=1, weight=-int(np.inner(tfidf_w[k], tfidf_w[w])*10000))
#     flow = nx.max_flow_min_cost(G, 's', 't')
#     a.append((nx.cost_of_flow(G, flow), poem))

# a.sort()
# for p in a[:5]:
#     print(p)

def _filterPoems_tfidf(keywords, poems):
    if keywords==[]:
        return poems
    vec = np.zeros((len(idx_w),), dtype=np.float64)
    for word in idx_w:
        for k in keywords:
            v1, v2 = tfidf_w[idx_w[k]], tfidf_w[idx_w[word]]
            cos = np.inner(v1, v2)/np.linalg.norm(v1)/np.linalg.norm(v2)
            cos = min(1, max(cos, -1))
            vec[idx_w[word]] += np.power(np.pi-np.arccos(cos), 5)
    vec /= np.linalg.norm(vec)
    vec = model_p.transform([vec])[0]
    a = []
    for idx_p, p in enumerate(poems):
        a.append((np.inner(tfidf_p[idx_p], vec), p))
    a.sort(key=lambda x: -x[0])
    return [x[1] for x in a[:20]], keywords

def _filterPoems_2(keywords, poems):
    wordWeight = {}
    for k in keywords:
        for w, word in syn[k]:
            wordWeight[word] = wordWeight.get(word, 0)+w
    a = []
    for p in poems:
        w = 0
        for l in p['content']:
            for i in range(1, 4):
                for j in range(len(l)-i):
                    if l[j:j+i] in wordWeight:
                        w += wordWeight[l[j:j+i]]
        a.append((w, p))
    a.sort(key=lambda x: -x[0])
    return a[:20], list(wordWeight)

def _filterPoems(keywords, poems):
    if keywords==[]:
        return poems, []
    return _filterPoems_2(keywords, poems)

def filterPoems(x):
    """Run a whitespace-separated query string against the poem collection.

    If the first token is a key of `poems` (an author name, or 'all'), the
    remaining tokens are the keywords and only that key's poems are searched.
    Otherwise every token is a keyword and all poems are searched.

    Tokens are split on any run of whitespace, so an empty query or doubled
    spaces never yield an empty-string keyword; an empty query returns the
    unfiltered poem list.

    Returns:
        The result of _filterPoems for the selected poems and keywords.
    """
    tokens = x.split()
    if tokens and tokens[0] in poems:
        return _filterPoems(tokens[1:], poems[tokens[0]])
    return _filterPoems(tokens, poems['all'])

# Example query with three keywords joined by single spaces; in a notebook the
# returned value is displayed as the cell output.
filterPoems(' '.join(['柳', '春', '绿']))

In [None]:
# np.savez('tfidf', tfidf_p=tfidf_p, tfidf_w=tfidf_w)

In [None]:
# np.save('tfidf', tfidf_w)