In [808]:
import os
import jieba
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from collections import Counter
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy.linalg import norm
from sklearn.feature_extraction.text import TfidfVectorizer

In [809]:
data = pd.DataFrame(columns=['title', 'article', 'article_cut'])

In [810]:
file_name_train = 'F:\\NLPCC2015Eval-Task4-AllData\\sample data\\news.sentences'

In [811]:
file_name_test = 'F:\\NLPCC2015Eval-Task4-AllData\\TestDataWithReferenceSummaries\\news.sentences'

In [812]:
def data_prepare(file_name, data_list):
    for root, dirs, files in os.walk(file_name):
        for file in files:
            with open(os.path.join(file_name, file), 'r', encoding='utf-8') as f:
                lines = f.readlines()
                title = lines[0].strip('\n')
                article = []
                for number in range(len(lines[2:])):
                    article.append(lines[number+2].strip('\n'))
                row = {"title":title, "article":article}
                data_list = data_list.append(row, ignore_index=True)
    return data_list

In [813]:
data = data_prepare(file_name_train, data)
data = data_prepare(file_name_test, data)

In [814]:
def stopwordslist(filepath):
    #使用哈工大停用词词典
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

In [815]:
def seg_sentence(sentence):  # 输出分词后str,分成单个词存储在list中
    #使用jieba分词对句子进行切分
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('stopwords.txt')
    outstr = ''
    out_list = []
    for word in sentence_seged:
        if word not in stopwords:
            if word != " ":
                outstr += word
                outstr += " "
                out_list.append(word)
    return outstr, out_list

In [816]:
#计算句子与标题相似度
def tfidf_similarity(article_list, title):
    article_cut = []
    title, title_list = seg_sentence(title)
    for i in range(len(article_list)):
        article, article_list_all = seg_sentence(article_list[i])
        article_cut.append(article)
    # 转化为TF矩阵
    cv = TfidfVectorizer(tokenizer=lambda s: s.split())
    article_cut.append(title)
    vectors = cv.fit_transform(article_cut).toarray()
    similarity_list = []
    for num in range(len(article_cut)-1):
        similarity_list.append(np.dot(vectors[num], vectors[-1]) / (norm(vectors[num]) * norm(vectors[-1])))
    # 计算TF系数
    return similarity_list

In [817]:
a = tfidf_similarity(data['article'][0], data['title'][0])

In [818]:
def TF_count(article_list, title):
    # 构建词表，进行词频统计
    article_all = []   #所有词放入一个列表中
    article_all_part = [] # 每句话存放在一个list中
    title, title_list = seg_sentence(title)
    for i in range(len(article_list)):
        article, article_cut_list = seg_sentence(article_list[i])
        article_all_part.append(article_cut_list)
        article_all.extend(article_cut_list)
    article_all.extend(title_list)
    article_all_part.append(title_list)
    counter = Counter(article_all)
    # 每句话进行词频统计，获取词频得分
    TF_point_list = []
    TF_point = 0
    for number in range(len(article_list)):
        for word in article_all_part[number]:
            if word in counter.keys(): 
                 TF_point += counter[word]/len(article_all)
        TF_point_list.append(TF_point)
        TF_point = 0
    return TF_point_list

In [819]:
b = TF_count(data['article'][0], data['title'][0])

In [820]:
def Doc2vec_train(article_list, title, size=100, epoch_num=1):
    # 句子进行jieba分词，按单词存放在list中
    article_all_part = []
    # 对分词后的句子进行
    article_all_part_Tagg = []
    title, title_list = seg_sentence(title)
    for i in range(len(article_list)):
        article, article_cut_list = seg_sentence(article_list[i])
        article_all_part.append(article_cut_list)
        article = TaggedDocument(article_cut_list, tags=[i])
        article_all_part_Tagg.append(article)
    article_all_part.append(title_list)
    article_all_part_Tagg.append(TaggedDocument(title_list, tags=[len(article_list)]))
    print(article_all_part_Tagg)
    model = Doc2Vec(article_all_part_Tagg, min_count=1, window=5, size=size, sample=1e-3, negative=5, workers=4)    
    model.train(article_all_part_Tagg, total_examples=model.corpus_count, epochs=70)    
    inferred_vector_dm = model.infer_vector(article_all_part[-1]) 
    print(inferred_vector_dm)
    return None

In [821]:
d = Doc2vec_train(data['article'][0], data['title'][0])

[TaggedDocument(words=['当下', '现实', '语境', '中', '故乡', '语义', '丰富', '模糊', '词语'], tags=[0]), TaggedDocument(words=['一年一度', '春运', '大潮', '显然', '不仅仅', '是因为', '返乡', '动因', '使然'], tags=[1]), TaggedDocument(words=['全球化', '城市化', '进程', '中', '人', '都', '不能不', '面对', '故乡', '异乡', '流转', '变迁', '从未', '离开', '家园', '高速', '发展', '物质', '社会', '中', '不断', '蜕变'], tags=[2]), TaggedDocument(words=['吾', '城吾乡', '传统', '承袭', '故园', '情怀', '重新', '自我', '定位', '背后', '既有', '个人', '生命', '体验', '亦', '家国', '命运', '折射'], tags=[3]), TaggedDocument(words=['选择', '五座', '城乡', '八位', '摄影师', '两', '两', '对照', '持续', '数年', '拍摄', '影像', '阐释', '这一', '命题'], tags=[4]), TaggedDocument(words=['摄影', '一只', '手', '悄悄地', '现实', '精神', '世界', '中', '放下', '地标'], tags=[5]), TaggedDocument(words=['喧嚣', '骚动', '中', '人', '或许', '都', '某种意义', '上', '异乡人', '摄影', '能否', '成为', '微弱', '清晰', '声音', '说', '看', '吾', '城吾乡'], tags=[6]), TaggedDocument(words=['专题', '按语', '李楠', '深圳', '南山区', '南山村', '南园', '村', '北头', '村', '三村', '连成一片', '形成', '深圳', '最大', '城中村'], tags=[7]), TaggedDocument(words



[-0.00364307 -0.00871885 -0.05591159  0.00730042 -0.11123246  0.05568729
  0.06112423 -0.05820591 -0.02621388 -0.0331296  -0.09972227 -0.01307449
 -0.09393863 -0.02301978  0.03170907 -0.07388121  0.02834924 -0.03853439
 -0.1326794   0.00072193  0.01025284  0.03145637 -0.02512902 -0.01034301
  0.18336163 -0.07763074  0.03072446 -0.00255158 -0.16200973 -0.0154889
 -0.09335517  0.12684824  0.24654053  0.08668745 -0.0284239   0.11337172
 -0.12139861 -0.03166172  0.02999217 -0.05441087  0.03589702 -0.05319728
  0.00913267 -0.06964517 -0.05773924  0.05407128  0.07152052  0.07352702
 -0.0049653   0.07118604 -0.14689822  0.01514876 -0.12887043  0.09488627
  0.06232539  0.04903065 -0.0350323  -0.01939493 -0.07329221 -0.05021406
 -0.01335816  0.02182754  0.02879551  0.01953697  0.11973416  0.03450743
 -0.04341694  0.06235024 -0.1115652   0.06580329  0.13365054 -0.0864237
  0.0634345   0.11088933  0.00438426  0.11475363  0.00958979 -0.09217833
  0.02409441  0.09631451 -0.12011264 -0.01077791  0.0