In [1]:
import pandas as pd
from tqdm import tqdm,tgrange
import os
import gensim
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
TaggededDocument = gensim.models.doc2vec.TaggedDocument

unable to import 'smart_open.gcs', disabling that module


In [2]:
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 预训练模型

## 准备预训练语料：G-06-F-17类

In [24]:
corpus_filename = "/home/hxjiang/Pythonworkspace/patent/sample20000/after_process.xlsx"
corpus = pd.read_excel(corpus_filename, encoding='utf-8')

In [25]:
corpus.head(1)

Unnamed: 0,abstract,application_id,claims,location,title,claims_add1,abstract_sen_count,claims_sen_count,abstract_final,abstract_count,claims_final,claims_count
0,"['The', 'present', 'invention', 'provides', 'a...",13027052,"['1', '.', 'A', 'hand-utility', 'interface', '...",2012/ipa120105/US20120000001A1.xml,HAND UTILITY INTERFACE,,2,29,"['The', 'present', 'invention', 'provide', 'in...",34,"['A', 'interface', 'use', 'protect', 'users', ...",383


In [26]:
print("number of corpus："+str(len(corpus)))

number of corpus：20000


## 分词等清洗工作

In [None]:
corpus['description'].dropna(inplace=True)
# corpus['description'] = [entry.lower() for entry in corpus['description']]

In [None]:
corpus['description'] = [ word_tokenize(entry) for entry in corpus['description'] ]

In [None]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV


for index, entry in enumerate(tqdm(corpus['description'], ncols=60)):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]]) # 对这三类做词形还原
            Final_words.append(word_Final)
    corpus.loc[index,'description_final'] = str(Final_words)

## 将语料整理成规定的形式

In [27]:
x_train = []

for i, text in enumerate(tqdm(corpus['claims_final'], ncols=60)):
    document = TaggededDocument(text, tags=[i])
    x_train.append(document)

100%|█████████████| 20000/20000 [00:00<00:00, 443028.30it/s]


In [28]:
len(x_train)

20000

## 模型训练

In [29]:
def train(x_train):
    doc_model = Doc2Vec(x_train, min_count=1, window=10, vector_size=72, sample=1e-3, negative=5, workers=2)
    doc_model.train(x_train, total_examples=doc_model.corpus_count, epochs=50) # corpus_count是文件个数，epochs训练次数
    return doc_model

In [30]:
doc_model = train(x_train)

In [31]:
doc_model.save("claims.model")

# 测试模型效果

In [32]:
# 计算余弦相似度
def cosine_similarity(vector1, vector2):
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return 0
    else:
        return round(dot_product / ((normA**0.5)*(normB**0.5)) * 100, 2)

In [33]:
# 5.得到两个句子的向量后使用余弦相似度
text1 = "One embodiment of the present invention provides a system for facilitating social networking based on fashion-related information. During operation, the system receives fashion-related information from a user. Next, the system extracts the user\\\'s fashion preferences from the received information and compares the user\\\'s fashion preference with other users\\\' fashion preferences. Finally, the system groups users based on similarity of their fashion preferences."
text2 = "novel multimodal learning setting of company earnings conference call"
text1 = text1.split()
text2 = text2.split()

In [34]:
doc_model = Doc2Vec.load("claims.model")
text1_inferred_vector = doc_model.infer_vector(text1)
doc_model = Doc2Vec.load("claims.model")
text2_inferred_vector = doc_model.infer_vector(text2)
cos = cosine_similarity(text1_inferred_vector, text2_inferred_vector)

In [35]:
cos

9.3

# 得到文档向量

In [36]:
corpus.head(1)

Unnamed: 0,abstract,application_id,claims,location,title,claims_add1,abstract_sen_count,claims_sen_count,abstract_final,abstract_count,claims_final,claims_count
0,"['The', 'present', 'invention', 'provides', 'a...",13027052,"['1', '.', 'A', 'hand-utility', 'interface', '...",2012/ipa120105/US20120000001A1.xml,HAND UTILITY INTERFACE,,2,29,"['The', 'present', 'invention', 'provide', 'in...",34,"['A', 'interface', 'use', 'protect', 'users', ...",383


In [37]:
print("number of corpus："+str(len(corpus['abstract'])))

number of corpus：20000


## abstract

In [20]:
corpus['abstract'].dropna(inplace=True)
# load_data['abstract'] = [entry.lower() for entry in load_data['abstract']]
corpus['abstract'] = [word_tokenize(entry) for entry in corpus['abstract']]

In [None]:
embedding_file = "/home/hxjiang/Pythonworkspace/patent/sample20000/abstract_doc2vec.csv"

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index, entry in enumerate(tqdm(corpus['abstract'], ncols=60)):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    doc_model = Doc2Vec.load("abstract.model")
    text_inferred_vector = doc_model.infer_vector(Final_words)
    text_inferred_vector = text_inferred_vector * 100
    vector_df = pd.DataFrame([text_inferred_vector])
    if os.path.exists(embedding_file):
        vector_df.to_csv(embedding_file, header=0, mode='a', index=False, sep=',')
    else:
        vector_df.to_csv(embedding_file, mode='a', index=False, sep=',')

 98%|████████████████▋| 19623/20000 [45:01<01:05,  5.74it/s]

## claims

In [38]:
corpus['claims'].dropna(inplace=True)
# load_data['claims'] = [entry.lower() for entry in load_data['claims']]
corpus['claims'] = [word_tokenize(entry) for entry in corpus['claims']]

In [41]:
embedding_file = "/home/hxjiang/Pythonworkspace/patent/sample20000/claims_doc2vec.csv"

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index, entry in enumerate(tqdm(corpus['claims'], ncols=60)):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    doc_model = Doc2Vec.load("claims.model")
    text_inferred_vector = doc_model.infer_vector(Final_words)
    text_inferred_vector = text_inferred_vector * 100
    vector_df = pd.DataFrame([text_inferred_vector])
    if os.path.exists(embedding_file):
        vector_df.to_csv(embedding_file, header=0, mode='a', index=False, sep=',')
    else:
        vector_df.to_csv(embedding_file, mode='a', index=False, sep=',')

100%|███████████████| 20000/20000 [4:06:02<00:00,  1.35it/s]
