In [77]:
from flask import Flask, request, jsonify
import pandas as pd
import numpy as np
import annoy
import nltk
import re
from gensim.models import word2vec
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

data = pd.read_csv('data', sep='@')
# df = pd.concat((data[data.Year == 2016], data[data.Year == 2015]), axis=0, ignore_index=True)
df = data.copy()
df = df.dropna()

In [78]:
def lem(text):
    return PorterStemmer().stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def convert(abstract):
    '''Преобразует текст для w2v'''
    text = re.sub("[^a-z.!?]"," ", abstract)
    words = text.lower().split()
    words = [lem(w.replace('.', '')) for w in words if not w in STOPWORDS and len(w) > 3]
    return ' '.join(words)

vec_len = 400
df['cl_abst'] = df.Abstract.apply(convert)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def tok_abstr(abstr, tokenizer):
    return [ s.replace('.', '') for s in tokenizer.tokenize(abstr.strip())]

In [79]:
sentences  = [sent.replace('.', '').split() for row in df.cl_abst for sent in tok_abstr(row, tokenizer) if len(sent) > 2]
model = word2vec.Word2Vec(sentences, workers=4, size = vec_len, window = 5)

In [90]:
def avg_single_sentence(snt, words, model, size):
    vector = np.zeros(size)
    counter = 1
    for w in snt:
        if w in words:
            vector += model[w]# * weight[w][0, 0] # Хранится как матрица 
            counter += 1
    return vector / counter

In [91]:
# words = set([i for i in model.wv.index2word if len(i) > 1])
words = set(model.wv.index2word)
df['new'] = df.cl_abst.apply(lambda x: x.replace('.', '').split())
df['vectors'] = df.new.apply(lambda x : avg_single_sentence(x, words, model, vec_len))

  


In [108]:
t = annoy.AnnoyIndex(vec_len)
for i, v in enumerate(df.vectors):
    t.add_item(i, v)
t.build(20)

test= convert('black hole')
test = test.replace('.', '').split()
vec = avg_single_sentence(test, words, model, vec_len)
idx, dist = t.get_nns_by_vector(vec, 30, include_distances=True)

  


In [92]:
_list = list(df.cl_abst.apply(lambda x : x.replace('.', '')))
vect = TfidfVectorizer(ngram_range=(0,1))
tfidf = vect.fit_transform(_list)

true_weigth = lambda x :tfidf[:, x].todense().max(axis=0) * np.log10(tfidf.shape[0] / tfidf[:, x].count_nonzero())
weight = {}
idx = 0
for w in vect.get_feature_names():
    weight[w] = true_weigth(idx)
    idx += 1
    print(idx / tfidf.shape[1], end='\r')

1.0999140967270853644

In [84]:
def avg_tfidf(snt, words, model, size, weight):
    vector = np.zeros(size)
    counter = 1
    for w in snt:
        if w in words:
            vector += model[w] * weight[w][0, 0] # Хранится как матрица 
            counter += 1
    return vector / counter

In [85]:
df['vectors_tfidf'] = df.new.apply(lambda x : avg_tfidf(x, words, model, vec_len, weight))

  


In [109]:
t = annoy.AnnoyIndex(vec_len)
for i, v in enumerate(df.vectors_tfidf):
    t.add_item(i, v)
t.build(20)

test= convert('black hole')
test = test.replace('.', '').split()
vec = avg_tfidf(test, words, model, vec_len, weight)
idx_tf, dist_tf = t.get_nns_by_vector(vec, 30, include_distances=True)

  


In [112]:
df.Link[858]

'https://elibrary.ru/item.asp?id=25163288'

In [93]:
def pprint(results, s):
    print('{} model'.format(s))
    print('indexes: {}\ndistance:{}\n'.format(results[0], np.round(results[1], 4)))

In [95]:
idx

11641

In [110]:
pprint([idx, dist], 'single')
pprint([idx_tf, dist_tf], 'TF-IDF')

single model
indexes: [7193, 7022, 1687, 2026, 455, 6174, 2888, 1826, 5041, 2762, 4811, 5498, 126, 0, 3780, 3596, 1793, 3278, 2571, 330, 1532, 264, 6888, 6, 3768, 1228, 1112, 2108, 1, 4652]
distance:[0.7887 0.7967 0.8117 0.8155 0.823  0.8252 0.8698 0.881  0.8874 0.8929
 0.9116 0.9122 0.9203 0.9321 0.9434 0.9539 0.9607 0.9625 0.9634 0.9654
 0.9957 0.9988 1.0076 1.0125 1.0131 1.0255 1.031  1.0328 1.0377 1.0385]

TF-IDF model
indexes: [7022, 6174, 455, 2026, 1687, 509, 2888, 1826, 2050, 3596, 4811, 126, 2571, 3780, 2762, 5041, 1526, 0, 1793, 330, 4652, 2108, 5498, 3768, 4517, 6, 8293, 1532, 1228, 858]
distance:[0.6533 0.7214 0.7318 0.7531 0.7726 0.7746 0.8104 0.8168 0.8236 0.8324
 0.8374 0.8406 0.8515 0.8566 0.8675 0.8687 0.8703 0.875  0.882  0.8822
 0.8928 0.9077 0.9084 0.9193 0.9218 0.9414 0.9482 0.9491 0.9513 0.9541]



In [88]:
pprint([idx_tf, dist_tf], 'TF-IDF')

TF-IDF model
indexes: [7022, 6174, 455, 2026, 1687, 509, 1826, 126, 2571, 3780]
distance:[0.6533 0.7214 0.7318 0.7531 0.7726 0.7746 0.8168 0.8406 0.8515 0.8566]



In [113]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [114]:
save_obj(weight, 'weight')