In [26]:
from flask import Flask, request, jsonify
import pandas as pd
import numpy as np
import annoy
import nltk
import re
from gensim.models import word2vec
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

app = Flask(__name__)
def convert(abstract):
    '''Преобразует текст для w2v'''
    stops = set(stopwords.words("english"))
    text = re.sub("[^a-zA-Z.!?]"," ", abstract)
    words = text.lower().split()
    words = [w for w in words if not w in stops if len(w) >= 2]
    words = [PorterStemmer().stem(w) for w in words ] # lemmitization
    return ' '.join(words)

def tok_abstr(abstr, tokenizer):
    '''Разделяет абстракт на отдельные предложения'''
    return [s.replace('.', '') for s in tokenizer.tokenize(abstr.strip())]

def avg_single_sentence(snt, words, model, size):
    '''Возвращает усредненный вектор для абстракта'''
    vector = np.zeros(size)
    counter = 0
    for w in snt:
        if w in words:
            vector += model[w]
            counter += 1
    return vector / counter

def init_sim_arctiles(filename):
    ex = pd.read_csv(filename, sep='@')
    ex = ex.dropna()
    vect_len = 400
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    ex['cl_abst'] = ex.Abstract.apply(convert)  # Подготовка данных для w2v
    ex['new'] = ex.cl_abst.apply(lambda x : tok_abstr(x, tokenizer)) # будет нужно для создания list of lists для model
    sentences  = [sent.replace('.', '').split() for row in ex.cl_abst for sent in tok_abstr(row, tokenizer) if len(sent) >= 1]
    model = word2vec.Word2Vec(sentences, size=vect_len, window=50, workers=4)
    words = set(model.wv.index2word)
    vectors = [avg_single_sentence(x, words, model, vect_len) for x in ex.new]
    return (vect_len, words, model, vectors, ex)

def add_art(df, num):
    art = {}
    for i in num:
        art[df.Title.iloc[i]] = df.Link.iloc[i]
    return art

@app.route("/get_similar_articles/<string>", methods=['GET', 'POST'])
def return_art(string):
    vect_len, words, model, vectors, df = init_sim_arctiles('data')
    obj = annoy.AnnoyIndex(vect_len)
    for i, v in enumerate(vectors):
        obj.add_item(i, v)
    obj.build(20)
    try:
        w = convert(string)
        vec = avg_single_sentence(w.split(), words, model, vect_len)
        num, d = obj.get_nns_by_vector(vec, 5, include_distances=True)
        print('num', num)
        print(d)
        guess = add_art(df, num)
    except: 
        guess = "Nothing was found"
    return jsonify(guess)

@app.route("/")
def hello():
    return 'Hi there. Try ~/get_sudgest/<string> or /get_similar_articles/<string>'
    
if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [26/Nov/2018 02:57:57] "GET /get_similar_articles/Dark%20MAtter%20Field%20Gravity%20repulsion HTTP/1.1" 200 -


num [7687, 2230, 2439, 2390, 5926]
[1.0177284479141235, 1.1211655139923096, 1.1211655139923096, 1.1211655139923096, 1.1557844877243042]
