In [None]:
# %%writefile flask_similar_abstract.py

from flask import Flask, request, jsonify
import pandas as pd
import numpy as np
import annoy
import nltk
import re
from gensim.models import word2vec
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import pickle


# Flask application object; the @app.route decorators further down register
# their handlers on it and the __main__ guard runs it.
app = Flask(__name__)

def lem(text):
    """Lemmatize ``text`` as a verb with WordNet, then Porter-stem the lemma."""
    stemmer = PorterStemmer()
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def tok_abstr(abstr, tokenizer):
    """Split ``abstr`` into pieces with ``tokenizer`` and strip periods.

    ``abstr`` is stripped of surrounding whitespace and handed to
    ``tokenizer.tokenize``; every returned piece has all ``.`` characters
    removed.  The pieces are returned as a list, in tokenizer order.
    """
    cleaned = []
    for piece in tokenizer.tokenize(abstr.strip()):
        cleaned.append(piece.replace('.', ''))
    return cleaned

def convert(abstract):
    """Normalize raw text into a space-separated string of stemmed tokens for w2v.

    Steps:
      1. lower-case the text;
      2. replace every character other than ``a-z``, ``.``, ``!`` and ``?``
         with a space;
      3. split on whitespace and drop tokens that are gensim stop words or
         are 3 characters or shorter;
      4. strip periods from each remaining token and pass it through ``lem``.

    Lower-casing must happen *before* the character filter: the filter only
    keeps ``a-z``, so filtering first would blank out every capital letter
    (``"Dark"`` -> ``" ark"``) instead of keeping the word.

    NOTE(review): artifacts that were built from text normalized with the
    filter-then-lower order (e.g. a pickled weight table) may not match the
    vocabulary produced here and should be regenerated -- confirm.
    """
    text = re.sub(r"[^a-z.!?]", " ", abstract.lower())
    words = [
        lem(w.replace('.', ''))
        for w in text.split()
        if w not in STOPWORDS and len(w) > 3
    ]
    return ' '.join(words)

def load_obj(name):
    """Unpickle and return the object stored in the file ``name + '.pkl'``."""
    path = name + '.pkl'
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def avg_tfidf(snt, words, model, size, weight):
    """Return the weighted mean vector of the known tokens in ``snt``.

    For every token of ``snt`` that is contained in ``words``, the vector
    ``model[token]`` is scaled by ``weight[token][0, 0]`` and summed into a
    ``size``-long accumulator.  The sum is divided by the number of matched
    tokens plus one, so a sentence without any known token yields a zero
    vector rather than a division by zero.
    """
    known = [token for token in snt if token in words]
    total = np.zeros(size)
    for token in known:
        total += model[token] * weight[token][0, 0]
    return total / (len(known) + 1)

def init_sim_arctiles(filename):
    """Load the article table and build everything the similarity search needs.

    Reads ``filename`` as an ``@``-separated CSV, drops rows containing missing
    values, cleans the ``Abstract`` column with ``convert``, trains a Word2Vec
    model on the cleaned text and stores one weighted average vector per row
    (see ``avg_tfidf``) in a new ``vectors_tfidf`` column.

    Args:
        filename: path of the ``@``-separated CSV file to read.  The file must
            provide an ``Abstract`` column.

    Returns:
        A tuple ``(vec_len, words, model, df, weight)`` where ``vec_len`` is
        the embedding size (400), ``words`` is the set of words in the model
        vocabulary, ``model`` is the trained Word2Vec model, ``df`` is the
        DataFrame extended with the ``cl_abst``, ``new`` and ``vectors_tfidf``
        columns, and ``weight`` is the object loaded from ``weight.pkl``.
    """
    vec_len = 400
    # Read the file the caller asked for (the path used to be hard-coded).
    df = pd.read_csv(filename, sep='@')
    df = df.dropna()

    print('Preparing data...')
    df['cl_abst'] = df.Abstract.apply(convert)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = [
        sent.replace('.', '').split()
        for row in df.cl_abst
        for sent in tok_abstr(row, tokenizer)
        if len(sent) > 1
    ]
    model = word2vec.Word2Vec(sentences, workers=4, size=vec_len, window=5)
    words = set(model.wv.index2word)
    df['new'] = df.cl_abst.apply(lambda x: x.replace('.', '').split())

    print('Building a model...')
    # The per-word weights are loaded from 'weight.pkl' instead of being
    # recomputed at every start-up, so no TF-IDF matrix is fitted here.
    # Recipe that was kept alongside this code for regenerating the weights:
    #     vect = TfidfVectorizer(ngram_range=(0, 1))
    #     tfidf = vect.fit_transform(
    #         list(df.cl_abst.apply(lambda x: x.replace('.', ''))))
    #     weight = {}
    #     for idx, w in enumerate(vect.get_feature_names()):
    #         col = tfidf[:, idx]
    #         weight[w] = (col.todense().max(axis=0)
    #                      * np.log10(tfidf.shape[0] / col.count_nonzero()))
    weight = load_obj('weight')
    df['vectors_tfidf'] = df.new.apply(
        lambda x: avg_tfidf(x, words, model, vec_len, weight))
    return (vec_len, words, model, df, weight)

def add_art(df, num):
    """Map article titles to links for the row positions listed in ``num``.

    Each entry of ``num`` is used as a positional index into the ``Title`` and
    ``Link`` columns of ``df``.  If two positions share a title, the later
    one wins.
    """
    titles = df.Title
    links = df.Link
    return {titles.iloc[pos]: links.iloc[pos] for pos in num}

# Load the data and train the model once, when the module is loaded.
vec_len, words, model, df, weight = init_sim_arctiles('data')

# The article vectors do not change after start-up, so the Annoy index is
# built a single time here instead of being rebuilt on every request.
# 'angular' is Annoy's long-standing default metric; it is named explicitly
# because newer Annoy releases warn when the metric is omitted.
print('Making annoy tree...')
annoy_index = annoy.AnnoyIndex(vec_len, 'angular')
for i, v in enumerate(df.vectors_tfidf):
    annoy_index.add_item(i, v)
annoy_index.build(20)


@app.route("/get_similar_articles/<string>", methods=['GET', 'POST'])
def return_art(string):
    """Return up to five articles whose abstracts are closest to ``string``.

    The query is normalized with ``convert``, turned into a weighted average
    vector with ``avg_tfidf`` and looked up in the prebuilt Annoy index.

    Args:
        string: free-text query taken from the URL path.

    Returns:
        A JSON response holding a ``{title: link}`` mapping for the nearest
        articles, or the string ``"Nothing was found"`` when the query has no
        word known to the model or the lookup fails.
    """
    print('Trying return guesses...')
    try:
        tokens = convert(string).replace('.', '').split()
        if not any(token in words for token in tokens):
            # Without a single known word the query vector is all zeros and
            # gives no direction to compare against, so report no match.
            guess = "Nothing was found"
        else:
            vec = avg_tfidf(tokens, words, model, vec_len, weight)
            num = annoy_index.get_nns_by_vector(vec, 5)
            guess = add_art(df, num)
    except Exception:
        # Request boundary: keep answering with the fallback message, but
        # record the failure instead of swallowing it silently.
        app.logger.exception('Similar-article lookup failed for %r', string)
        guess = "Nothing was found"
    return jsonify(guess)

@app.route("/")
def hello():
    """Serve the landing page: a one-line hint pointing at the search route."""
    greeting = 'Hi there. Try ~ /get_similar_articles/<string>'
    return greeting
    
# Start Flask's built-in server with its default settings when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    app.run()    

Preparing data...
Building a model...




 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [28/Nov/2018 10:15:16] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2018 10:15:17] "GET /favicon.ico HTTP/1.1" 404 -


Making annoy tree...


127.0.0.1 - - [28/Nov/2018 10:15:28] "GET /get_similar_articles/dark%20matter HTTP/1.1" 200 -


Trying return guesses...
