In [5]:
import pickle
import re

import pandas as pd
#from deeppavlov.models.tokenizers.ru_tokenizer import RussianTokenizer
#from deeppavlov.models.vectorizers.tfidf_vectorizer import TfIdfVectorizer
from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier

from app.models import ML_Model

# Russian stop-word list from NLTK; handed to the tokenizer in
# DataModel.train_model.
STOPWORDS = stopwords.words('russian')

ModuleNotFoundError: No module named 'app'

In [2]:
class DataModel:
    """Train, persist and apply the news-importance classifier.

    ``self.model`` is either ``None`` or the dict built by ``train_model``
    with the keys ``'porog'`` (decision threshold), ``'tokenizer'``,
    ``'vectorizer'`` and ``'model'``.
    """

    # Matches every character that is neither a word character nor whitespace.
    _PUNCT_RE = re.compile(r'[^\w\s]')

    def __init__(self):
        self.model = None
        self.app = None

    def init_app(self, app):
        """Remember *app* and try to load a previously stored model.

        Loading is best effort: an ``AttributeError`` from ``load_from_db``
        (raised, among other cases, when no model row exists yet) is
        swallowed and ``self.model`` keeps its current value.
        """
        self.app = app

        try:
            self.load_from_db()
        except AttributeError:
            # Deliberate: the application must still start without a model.
            pass

    def load_from_db(self):
        """Replace ``self.model`` with the unpickled first ``ML_Model`` row.

        Raises:
            AttributeError: if there is no stored row (same exception type
                the implicit ``None.data`` access used to produce).
        """
        record = ML_Model.objects.first()
        if record is None:
            raise AttributeError('no stored model found in the database')

        # NOTE(review): pickle.loads runs arbitrary code from the payload;
        # this is only safe while the database content is fully trusted.
        self.model = pickle.loads(record.data)

    def save_to_db(self):
        """Pickle ``self.model`` into the first ``ML_Model`` row.

        A new row is created when none exists. The payload is built before
        the database is touched, and the row is saved only when every
        previous step succeeded.
        """
        payload = pickle.dumps(self.model)

        record = ML_Model.objects.first()
        if record is None:
            record = ML_Model(data=payload)
        else:
            record.data = payload

        record.save()

    def train_model(self, path):
        """Fit the classifier on the Excel file at *path*.

        The sheet must provide the columns 'заголовок', 'текст' and
        'важность'. The result is stored in ``self.model``; nothing is
        returned and nothing is written to the database.
        """
        train = pd.read_excel(path)
        # A missing label is treated as class 0; every other missing cell
        # becomes the literal string 'None'.
        train['важность'] = train['важность'].fillna(0)
        train.fillna('None', inplace=True)

        x = train['заголовок'] + ' ' + train['текст']

        # NOTE(review): the imports of RussianTokenizer and TfIdfVectorizer
        # are commented out at the top of the file, so these names must be
        # provided some other way before this method can run.
        tokenizer = RussianTokenizer(lemmas=True, stopwords=STOPWORDS)
        vectorizer = TfIdfVectorizer(mode='train')
        vectorizer.fit(tokenizer(x))

        x_train = vectorizer(tokenizer(x))
        y_train = train['важность']

        # NOTE(review): recent scikit-learn releases renamed loss='log' to
        # 'log_loss' — confirm against the pinned version.
        model = SGDClassifier(alpha=10 ** (-4), loss='log', penalty='l2',
                              max_iter=180, n_jobs=-1, random_state=42)

        model.fit(x_train, y_train)

        self.model = {'porog': 0.29,
                      'tokenizer': tokenizer,
                      'vectorizer': vectorizer,
                      'model': model}

    def predict(self, excel_data, companies, personalities):
        """Describe every article of *excel_data* and flag the important ones.

        Only articles mentioning at least one keyword from *companies* or
        *personalities* are scored by the classifier; all others get
        ``is_important = False``.

        Returns:
            A list with one dict per article, in sheet order, with the keys
            'title', 'date', 'text', 'source', 'link', 'is_important'
            (always a plain ``bool``), 'personalities' and 'organizations'.
        """
        people, company, good_index, news = self._get_good_news(
            excel_data, companies, personalities)

        # Read the bundle up front so a missing model fails the same way
        # whether or not any article matched.
        porog = self.model['porog']
        tokenizer = self.model['tokenizer']
        vectorizer = self.model['vectorizer']
        model = self.model['model']

        important = {}
        if good_index:
            # The model is never called with zero samples.
            x = self._combine_text(news.loc[good_index])
            x = vectorizer(tokenizer(x.values))
            probabilities = model.predict_proba(x)[:, 1]
            # bool() turns numpy booleans into plain Python ones.
            important = {idx: bool(p > porog)
                         for idx, p in zip(good_index, probabilities)}

        result = []
        for i in news.index:
            result.append({'title': news.loc[i, 'Заголовок'],
                           'date': news.loc[i, "Дата"],
                           'text': news.loc[i, 'Полный текст'],
                           'source': news.loc[i, 'Площадка'],
                           'link': news.loc[i, 'URL поста'],
                           # Dict lookup instead of a list membership test.
                           'is_important': important.get(i, False),
                           'personalities': people[i],
                           'organizations': company[i]
                           })

        return result

    def _get_good_news(self, excel_data, companies, personalities):
        """Load the 'ReportResultNew' sheet and find keyword mentions.

        Returns:
            ``(found_people, found_companies, good_index, news)``: two
            Series of matched-keyword lists aligned with ``news.index``,
            the list of index labels with at least one match, and the
            loaded DataFrame.
        """
        news = pd.read_excel(excel_data, sheet_name='ReportResultNew')

        all_text = self._combine_text(news)

        # Normalise each keyword list once instead of once per article.
        person_keys = self._normalize_keys(personalities)
        company_keys = self._normalize_keys(companies)

        found_people = all_text.apply(self._match_keys, args=(person_keys,))
        found_companies = all_text.apply(self._match_keys,
                                         args=(company_keys,))

        good_index = [i for i in news.index
                      if found_people[i] or found_companies[i]]

        return found_people, found_companies, good_index, news

    def _combine_text(self, news):
        """Return normalised 'Заголовок' + ' ' + 'Полный текст' per row.

        Missing cells are replaced by an empty string first; otherwise one
        NaN would turn the whole concatenation into NaN and the text of the
        other column would be lost.
        """
        title = news['Заголовок'].fillna('').astype(str)
        body = news['Полный текст'].fillna('').astype(str)
        return (title + ' ' + body).apply(self._remove_punct)

    def _remove_punct(self, message):
        """Return ``str(message)`` lower-cased, punctuation turned to spaces."""
        return self._PUNCT_RE.sub(' ', str(message)).lower()

    def _normalize_keys(self, keywords):
        """Pair each keyword with its normalised search form.

        Keywords that are blank after normalisation are dropped: they would
        otherwise match (almost) every text.
        """
        pairs = []
        for key in keywords:
            needle = self._remove_punct(key)
            if needle.strip():
                pairs.append((key, needle))
        return pairs

    def _match_keys(self, text, pairs):
        """Return the original keywords whose search form occurs in *text*."""
        return [key for key, needle in pairs if needle in text]

    def _find_key(self, text, keywords):
        """Return the *keywords* that occur in the already normalised *text*."""
        return self._match_keys(text, self._normalize_keys(keywords))

In [3]:
class ModelDoesNotTrain(Exception):
    """Exception carrying a single, mandatory message.

    NOTE(review): presumably signals that no trained model is available;
    it is not raised anywhere in the visible code — confirm with callers.
    """

    def __init__(self, message):
        # The explicit constructor keeps ``message`` a required argument.
        super(ModelDoesNotTrain, self).__init__(message)