## Import

In [1]:
import numpy as np
import pandas as pd
import pickle
import re

from sklearn.linear_model import SGDClassifier

from deeppavlov.models.vectorizers.tfidf_vectorizer import TfIdfVectorizer
from deeppavlov.models.tokenizers.ru_tokenizer import RussianTokenizer

from nltk.corpus import stopwords
STOPWORDS = stopwords.words('russian')

[nltk_data] Downloading package punkt to /home/whale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/whale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/whale/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/whale/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


## Preprocessing

In [2]:
def remove_punct(message):
    message = str(message)
    if len(message) == 0:
        message_wo_punct = message
    else:
        message_wo_punct = re.sub(r'[^\w\s]', ' ', message).lower()
    return message_wo_punct

In [3]:
def read_dataset(path):
    news = pd.read_excel(path, sheet_name='Новости')

    company = pd.read_excel(path, sheet_name='Компании', header=None).fillna('')
    people = pd.read_excel(path, sheet_name='Аффилированности', header=None).fillna('')

    all_company = np.concatenate(company.values)
    all_people = np.concatenate(people.values)

    all_people = [remove_punct(key) for key in all_people if key != '']
    all_company = [remove_punct(key) for key in all_company if key != '']

    return news, all_people, all_company

In [4]:
def find_key(text, keywords):
    result = []
    for key in keywords:
        if text.find(key) > -1:
            result.append(key)
    return result

In [5]:
def capital_letter(text_list):
    result = []
    for text in text_list:
        result.append([x.title() for x in text])
    return result

In [6]:
def get_good_news(path):
    news, people, company = read_dataset(path)

    all_text = (news['заголовок'] + ' ' + news['текст']).apply(remove_punct)

    people = all_text.apply(find_key, args=(people,))
    company = all_text.apply(find_key, args=(company,))

    people = capital_letter(people)
    company = capital_letter(company)

    good_index = []
    for i in news.index:
        if any([people[i], company[i]]):
            good_index.append(i)

    return people, company, good_index, news

## Train function

In [7]:
def train_model(path):
    train = pd.read_excel(path)
    train['важность'] = train['важность'].fillna(0)
    train.fillna('None', inplace=True)

    x = train['заголовок'] + ' ' + train['текст']

    tokenizer = RussianTokenizer(lemmas=True, stopwords=STOPWORDS)
    vectorizer = TfIdfVectorizer(mode='train')
    vectorizer.fit(tokenizer(x))

    x_train = vectorizer(tokenizer(x))
    y_train = train['важность']

    model = SGDClassifier(alpha=10 ** (-4), loss='log', penalty='l2',
                          max_iter=180, n_jobs=-1, random_state=42)

    model.fit(x_train, y_train)

    dict_model = {'porog': 0.25,
                  'tokenizer': tokenizer,
                  'vectorizer': vectorizer,
                  'model': model}

    filename = 'alfa_model.pickle'
    pickle.dump(dict_model, open(filename, 'wb'))
    return True

## Predict function

In [8]:
def predict_model(path):
    people, company, good_index, news = get_good_news(path)

    model_name = 'alfa_model.pickle'
    dict_model = pickle.load(open(model_name, 'rb'))

    porog = dict_model['porog']
    tokenizer = dict_model['tokenizer']
    vectorizer = dict_model['vectorizer']
    model = dict_model['model']

    x = (news.loc[good_index, 'заголовок'] + ' ' + news.loc[good_index, 'текст']).apply(remove_punct)
    x = vectorizer(tokenizer(x.values))

    predict = model.predict_proba(x)[:, 1] > porog
    predict = dict(zip(good_index, predict))

    result = []
    for i in news.index:

        answer = {'title': news.loc[i, 'заголовок'],
                  'date': news.loc[i, 'дата'],
                  'text': news.loc[i, 'текст'],
                  'is_important': False,
                  'personalities': people[i],
                  'organizations': company[i]
                  }

        if i in good_index:
            answer['is_important'] = predict[i]

        result.append(answer)
    return result

In [9]:
train_model('train.xlsx')

2019-01-13 18:56:32.914 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 16: Loading dictionaries from /home/whale/anaconda3/lib/python3.6/site-packages/pymorphy2_dicts/data
2019-01-13 18:56:32.947 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 20: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


True

In [10]:
pred = predict_model('test.xlsx')

2019-01-13 18:56:40.87 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 16: Loading dictionaries from /home/whale/anaconda3/lib/python3.6/site-packages/pymorphy2_dicts/data
2019-01-13 18:56:40.118 INFO in 'pymorphy2.opencorpora_dict.wrapper'['wrapper'] at line 20: format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


In [11]:
prediction = [x['is_important'] for x in pred]

In [12]:
y_true = pd.read_excel('test.xlsx')['важность'].values

In [13]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

## Score

In [14]:
print('accuracy_score = {}'.format(accuracy_score(y_true, prediction)))
print('recall_score = {}'.format(recall_score(y_true, prediction)))
print('precision_score = {}'.format(precision_score(y_true, prediction)))
print('f1_score_weighted = {}'.format(f1_score(y_true, prediction, 'weighted')))

accuracy_score = 0.8215297450424929
recall_score = 0.9485294117647058
precision_score = 0.6972972972972973
f1_score_weighted = 0.8037383177570093
