In [1]:
import pymorphy2
from stop_words import get_stop_words

In [2]:
# Hard-coded sample response as JSON text: under "categorizedBooks", book ids
# (string keys) map to a label string; several labels are empty strings.
answer = """
    {
      "categorizedBooks": {
        "68": "not_interested",
        "138": "not_interested",
        "185": "",
        "263": "interested",
        "287": "interested",
        "341": "",
        "401": "not_interested",
        "414": "interested",
        "528": "interested",
        "622": "interested",
        "686": "",
        "929": "not_interested"
      }
    }
"""

In [3]:
import json

# Parse the JSON text in `answer` into a dict; `a` is the response that is
# later passed to fit_by_response.
a = json.loads(answer)

In [4]:
# Load the book records from kek.json into the module-level `books`, which
# find_book_by_id reads as a global.
with open("kek.json", "r", encoding="utf-8") as file:
    books = json.load(file)

In [5]:
def delete_chars(text):
    """Return ``str(text)`` with punctuation marks and digits stripped out.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Guillemets, the ellipsis, dashes, brackets, ``#``, common
    punctuation, slashes, the double quote and the digits 0-9 are removed;
    letters and whitespace are kept as they are.
    """
    unwanted = "«»…–()[]#,.1234567890:+-/\\!\"?"
    removal_table = str.maketrans("", "", unwanted)
    return str(text).translate(removal_table)


_TEXT_TOOLS = {}


def _get_text_tools():
    """Build the morphological analyzer and stop-word set once and reuse them."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS["morph"] = pymorphy2.MorphAnalyzer()
        # Stop words are only used for membership tests, so keep them in a
        # frozenset for constant-time lookups.
        _TEXT_TOOLS["stop_words"] = frozenset(get_stop_words('ru'))
    return _TEXT_TOOLS["morph"], _TEXT_TOOLS["stop_words"]


def clean_texts(reviews):
    """Normalize texts: strip punctuation/digits, lemmatize, drop stop words.

    Args:
        reviews: iterable of texts; each one is passed through
            ``delete_chars``, which converts it with ``str()``.

    Returns:
        A list with one string per input text. Each string consists of the
        kept normal forms, every one followed by a single space (so a
        non-empty result ends with a trailing space); it is an empty string
        when no word survives.
    """
    # The analyzer and stop words are created on first use and then shared,
    # instead of being rebuilt on every call.
    morph, stop_words = _get_text_tools()
    reviews_clear = []

    for text in reviews:
        kept_forms = []
        for word in delete_chars(text).split():
            # Use the first parse the analyzer returns for the word.
            normal_form = morph.parse(word)[0].normal_form
            if normal_form not in stop_words:
                kept_forms.append(normal_form)

        # Preserve the original output format: each word is followed by " ".
        reviews_clear.append("".join(form + " " for form in kept_forms))

    return reviews_clear

In [6]:
def find_book_by_id(book_id):
    """Look up a record in the module-level ``books`` by its ``"id"`` value.

    Returns the first entry whose ``"id"`` equals ``book_id``, or ``None``
    when no entry matches.
    """
    matches = (candidate for candidate in books if candidate["id"] == book_id)
    return next(matches, None)

In [7]:
def get_prob_by_attr(clf, book, attr):
    """Query the classifier using a single text field of a book.

    ``book[attr]`` is cleaned with ``clean_texts`` as a one-element batch and
    the result is handed to ``clf.predict`` with ``prob=True``; whatever
    ``predict`` returns is passed back unchanged.
    """
    cleaned_batch = clean_texts([book[attr]])
    return clf.predict(cleaned_batch, prob=True)

In [8]:
def sum_two_list(arr1, arr2):
    """Add the items of ``arr2`` onto ``arr1`` position by position, in place.

    ``arr1`` itself is modified and returned. Positions of ``arr1`` beyond the
    length of ``arr2`` are left untouched; if ``arr2`` is longer than
    ``arr1`` the indexing raises ``IndexError``.
    """
    position = 0
    for addend in arr2:
        arr1[position] += addend
        position += 1
    return arr1

In [9]:
def predict_by_id(clf, book_id):
    """Predict the label for the book with the given id.

    The scores obtained separately from the book's ``"name"`` and
    ``"description"`` are added position by position, and the entry of
    ``clf.labels`` at the position of the largest sum is returned.

    Args:
        clf: classifier exposing ``labels`` and ``predict(..., prob=True)``.
            Assumes ``predict`` returns a list aligned with ``clf.labels``
            (TODO confirm against the NaiveBayes module).
        book_id: id to look up via ``find_book_by_id``.

    Returns:
        The label from ``clf.labels`` with the highest combined score.

    Raises:
        ValueError: if ``find_book_by_id`` finds no book with ``book_id``.
    """
    book = find_book_by_id(book_id)
    if book is None:
        # Fail with a clear message instead of a TypeError from indexing None.
        raise ValueError(f"no book with id {book_id!r}")

    probability_name = get_prob_by_attr(clf, book, "name")
    probability_description = get_prob_by_attr(clf, book, "description")

    # sum_two_list adds into (and returns) its first argument.
    probabilities = sum_two_list(probability_description, probability_name)

    return clf.labels[probabilities.index(max(probabilities))]

In [10]:
def fit_by_response(clf, response):
    """Train ``clf`` on the names and descriptions of the categorized books.

    Args:
        clf: classifier exposing ``fit(texts, labels)``.
        response: dict whose ``"categorizedBooks"`` entry maps book ids
            (strings convertible with ``int``) to label strings.

    Returns:
        The same ``clf`` object, after fitting.
    """
    categorized = response["categorizedBooks"]
    attrs = ["name", "description"]

    X_train = []
    y_train = []
    for attr in attrs:
        for raw_id, label in categorized.items():
            X_train.append(find_book_by_id(int(raw_id))[attr])
            # NOTE(review): labels are used as-is, including empty strings —
            # confirm that "" is meant to be a class of its own.
            y_train.append(label)

    # Clean and fit exactly once, after every sample has been collected.
    # Cleaning/fitting inside the attribute loop processed the name samples a
    # second time together with the descriptions.
    clf.fit(clean_texts(X_train), y_train)

    return clf

In [11]:
# Create the classifier instance that the following cells train and query.
import NaiveBayes
clf = NaiveBayes.NaiveBayesClassifier()

In [12]:
# Train on the parsed response `a`; fit_by_response returns the same classifier.
clf = fit_by_response(clf, a)

In [13]:
# Spot-check one prediction: id 401 is labelled "not_interested" in `answer`.
predict_by_id(clf, 401)

'not_interested'

In [14]:
# NOTE(review): scratch cell. It rebinds `a`, which previously held the parsed
# response given to fit_by_response; re-running the fit cell after this one
# would fail on the missing "categorizedBooks" key.
a = {1: "2", 2: "3"}

In [15]:
# Scratch check: show the dict's values as a list.
list(a.values())

['2', '3']

In [16]:
# Rebind the module-level `books` to the records from new_kek.json; from here
# on find_book_by_id searches this collection instead of the kek.json one.
with open("new_kek.json", "r", encoding="utf-8") as file:
    books = json.load(file)

In [19]:
# Print the predicted label for every loaded book. Iterating the records
# directly avoids searching `books` by id again for each entry.
for book in books:
    book_id = book["id"]
    print(f"Книга {book['name']} ( {book['author']}): {predict_by_id(clf, book_id)}")

Книга Хранитель забытых вещей ( Рут Хоган ): interested
Книга Пять поросят ( Агата Кристи ): interested
Книга Бэтмен. Человек, который смеется ( Э. Брубейкер ): 
Книга 1793. История одного убийства ( Никлас Натт-о-Даг ): interested
Книга Мальчик Мотл ( Шолом-Алейхем ): not_interested
Книга Все случилось на Джеллико-роуд ( Мелина Марчетта ): interested
Книга Волшебный корабль ( Робин Хобб ): 
Книга Ученица. Предать, чтобы обрести себя ( Вестовер Тара ): not_interested
Книга Один в океане ( Слава Курилов ): interested
Книга Хлеб по водам ( Ирвин Шоу ): 
Книга Ущелье дьявола ( Александр Дюма ): not_interested
Книга Шляпа Миттерана ( Лорен Антуан ): 


In [52]:
# Call the classifier's save method with the target name "1.json"
# (presumably persists the trained model — confirm in the NaiveBayes module).
# NOTE(review): this cell's execution count (52) is out of sequence with the
# preceding cell (19); verify the notebook reproduces on Restart & Run All.
clf.save("1.json")