In [1]:
from read import read
from db import words as get_words
from file import save, save_idfs
from preprocess import preprocess
from vectorize import tfidf_vectorizer, count_vectorizer, binary_vectorizer, vectorize, idf_vectorizer
from metrics import calc_metrics, cossine, jaccard, dice, euclide
from rocchio import rocchio
from db import words

In [2]:
import os
import ipywidgets as ws
import pandas as pd
from IPython.display import display

In [3]:
# Input controls for the search demo: a single-file uploader and a selector
# whose value is the vectorizer function handed to `vectorize`.
file_loader = ws.FileUpload(multiple=False)
drop = ws.Dropdown(
    description="Векторизатор:",
    options=[
        ("Бин.", binary_vectorizer),
        ("TF", count_vectorizer),
        ("TF-IDF", tfidf_vectorizer),
    ],
)

# Action buttons (build model / show results / search).
btn1 = ws.Button(description="Построить модель")
btn2 = ws.Button(description="Показать результаты")
btn3 = ws.Button(description="Поиск")

# Free-text query box and the output area the click handlers render into.
text = ws.Textarea()
out = ws.Output()

In [4]:
# Shared notebook state: assigned by the "build model" click handler and read
# by the other handlers. Everything stays None until a model has been built.
model = vectorizer = db = None

In [5]:
def on_btn_clicked(btn):
    """Build the search model from the file selected in ``file_loader``.

    The uploaded bytes are written to a temporary ``.tmp<ext>`` file, passed
    through ``read`` -> ``preprocess`` -> ``vectorize`` together with the
    vectorizer chosen in ``drop``, and the result is stored in the
    module-level ``model``, ``vectorizer`` and ``db``. ``db`` is also saved to
    ``data/.tmp.1db.csv``. All messages are rendered inside ``out``.

    Args:
        btn: The clicked button (unused; required by the ``on_click``
            callback signature).
    """
    global model
    global vectorizer
    global db
    with out:
        out.clear_output()
        # This code relies on ``FileUpload.value`` being a mapping of
        # file name -> file info dict (with a 'content' entry).
        uploads = file_loader.value
        if not uploads:
            # Nothing uploaded yet: report it instead of failing with IndexError.
            display("Файл не выбран!")
            return
        name, info = next(iter(uploads.items()))
        ext = os.path.splitext(name)[1]
        # The extension is kept on the temp file name that is passed to `read`.
        fn = ".tmp" + ext
        try:
            with open(fn, "wb") as f:
                f.write(info['content'])
            model, vectorizer, db = vectorize(drop.value, preprocess(read(fn)))
            save(db, "data/.tmp.1db.csv")
        finally:
            # Always clean up the temp file, even if indexing fails midway.
            if os.path.exists(fn):
                os.remove(fn)
        display("Индексирование завершено!!!")

def show_btn_clicked(btn):
    """Display a small term-by-document preview of the built model.

    Takes the first 10 entries of ``model`` (each is used as a mapping of
    term -> weight), looks at up to 30 terms of the first entry, orders them
    by their weight in that first entry (descending) and shows the top 10 as
    a DataFrame with one row per term and one column per document. Terms
    missing from a document are shown as 0.

    Args:
        btn: The clicked button (unused; required by the ``on_click``
            callback signature).
    """
    with out:
        out.clear_output()
        if model is None:
            # The model only exists after the "build model" handler has run.
            display("Сначала постройте модель!")
            return
        docs = list(model)[:10]
        if not docs:
            display("Модель пуста!")
            return
        rows = [
            (term, [doc.get(term, 0) for doc in docs])
            for term in list(docs[0].keys())[:30]
        ]
        # row[1][0] is the term's weight in the first document.
        rows.sort(key=lambda row: row[1][0], reverse=True)

        display(pd.DataFrame(dict(rows[:10])).transpose())

def metrics_btn_clicked(btn):
    """Score the query in ``text`` against the model with three similarity metrics.

    For each of ``cossine``, ``jaccard`` and ``dice`` the preprocessed query is
    passed to ``calc_metrics`` together with ``model``, ``vectorizer`` and
    ``get_words(db)``; the results are shown as one DataFrame column per
    metric inside ``out``.

    Args:
        btn: The clicked button (unused; required by the ``on_click``
            callback signature).
    """
    with out:
        out.clear_output()
        if model is None:
            # The model only exists after the "build model" handler has run.
            display("Сначала постройте модель!")
            return
        # Column title per metric. Named `metric_titles` so it does not shadow
        # the module-level `words` imported from `db`.
        metric_titles = {
            cossine: "Косинусная",
            jaccard: "Жаккара",
            dice: "Дайса",
        }
        # The query and vocabulary are deliberately re-evaluated for every
        # metric: the helpers may return one-shot iterators (not visible here).
        res = pd.DataFrame({
            title: calc_metrics(
                metric,
                list(preprocess([text.value]))[0],
                model,
                vectorizer,
                get_words(db),
            )
            for metric, title in metric_titles.items()
        })
        display(res)

# Attach each search-UI button to its click handler.
for button, handler in (
    (btn1, on_btn_clicked),
    (btn2, show_btn_clicked),
    (btn3, metrics_btn_clicked),
):
    button.on_click(handler)

In [6]:
# Render the search UI widgets, in this order, as the cell output.
display(
    file_loader,
    drop,
    btn1,
    btn2,
    text,
    btn3,
    out,
)

FileUpload(value={}, description='Upload')

Dropdown(description='Векторизатор:', options=(('Бин.', <function binary_vectorizer at 0x7f1cf4eed160>), ('TF'…

Button(description='Построить модель', style=ButtonStyle())

Button(description='Показать результаты', style=ButtonStyle())

Textarea(value='')

Button(description='Поиск', style=ButtonStyle())

Output()

In [7]:
# Widgets for the Rocchio classification demo: the text to classify, the
# trigger button and a dedicated output area.
text2, btn4, out2 = (
    ws.Textarea(),
    ws.Button(description="Подобрать"),
    ws.Output(),
)

In [8]:
import preprocess as p
import json

# Load the stop-word list. JSON text is UTF-8, so read it explicitly as such
# instead of relying on the platform default encoding.
with open("data/stopwords.json", encoding="utf-8") as f:
    stop_words = json.load(f)

# Enable stop-word removal in the preprocessing pipeline.
# NOTE(review): this mutates state on the `preprocess` module, so it presumably
# also affects the `preprocess` function imported in the first cell — confirm.
p.stop_words = stop_words
# Guarded so that re-running this cell does not stack duplicate filters.
if p.rm_stop_words not in p.preprocessors:
    p.preprocessors.append(p.rm_stop_words)

# Training corpus: sport documents first, then tech documents.
sport_texts = list(read("data/sport_all.txt"))
tech_texts = list(read("data/tech_all.txt"))
class_names = ("Спорт", "Технологии")
# Index ranges into the concatenated corpus, one range per class, in the same
# order as `class_names`.
classes = (
    range(len(sport_texts)),
    range(len(sport_texts), len(sport_texts) + len(tech_texts)),
)

# Vectorize the combined corpus with TF-IDF and persist the artefacts.
rocc_model, rocc_vec, rocc_db = vectorize(tfidf_vectorizer, p.preprocess(sport_texts + tech_texts))
idfs = idf_vectorizer(rocc_db)
save(rocc_model, "data/.tmp.4model.csv")
save(rocc_db, "data/.tmp.4db.csv")
save_idfs(idfs, "data/.tmp.4idfs.json")

In [9]:
def rocchio_test(btn):
    """Classify the text in ``text2`` with ``rocchio`` and show the result.

    The preprocessed text is passed to ``rocchio`` together with the corpus
    model, its vocabulary, the vectorizer, the per-class index ranges and the
    ``euclide`` metric. The returned values are paired with ``class_names``
    and displayed in ``out2`` as a table sorted ascending by the
    "Расстояние" column.

    Args:
        btn: The clicked button (unused; required by the ``on_click``
            callback signature).
    """
    with out2:
        out2.clear_output()
        query = list(p.preprocess([text2.value]))[0]
        vocabulary = words(rocc_db)
        # One value per class, presumably in the same order as `class_names`.
        distances = tuple(
            rocchio(query, rocc_model, vocabulary, rocc_vec, classes, euclide)
        )
        table = pd.DataFrame(
            zip(class_names, distances),
            columns=("Класс", "Расстояние"),
        )
        display(table.sort_values(by="Расстояние"))

# Run the Rocchio classification handler whenever btn4 is clicked.
btn4.on_click(rocchio_test)

In [10]:
# Show the classification UI: input text, trigger button, result area.
display(text2, btn4, out2)

Textarea(value='')

Button(description='Подобрать', style=ButtonStyle())

Output()