In [186]:
import pandas as pd
import sqlite3
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [187]:
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

In [188]:
import neattext as nt
import neattext.functions as nfx

In [189]:
# Load the recipe texts and the category ids from the local SQLite database.
db_name = 'recipe.db'
table_name = 'recipe'

# One row per recipe: the instruction text ("manual") plus a ", "-joined
# string of the category ids linked to it through recipe_categories.
sql = (
    "SELECT r.manual, (SELECT group_concat(rc.cat_id , ', ') "
    "from recipe_categories rc "
    "WHERE rc.recipe_id = r.id) AS categories from recipe r"
)

categories_sql = "SELECT id from category c "

conn = sqlite3.connect(db_name)
try:
    # Both results are pandas DataFrames.
    loaded_data = pd.read_sql(sql, conn)
    loaded_categories = pd.read_sql(categories_sql, conn)
finally:
    # Release the connection even if one of the queries raises.
    conn.close()

In [190]:
#print(loaded_categories.id)

# How many categories (taken in the order the query returned them) are used
# as classification targets.
N_TARGET_CATEGORIES = 3

# Category ids as strings, so they can be compared with the pieces of the
# ", "-separated "categories" column and used as column names.
# head() takes rows by position and simply returns fewer items when the table
# holds fewer than N_TARGET_CATEGORIES rows instead of raising.
ids_cat = loaded_categories["id"].head(N_TARGET_CATEGORIES).astype(str).tolist()

print(ids_cat)

['1', '2', '3']


In [191]:
# Build one 0/1 indicator column per target category id.
# Every "categories" cell is split once into its individual ids; comparing
# against the split list means "1" matches only id 1, never 11 or 21.
category_lists = [str(cats).split(', ') for cats in loaded_data['categories']]

for cat_id in ids_cat:
    # Plain assignment appends the column on the first run and overwrites it
    # on a re-run, so this cell can be executed repeatedly without raising.
    loaded_data[cat_id] = [int(str(cat_id) in cats) for cats in category_lists]

In [192]:
# Display the frame, which now also carries one indicator column per id in ids_cat.
loaded_data

Unnamed: 0,manual,categories,1,2,3
0,Шоколад разломать на кусочки и вместе со сливо...,"1, 2, 3",1,1,1
1,Положите весь творог в кастрюльку и разомните ...,"4, 5, 6",0,0,0
2,Вскипятите воду в большой кастрюле и сварите п...,"7, 8, 9, 10",0,0,0
3,Разогреть духовку. Отделить белки от желтков. ...,"1, 5, 11",1,0,0
4,Взбить яйца с сахаром.\n\nПостепенно ввести му...,"1, 5, 12, 13, 14",1,0,0
...,...,...,...,...,...
6990,"Печенку посолить, поперчить, посыпать специями...","30, 5, 151",0,0,0
6991,Крабовые палочки мелко нарезать или порубить в...,"17, 140, 106",0,0,0
6992,"Чечевицу промыть, залить водой на 4 см выше ур...","37, 76",0,0,0
6993,Нарезать филе средними полосками. Морковь наре...,"17, 67, 139",0,0,0


In [193]:
from nltk.corpus import stopwords

# One-time setup step, left disabled. NOTE(review): enabling it also needs
# `import nltk`, which this notebook does not do in the visible cells.
#nltk.download('stopwords')
# Russian stop words kept in a set; remove_stopwords() tests membership against it.
stopwrds = set(stopwords.words('russian'))

In [194]:
import pymorphy2
import re

# Module-level morphological analyzer; text_normalization() calls morph.parse().
morph = pymorphy2.MorphAnalyzer()

In [195]:
def remove_stopwords(stroka, stops):
    """Strip punctuation, lowercase the text and drop stop words.

    Args:
        stroka: Raw text. Any value that is not a ``str`` (for example ``None``
            or ``NaN`` produced by a NULL database value) is treated as empty
            text instead of raising inside ``re.sub``.
        stops: Container of lowercase stop words to remove.

    Returns:
        The remaining words joined by single spaces, without leading or
        trailing whitespace; ``""`` when nothing is left.
    """
    if not isinstance(stroka, str):
        return ""
    # Everything that is neither a word character nor whitespace is deleted
    # (not replaced by a space).
    cleaned = re.sub(r'[^\w\s]', '', stroka)
    # Lowercase before comparing so capitalised stop words are removed too.
    return " ".join(w for w in cleaned.lower().split() if w not in stops)

In [196]:
def text_normalization(text, analyzer=None):
    """Lemmatise words and tag each lemma with its part of speech.

    Every word is parsed with the analyzer and its first parse is turned into
    ``"<normal_form>_<POS>"``. When the parse has no part of speech
    (``tag.POS`` is ``None``) the suffix ``"_UNKN"`` is used. Each resulting
    term is kept only once, in order of first appearance.

    Args:
        text: Iterable of words. A plain string is split on whitespace first,
            so it is processed word by word rather than character by character.
        analyzer: Object exposing ``parse(word)`` whose first result has
            ``normal_form`` and ``tag.POS``. Defaults to the module-level
            ``morph`` analyzer.

    Returns:
        List of unique ``"lemma_POS"`` strings.
    """
    if analyzer is None:
        analyzer = morph
    if isinstance(text, str):
        text = text.split()

    filt_text = []
    seen = set()  # constant-time duplicate check; the list keeps the order
    for w in text:
        parsed = analyzer.parse(w)[0]
        part_of_speech = parsed.tag.POS
        # Check for a missing POS explicitly so that only the tagged string,
        # never the parse object itself, ends up in the result.
        suffix = part_of_speech if part_of_speech is not None else "UNKN"
        term = parsed.normal_form + "_" + suffix
        if term not in seen:
            seen.add(term)
            filt_text.append(term)

    return filt_text

In [197]:
# Clean every recipe text with remove_stopwords, using the Russian stop-word set.
corpus = loaded_data['manual'].apply(remove_stopwords, stops=stopwrds)

In [198]:
# Preview the cleaned texts produced by remove_stopwords.
print(corpus)

0       шоколад разломать кусочки вместе сливочным мас...
1       положите весь творог кастрюльку разомните вилк...
2       вскипятите воду большой кастрюле сварите пасту...
3       разогреть духовку отделить белки желтков белки...
4       взбить яйца сахаром постепенно ввести муку сол...
                              ...                        
6990    печенку посолить поперчить посыпать специями д...
6991    крабовые палочки мелко нарезать порубить бленд...
6992    чечевицу промыть залить водой 4 см выше уровня...
6993    нарезать филе средними полосками морковь нарез...
6994    очищенный вымытый картофель нарезать кубиками ...
Name: manual, Length: 6995, dtype: object


In [199]:
#corpus = loaded_data['manual'].apply(lambda x:nt.TextExtractor(x).extract_stopwords(stopwrds))

In [200]:
# Fit a TF-IDF vectorizer on the cleaned corpus. The fitted `tfidf` object is
# reused later to vectorize new text.
tfidf = TfidfVectorizer()

# .toarray() turns the vectorizer output into a dense NumPy array (one row per
# document). NOTE(review): for a large vocabulary this can use a lot of memory.
Xfeatures = tfidf.fit_transform(corpus).toarray()


In [219]:
# Inspect the dense TF-IDF feature matrix.
Xfeatures

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [202]:
# Target matrix: the 0/1 indicator columns named by ids_cat.
y = loaded_data[ids_cat]

In [203]:
# Hold out 30% of the samples for testing; random_state is fixed so the split
# is the same on every run.
x_train,x_test,y_train,y_test = train_test_split(Xfeatures,y,test_size=0.3,random_state=42)

In [204]:
from sklearn.preprocessing import StandardScaler

# not suitable (left disabled)
#scaler = StandardScaler().fit(x_train)
#x_train = scaler.transform(x_train)

In [220]:
# Show the training features as they are at this point in the notebook.
print(x_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [206]:
from sklearn.preprocessing import Normalizer

# Rescale every sample (row) to unit norm. The scaler is fitted on the training
# split and applied to BOTH splits, so the model is evaluated on features that
# went through the same preprocessing it was trained on.
# Normalising an already unit-norm row leaves it unchanged, so re-running this
# cell is harmless.
# NOTE(review): TfidfVectorizer's default output is already L2-normalised, so
# this step is expected to be a no-op with the current settings; confirm.
scaler = Normalizer().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [221]:
# Disabled experiment: the Binarizer code below sits inside a string literal,
# so it is not executed and binary_x is never created.
'''from sklearn.preprocessing import Binarizer

binalizer = Binarizer(threshold=0.0).fit(x_train)
binary_x = binalizer.transform(x_train)'''

In [222]:
# Show the training features after scaler.transform has been applied.
print(x_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [208]:
# skmultilearn BinaryRelevance problem transformation with MultinomialNB as
# the base classifier.
binary_rel_clf = BinaryRelevance(MultinomialNB()) #our model

In [209]:
# Train on the training split (features x_train, indicator targets y_train).
binary_rel_clf.fit(x_train,y_train)

In [210]:
# Predict the label indicators for the held-out split.
br_prediction = binary_rel_clf.predict(x_test)

In [211]:
# Convert the prediction to a dense array for display (one column per label).
br_prediction.toarray()

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]], dtype=int64)

In [216]:
# For multilabel targets scikit-learn's accuracy_score is subset accuracy: a
# sample counts as correct only if every one of its labels matches.
# NOTE(review): the rows of br_prediction visible in the previous output are
# all zeros, so a high score may mostly reflect samples with no positive
# label. hamming_loss is imported at the top of the notebook but never used;
# consider reporting it (or per-label metrics) alongside this number.
accuracy_score(y_test,br_prediction)

0.845164363982849

In [232]:
ex1 = 'Выпечка и брауни'

# Send the example through the same steps as the training texts: clean it with
# remove_stopwords, vectorize it with the fitted TF-IDF model, then apply the
# fitted scaler. Skipping any step would feed the model features prepared
# differently from the ones it was trained on.
ex1_clean = remove_stopwords(ex1, stopwrds)
vec_example = scaler.transform(tfidf.transform([ex1_clean]))

binary_rel_clf.predict(vec_example).toarray()

array([[1, 0, 0]], dtype=int64)