# Import libs

In [1]:
!pip install git+https://github.com/Desklop/Uk_Stemmer

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/Desklop/Uk_Stemmer
  Cloning https://github.com/Desklop/Uk_Stemmer to c:\users\eyggen\appdata\local\temp\pip-req-build-ujodk5m4
  Resolved https://github.com/Desklop/Uk_Stemmer to commit a700ae1bd9b69ad84d311d089e8bc95ab7fab44d


  Running command git clone -q https://github.com/Desklop/Uk_Stemmer 'C:\Users\Eyggen\AppData\Local\Temp\pip-req-build-ujodk5m4'


In [118]:
#import general libs
import pandas as pd
import csv
import nltk
import string
import re
from uk_stemmer import UkStemmer
from nltk.stem import SnowballStemmer
import numpy as np

#import libs for train our models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

#import models libs
from sklearn.naive_bayes import GaussianNB, MultinomialNB 
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Create dataset

In [74]:
train_df = pd.read_excel("train.xlsx")
test_df = pd.read_excel("test.xlsx")

In [75]:
x_train = train_df["title"]
y_train = train_df["index"]

x_test = test_df["title"]
y_test = test_df["index"]

### Create a vectorizer

In [76]:
snowball = SnowballStemmer(language='russian')
stemmer = UkStemmer()
test_x = {"index":0,"title":""}
test_x_ru = []
test_x_ua = []

def word_preprop(sentense):
    prepare_test_string = sentense.lower()
    words = re.split(r'(\W)', prepare_test_string)
    words = [word for word in words if word != ' ']
    return words

def tokenize_sentense(sentense):
    words = word_preprop(sentense)
    words = [snowball.stem(i) for i in words]
    words = [stemmer.stem_word(i) for i in words]
    return(words)
    

In [77]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenize_sentense(x))
features = vectorizer.fit_transform(train_df["title"])

In [78]:
def get_class_name(model, class_title):
    class_names = {
        0:"Комп'ютерна техніка",
        1:"Сматрфони і ТВ",
        2:"Інструменти та автотовари",
        3:"Сантехніка та ремонт",
        4:"Побутова техніка",
        5:"Товари для геймерів",
        6:"Товари для дому",
        7:"Спорт та захоплення",
        8:"Офіс, школа, книги",
        9:"Одяг, взуття та прикраси",
        10:"Краса та здоров'я",
        11:"Зоотовари",
        12:"Дитячі товари",
        13:"Дача, сад та товари для дому",
        14:"Алкогольні напої, сигарети",
        15:"Продукти харчування",
    }
    return class_names[model.predict([class_title])[0]]

## LogisticRegression

In [79]:
LogisticRegression_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentense(x))),
    ("model", LogisticRegression(random_state=0))
])

In [80]:
LogisticRegression_model_pipeline.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002149FD1BA60>)),
                ('model', LogisticRegression(random_state=0))])

In [81]:
get_class_name(LogisticRegression_model_pipeline, "Одяг")

'Одяг, взуття та прикраси'

In [82]:
LogisticRegression_prediction = LogisticRegression_model_pipeline.predict(test_df["title"])
accuracy_score(LogisticRegression_prediction, test_df["index"])

0.47619047619047616

## SVC 

In [142]:
SVC_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentense(x))),
    ("model", SVC(gamma=1, random_state=0, probability=True))
])

In [143]:
SVC_model_pipeline.fit(train_df["title"], train_df["index"])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x00000214A8615550>)),
                ('model', SVC(gamma=1, probability=True, random_state=0))])

In [144]:
SVC_prediction = SVC_model_pipeline.predict(test_df["title"])
accuracy_score(SVC_prediction, test_df["index"])

0.5952380952380952

## KNN

In [113]:
KNN_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentense(x))),
    ("model", KNeighborsClassifier(n_neighbors=1))
])


In [114]:
KNN_model_pipeline.fit(train_df["title"], train_df["index"])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x00000214A15C5670>)),
                ('model', KNeighborsClassifier(n_neighbors=1))])

In [115]:
KNN_prediction = KNN_model_pipeline.predict(test_df["title"])
accuracy_score(KNN_prediction, test_df["index"])

0.6190476190476191

## Naive bayes

In [89]:
GaussianNB_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentense(x))),
    ("model", MultinomialNB())
])

In [90]:
GaussianNB_model_pipeline.fit(train_df["title"], train_df["index"])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002149FD1B9D0>)),
                ('model', MultinomialNB())])

In [91]:
GaussianNB_prediction = GaussianNB_model_pipeline.predict(test_df["title"])
accuracy_score(GaussianNB_prediction, test_df["index"])

0.38095238095238093

## RandomForestClassifier

In [92]:
RandomForestClassifier_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentense(x))),
    ("model", RandomForestClassifier(random_state=0))
])

In [93]:
RandomForestClassifier_model_pipeline.fit(train_df["title"], train_df["index"])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002149FCE20D0>)),
                ('model', RandomForestClassifier(random_state=0))])

In [94]:
RandomForestClassifier_prediction = RandomForestClassifier_model_pipeline.predict(test_df["title"])
accuracy_score(RandomForestClassifier_prediction, test_df["index"])

0.5714285714285714

## DecisionTreeClassifier

In [95]:
DecisionTreeClassifier_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentense(x))),
    ("model", DecisionTreeClassifier(random_state=0))
])

In [96]:
DecisionTreeClassifier_model_pipeline.fit(train_df["title"], train_df["index"])

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002149FD1BB80>)),
                ('model', DecisionTreeClassifier(random_state=0))])

In [97]:
DecisionTreeClassifier_prediction = DecisionTreeClassifier_model_pipeline.predict(test_df["title"])
accuracy_score(DecisionTreeClassifier_prediction, test_df["index"])

0.5714285714285714

In [131]:
proc = KNN_model_pipeline.predict_proba(["Принтер"])[0]
kef = np.argmax(proc)
value = proc[kef]
print(len(proc))
print(kef)
print(value)

17
0
1.0


In [145]:
while True:
    inpt = input("Введіть річ: ")
    if inpt == "+":
        break
    model = SVC_model_pipeline
    proc = model.predict_proba([inpt])[0]
    kef = np.argmax(proc)
    value = proc[kef]
    print(proc)
    print(kef)
    print(value)
    print("\n" + "#"*20)
    if value >= 0.65:
        print(get_class_name(model, inpt))
    else:
        print("Не знаю")

Введіть річ: Килим
[0.06072457 0.07914208 0.0948688  0.07068444 0.04240576 0.02420091
 0.07822795 0.10794531 0.11340753 0.101434   0.03492644 0.03782962
 0.04358511 0.03190642 0.02887787 0.04780174 0.00203146]
8
0.11340752818991928

####################
Не знаю
Введіть річ: Молоко
[0.0067015  0.00762016 0.01017935 0.00697554 0.00528608 0.00738725
 0.00838022 0.00931855 0.00969829 0.00880912 0.00415799 0.005804
 0.00489162 0.00551709 0.00709439 0.89126893 0.00090992]
15
0.8912689327353319

####################
Продукти харчування
Введіть річ: Принтер
[0.03420982 0.00764286 0.01590381 0.00455665 0.00367774 0.00686829
 0.00622223 0.00603125 0.87995868 0.00546669 0.0027134  0.00310615
 0.01285293 0.00353601 0.00300583 0.00325871 0.00098895]
8
0.8799586819287225

####################
Офіс, школа, книги
Введіть річ: Мишка
[0.46759395 0.06858278 0.06869584 0.02932915 0.02174321 0.05622442
 0.03604884 0.04348845 0.04535008 0.03904774 0.01613027 0.020911
 0.02436575 0.02167408 0.01599594 0.0215