In [95]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from tqdm.auto import tqdm

pio.templates.default = "plotly_dark"


from product import Product

### Чтение датасета и предобработка

In [96]:
from json import load

with open('agora_hack_products.json', encoding='utf-8') as file:
    all_products = load(file)

In [97]:
all_products = [Product(**p) for p in all_products]

In [98]:
references = [p for p in all_products if p.is_reference]
references_id_set = set([ref.product_id for ref in references])
products = [p for p in all_products if p.product_id not in references_id_set]

### Посмотрим на данные

In [99]:
def get_referrers(ref: Product):
    assert ref.is_reference
    return [p for p in products if p.reference_id == ref.product_id]  # TODO: use filter


def describe_reference(ref: Product):
    print(ref)
    print()
    print(*get_referrers(ref), sep='\n')

In [100]:
describe_reference(references[11])

Reference Микроволновая печь LG MS2535GIS, черный (070dd48327cad4d5): 
 ['Объем\t25 л', 'Инверторное управление мощностью\tда', 'Внутреннее покрытие камеры\tэмаль', 'Переключатели\tсенсорные', 'Мощность  микроволн\t1000 Вт', 'Диаметр  поддона\t292 мм', 'Режимы   работы\tразморозка', 'ШхВхГ\t47.60х27.20х36.80   см', 'Особенности   подсветка камеры, дисплей, блокировка от детей', 'Доп.  режимы автоматический разогрев, программирование процесса приготовления, звуковой сигнал отключения, автоматическое поддержание температуры, автоматическое приготовление, автоматическая разморозка']

Referrer of 070dd48327cad4d5 Микроволновая печь LG MS2535GIS черный (инвертор) (41b312152ddb3af7): 
 ['Переключатели\tсенсорные', 'Внутреннее   покрытие камеры\tэмаль', 'Диаметр  поддона\t292 мм', 'Режимы  работы\tразморозка', 'ШхВхГ\t47.60х27.20х36.80  см', 'Особенности  подсветка камеры, дисплей, блокировка от детей', 'Доп.  режимы звуковой сигнал отключения, автоматическая разморозка, автоматическое поддер

### Разделим на тестовый и тренировочный сеты

In [101]:
from sklearn.model_selection import train_test_split

# удалим часть эталонов, что бы в датасете были 'ничейные' товары
references, nulled_references = train_test_split(
    references, test_size=0.3, random_state=42
)

nulled_references_set = set([r.product_id for r in nulled_references])
for p in products:
    if p.reference_id in nulled_references_set:
        p.reference_id = "null"
references.append(Product(product_id="null",
                          name="null",
                          props=[],
                          is_reference=True,
                          reference_id=""))

products_train, products_test = train_test_split(
    products, test_size=0.4, random_state=42
)
all_products = products_train + references

### Для представления текста будем использовать подсчет слов (токенов)

In [102]:
# альтернативный токенайзер из nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


#nltk.download('punkt')
#nltk.download('stopwords')

def get_product_payload(product: Product) -> str:
    return product.name + ' ' + ' '.join(product.props)


def tokenize(payload: str, language='russian') -> [str]:
    words = word_tokenize(payload, language=language)
    words = list(map(lambda w: w.lower(), words))
    words = list(filter(lambda w: w not in stopwords.words(language), words))

    return words

In [103]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from spacy.lang.ru import STOP_WORDS as RU_STOP_WORDS
from spacy.lang.en import STOP_WORDS as EN_STOP_WORDS


def products2corpus(prods):
    return [p.name + ' ' + ' '.join(p.props) for p in prods]


train_corpus = products2corpus(all_products)
test_corpus = products2corpus(products_test)

tfidf = TfidfVectorizer(
    stop_words=RU_STOP_WORDS or EN_STOP_WORDS,
    #tokenizer=tokenize
)

X_train = tfidf.fit_transform(train_corpus).toarray()

X_test = tfidf.transform(test_corpus).toarray()

### Построим таргет сет, номером класса будет индекс эталона в изначальном сете

In [104]:
ref_id2ind = {
    ref.product_id: i for i, ref in enumerate(references)
}


def build_target(prods):
    return np.array([
        ref_id2ind[p.product_id if p.is_reference else p.reference_id] for p in prods
    ])

y_ref_train = build_target(all_products)

y_ref_test = build_target(products_test)


def build_target_unknowns(prods):
    return np.array(list(
        map(lambda p: 1 if p.reference_id == 'null' else 0, prods)
    ))

y_unk_train = build_target_unknowns(all_products)

y_unk_test = build_target_unknowns(products_test)

### Попробуем разные модели

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.base import ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

use_models = [

    RandomForestClassifier(random_state=0),
    LinearSVC(max_iter=4000),
    MultinomialNB(),
    LogisticRegression(random_state=0, max_iter=400),
    RidgeClassifier(normalize=True),
    SGDClassifier(warm_start=True),
    DecisionTreeClassifier(),
    #GradientBoostingClassifier() # too long
]

def brut_ml(X_train, y_train, X_test, y_test):
    models = list(zip(map(lambda m: m.__class__.__name__, use_models), use_models)) # now its [(name, model), ...]

    vote = VotingClassifier(estimators=models, voting='hard', n_jobs=10)
    vote.fit(X_train, y_train)
    models = [(info[0], vote.estimators_[i]) for i, info in enumerate(models)]
    vote.weights = list(map(lambda info: info[1].score(X_train, y_train) ** 3, models))

    models.append(('vote', vote))


    def test_model(model: ClassifierMixin):
        return model.score(X_train, y_train), model.score(X_test, y_test)


    model_tests_df = []
    for name, model in models:
        train_acc, test_acc = test_model(model)
        print(name, f"train: {train_acc}, test: {test_acc}")
        model_tests_df.append([name, train_acc, test_acc])

    model_tests_df = pd.DataFrame(model_tests_df, columns=['name', 'train_acc', 'test_acc'])

    bar = px.bar(model_tests_df, x='name', y=['train_acc', 'test_acc'], barmode='group', log_y=True)
    bar.show()

In [106]:
brut_ml(X_train, y_unk_train, X_test, y_unk_test)

RandomForestClassifier train: 1.0, test: 0.8408273381294964
LinearSVC train: 0.9924924924924925, test: 0.8938848920863309
MultinomialNB train: 0.8263263263263263, test: 0.7553956834532374
LogisticRegression train: 0.8358358358358359, test: 0.7526978417266187
RidgeClassifier train: 0.996996996996997, test: 0.9334532374100719
SGDClassifier train: 0.994994994994995, test: 0.9199640287769785
DecisionTreeClassifier train: 1.0, test: 0.8651079136690647
vote train: 0.9944944944944945, test: 0.8884892086330936


In [107]:
brut_ml(X_train, y_ref_train, X_test, y_ref_test)

RandomForestClassifier train: 0.998998998998999, test: 0.6960431654676259
LinearSVC train: 0.998998998998999, test: 0.9073741007194245
MultinomialNB train: 0.26126126126126126, test: 0.3147482014388489
LogisticRegression train: 0.3813813813813814, test: 0.38039568345323743
RidgeClassifier train: 0.9954954954954955, test: 0.9307553956834532
SGDClassifier train: 0.9944944944944945, test: 0.8696043165467626
DecisionTreeClassifier train: 0.998998998998999, test: 0.689748201438849
vote train: 0.998998998998999, test: 0.89568345323741


In [108]:

from sklearn.model_selection import GridSearchCV

'''
'''

grid = GridSearchCV(
                    scoring='accuracy',
                    n_jobs=12)

grid.fit(
    np.concatenate([X_train, X_test]),
    np.concatenate([y_ref_train, y_ref_test])
)

print(grid.best_params_, grid.best_score_)

'''
    RidgeClassifier(),
                    {
                        'alpha': np.arange(1.0, 2.5, 0.3)#[0.1, 1.0, 2.0, 3.0, 4.0, 10.0],
                    },

    SGDClassifier(),
                    {
                        'loss': ['hinge', 'log_loss', 'log', 'modified_huber', 'squared_hinge',
                                 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive',
                                 'squared_epsilon_insensitive'],
                        'penalty': ['l2', 'l1', 'elasticnet'],
                        'alpha': [1e-3, 1e-4]
                    },

    LinearSVC(max_iter=4000),
                    {
                        'penalty': ['l1', 'l2'],
                        'loss': ['hinge', 'squared_hinge'],
                        'C': np.arange(0, 1, 0.1),

                    },
'''

TypeError: __init__() missing 2 required positional arguments: 'estimator' and 'param_grid'

## Best models so far

In [None]:
LinearSVC(C=0.9)
RidgeClassifier(alpha=1.9)