In [99]:
import numpy as np
import plotly.express as px
import pandas as pd
import plotly.io as pio
from tqdm.auto import tqdm

pio.templates.default = "plotly_dark"

In [100]:
from src.product import Product
from json import load

with open('agora_hack_products.json', encoding='utf-8') as file:
    all_products = load(file)

all_products = [Product(**p) for p in all_products]

In [101]:
references = [p for p in all_products if p.is_reference]
references_id_set = set([ref.product_id for ref in references])
products = [p for p in all_products if p.product_id not in references_id_set]

In [102]:
from sklearn.model_selection import train_test_split

# удалим часть эталонов, что бы в датасете были 'ничейные' товары
references, nulled_references = train_test_split(
    references, test_size=0.1, random_state=42
)

nulled_references_set = set([r.product_id for r in nulled_references])
for p in products:
    if p.reference_id in nulled_references_set:
        p.reference_id = None
for p in nulled_references:
    p.is_reference = False

products.extend(nulled_references)

products_train, products_test = train_test_split(
    products, test_size=0.5, random_state=42
)
products_train.extend(references)

In [117]:
from json import dump

def product2dict(p):
    return p.__dict__

def product2req(p):
    return {
        'id': p.product_id,
        'name': p.name,
        'props': p.props,
    }

def product2ans(p):
    return {
        'id': p.product_id,
        'reference_id': p.reference_id,
    }

with open('server_test_train.json', 'w', encoding='utf-8') as f:
    dump(list(map(product2dict, products_train)), f, ensure_ascii=False)
with open('server_test_target.json', 'w', encoding='utf-8') as f:
    dump(list(map(product2dict, products_test)), f, ensure_ascii=False)


#with open('server_test_test.json', 'w') as f:
#    dump(list(map(product2req, products_test)), f)
#with open('server_test_ans.json', 'w') as f:
#    dump(list(map(product2ans, products_test)), f)

In [103]:
def accuracy(predicted, target):
    return list(map(lambda v: v[0] == v[1], zip(predicted, target))).count(True) / len(predicted)

In [104]:
import random
random.seed(0)
import copy

def augment_train(prods: [Product], count=2):
    res = []
    for p in prods:
        for _ in range(count):
            p_ = copy.deepcopy(p)
            p_.props = random.sample(p_.props, random.randint(len(p_.props) // 2, len(p_.props)))
            name_parts = p_.name.split()
            p_.name = ' '.join(random.sample(name_parts, random.randint(len(name_parts) // 2, len(name_parts))))

            res.append(p_)

    prods.extend(res)
    random.shuffle(prods)

In [105]:
augment_train(all_products)
len(all_products)

9753

In [106]:
hand_test = []
for i in range(3):
    with open(f'test_data/data_{i}.json', encoding='utf-8') as f:
        hand_test.extend(load(f))

hand_test = [Product(**p) for p in hand_test]

In [107]:
from src.model import ProductMatchingModel
from sklearn.svm import OneClassSVM, LinearSVC, SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from spacy.lang.en import STOP_WORDS as EN_STOP_WORDS
from spacy.lang.ru import STOP_WORDS as RU_STOP_WORDS

model = ProductMatchingModel(RidgeClassifier(alpha=0.4),
                             RidgeClassifier(alpha=0.1),
                             RidgeClassifier(alpha=0.1),
                             CountVectorizer(stop_words=RU_STOP_WORDS or EN_STOP_WORDS))

model.fit(all_products)

def brut_ml(threshold):

    return accuracy(model.predict(all_products, threshold), map(lambda p: p.reference_id, all_products)),\
    accuracy(model.predict(products_test, threshold), map(lambda p: p.reference_id, products_test)),\
    accuracy(model.predict(hand_test, threshold), map(lambda p: p.reference_id, hand_test))

In [108]:
df = []
for th in tqdm(np.arange(5, 60, 3)):
    df.append([th, *brut_ml(th)])

df = pd.DataFrame(df, columns=['threshold', 'train', 'test', 'val'])

  0%|          | 0/19 [00:00<?, ?it/s]

In [109]:
px.line(df, x='threshold', y=['train', 'test', 'val'], )

In [110]:
def classify_error(pred, targ):
    if pred == targ:
        return
    if pred is None and targ is not None:
        return 'freerider'
    if pred is not None and targ is None:
        return 'bastard'
    return 'chameleon'


In [111]:
from collections import Counter


results = list(zip(model.predict(hand_test), map(lambda p: p.reference_id, hand_test)))
cnt = Counter([classify_error(a, b) for a, b in results if a != b])
print('acc=', 1 - sum(cnt.values()) / len(results))
cnt

TypeError: predict() missing 1 required positional argument: 'threshold'

In [None]:
for r, product_given in zip(results, hand_test):
        predicted, target = r
    #if predicted != target:
        print('\n\n<=====================>\n')

        print(product_given)

        print('\n matched with \n')

        print(*filter(lambda p: p.product_id == predicted, references))

In [None]:
0/0

In [None]:
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.base import ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.base import clone

from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en import STOP_WORDS as EN_STOP_WORDS
from spacy.lang.ru import STOP_WORDS as RU_STOP_WORDS

import pandas as pd
import plotly.express as px

from model import ProductMatchingModel

use_models = [
    KNeighborsClassifier(3),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
]

models = list(zip(map(lambda m: m.__class__.__name__, use_models), use_models)) # now its [(name, model), ...]

model_tests_df = []
for name, model in models:
    master_model = ProductMatchingModel(RidgeClassifier(alpha=1.9), clone(model), clone(model),
                                        vectorizer=TfidfVectorizer(stop_words=RU_STOP_WORDS or EN_STOP_WORDS))
    master_model.fit(all_products)
    train_acc, test_acc, hand_acc = accuracy(master_model.predict(all_products), map(lambda p: p.reference_id, all_products)),\
                          accuracy(master_model.predict(products_test), map(lambda p: p.reference_id, products_test)),\
                          accuracy(master_model.predict(hand_test), map(lambda p: p.reference_id, hand_test)),

    print(name, f"train: {train_acc}, test: {test_acc}, hand: {hand_acc}")
    model_tests_df.append([name, train_acc, test_acc, hand_acc])

model_tests_df = pd.DataFrame(model_tests_df, columns=['name', 'train_acc', 'test_acc', 'hand_acc'])

bar = px.bar(model_tests_df, x='name', y=['train_acc', 'test_acc', 'hand_acc'], barmode='group', log_y=False)
bar.show()