In [None]:
import pandas as pd

# Load the labelled file: every non-empty line is "<label> <description>".
# Blank lines are skipped while reading. The previous approach split on '\n'
# and then dropped the last row unconditionally, which discarded a real sample
# whenever the file did not end with a newline.
with open('./product_train_0.txt', 'r', encoding='utf-8') as file:
    data = [line.rstrip('\n') for line in file if line.strip()]
test_df = pd.DataFrame([line.split(' ', 1) for line in data], columns=['Label', 'Description'])
X_test, y_test = test_df['Description'], test_df['Label']

# NOTE(review): the first training frame below is built from a copy of test_df,
# and later cells evaluate on test_df as well, so the reported metrics are
# measured on rows the model was trained on. Confirm this is intended.
labels = test_df['Label'].unique()

# One synthetic sample per label: the label's own name with the '__label__'
# prefix removed and underscores turned into spaces. Built in one shot instead
# of enlarging the frame row by row with .loc, which copies on every append.
label_rows = pd.DataFrame({
    'Label': labels,
    'Description': [label.replace('__label__', '').replace('_', ' ') for label in labels],
})

# drop_duplicates returns a new frame; its result has to be kept to take effect.
train_df_1 = (
    pd.concat([test_df, label_rows], ignore_index=True)
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
train_df_1.describe()

Unnamed: 0,Label,Description
count,16947,16947
unique,1971,16921
top,"__label__Bao_đựng,_ốp_lưng_điện_thoại",Giấy dán nhãn Tomy A4
freq,343,2


In [None]:
train_df_2 = pd.read_csv('./crawl_res_2.csv')
# Convert the human-readable label into the '__label__Some_Name' form.
train_df_2['Label'] = train_df_2['Label'].apply(lambda x: f'__label__{x.replace(" ", "_")}')
# drop_duplicates returns a new frame; the result must be assigned to take effect.
train_df_2 = train_df_2.drop_duplicates(keep='first')
# Keep at most 10 random rows per label: shuffle once with a fixed seed, then
# take the first 10 rows of every label. Unlike `x.sample(10)` this does not
# raise when a label has fewer than 10 rows (possible after de-duplication),
# it is reproducible, and it avoids groupby.apply on the grouping column.
train_df_2 = (
    train_df_2.sample(frac=1, random_state=42)
    .groupby('Label')
    .head(10)
    .sort_values('Label', kind='stable')
    .reset_index(drop=True)
)
train_df_2.describe()

Unnamed: 0,Label,Description
count,19710,19710
unique,1971,18661
top,__label__Access_point_(Wifi),Miếng Tấm Pát Pad Thép Vuông Sửa Bản Lề Tủ Cửa...
freq,10,5


In [None]:
train_df_3 = pd.read_csv('./crawl_res_3.csv')
# Convert the human-readable label into the '__label__Some_Name' form.
train_df_3['Label'] = train_df_3['Label'].apply(lambda x: f'__label__{x.replace(" ", "_")}')
# drop_duplicates returns a new frame; the result must be assigned to take effect.
train_df_3 = train_df_3.drop_duplicates(keep='first')
# Keep at most 5 random rows per label: shuffle once with a fixed seed, then
# take the first 5 rows of every label (labels with fewer rows keep them all).
# Seeding makes the sample reproducible across kernel restarts.
train_df_3 = (
    train_df_3.sample(frac=1, random_state=42)
    .groupby('Label')
    .head(5)
    .sort_values('Label', kind='stable')
    .reset_index(drop=True)
)
train_df_3.describe()

Unnamed: 0,Label,Description
count,9848,9848
unique,1971,7664
top,__label__Access_point_(Wifi),"Bộ Điều Khiển- H2T Group, Máy Đo Độ Ẩm Vải"
freq,5,30


In [None]:
# Stack the three training sources. ignore_index gives the combined frame a
# fresh 0..n-1 index; without it each source keeps its own row labels and the
# result contains repeated index values, which makes label-based lookups
# ambiguous.
train_df = pd.concat([train_df_1, train_df_2, train_df_3], ignore_index=True)
X_train, y_train = train_df['Description'], train_df['Label']
train_df.describe()

Unnamed: 0,Label,Description
count,46505,46505
unique,1971,43236
top,"__label__Bao_đựng,_ốp_lưng_điện_thoại","Bộ Điều Khiển- H2T Group, Máy Đo Độ Ẩm Vải"
freq,358,30


In [None]:
import numpy as np
import re

# Matches CJK unified ideographs (U+4E00-U+9FFF), Japanese kana (U+3040-U+30FF)
# and Korean Hangul syllables (U+AC00-U+D7AF). Compiled once at definition time.
_CJK_PATTERN = re.compile(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]')


def remove_cjk_characters(text):
    """Return `text` with every Chinese, Japanese and Korean character removed.

    Parameters
    ----------
    text : str or any
        The value to clean. Non-string values (for example NaN or None coming
        from an empty cell) are returned unchanged instead of raising.

    Returns
    -------
    str or any
        The input with all characters matched by the CJK pattern deleted.
        Other characters, including surrounding whitespace, are left as they are.
    """
    if not isinstance(text, str):
        return text
    # A single substitution replaces the previous per-character search loop.
    return _CJK_PATTERN.sub('', text)

def remove_unwanted_char(df):
    """Clean the 'Description' column of `df` and return a NaN-marked frame.

    The 'Description' column of the frame that was passed in is overwritten
    with the cleaned text, so existing callers that ignore the return value
    keep working exactly as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Description' column of text.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to the cleaned `df` in which every cell that is empty
        or whitespace-only has been replaced by NaN. Previously this result was
        bound to a local name and lost, and the function returned None.
    """
    # Remove cjk characters
    description = df['Description'].apply(remove_cjk_characters)
    # Remove non-word characters or words containing numbers which are often
    # product codes, product specifications, etc.
    description = description.str.replace(r'(\S*\d\S*|[^\w\s]|\_+)', ' ', regex=True)
    # Remove words containing only (ASCII) consonants
    description = description.str.replace(r'\b[bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ]+\b', ' ', regex=True)
    # Collapse runs of whitespace into one space and trim both ends
    description = description.str.replace(r'\s+', ' ', regex=True).str.strip()

    # Write the cleaned text back into the caller's frame.
    df['Description'] = description

    # Rows whose description lost every word are now empty strings; hand back a
    # frame where those are NaN so the caller can drop them if it wants to.
    # The caller's own frame keeps the empty strings.
    return df.replace(r'^\s*$', np.nan, regex=True)

# Clean the training descriptions (the 'Description' column is updated in place).
remove_unwanted_char(train_df)
# Cleaning maps many raw descriptions onto the same text. drop_duplicates
# returns a new frame, so the result has to be assigned for the duplicates to
# actually disappear.
train_df = train_df.drop_duplicates(keep='first').reset_index(drop=True)
train_df.describe()

Unnamed: 0,Label,Description
count,46505,46505
unique,1971,40026
top,"__label__Bao_đựng,_ốp_lưng_điện_thoại",Bộ Điều Khiển Group Máy Đo Độ Ẩm Vải
freq,358,132


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Features are the cleaned descriptions, targets are the label strings.
X_train = train_df['Description']
y_train = train_df['Label']

# Word-level TF-IDF with the vocabulary capped at 15000 terms; the vectorizer
# is fitted on the training text and reused for every later transform.
tfidf_vect = TfidfVectorizer(max_features=15000, analyzer='word')
X_tfidf_train = tfidf_vect.fit_transform(X_train)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels, then encode them.
encoder = LabelEncoder().fit(y_train)
y_train_n = encoder.transform(y_train)

In [None]:
# Clean the test descriptions with the same character filters applied to the
# training data. The 'Description' column of test_df is overwritten in place.
# Note: no de-duplication happens here; every test row is kept.
remove_unwanted_char(test_df)

# Vietnamese word tokenization is currently disabled:
# vi_tokenizer(test_df)
test_df.describe()

Unnamed: 0,Label,Description
count,14976,14976
unique,1971,12551
top,"__label__Bao_đựng,_ốp_lưng_điện_thoại",Tranh đá
freq,342,45


In [None]:
# Features and targets of the evaluation set
X_test = test_df['Description']
y_test = test_df['Label']

# Reuse the vectorizer and the label encoder fitted on the training data so
# that the test features and targets live in the same spaces as the training ones.
X_tfidf_test = tfidf_vect.transform(X_test)
y_test_n = encoder.transform(y_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from joblib import Parallel, delayed

def train_on_subset(classifier, X_subset, y_subset):
    """Fit a fresh copy of `classifier` on one subset of the training data.

    Parameters
    ----------
    classifier : scikit-learn estimator
        Template estimator. It is cloned, so the object passed in is never
        fitted or modified and can safely be shared between calls.
    X_subset : iterable of str
        Raw descriptions; they are vectorized with the notebook-level
        `tfidf_vect`, which must already be fitted.
    y_subset : array-like
        Targets aligned with `X_subset`.

    Returns
    -------
    estimator
        The fitted clone.
    """
    # Local import keeps this function self-contained within the notebook.
    from sklearn.base import clone

    # Without the clone every subset would fit the very same estimator object;
    # the models only came out distinct when the joblib backend happened to
    # pickle the arguments. Cloning makes the result backend-independent.
    model = clone(classifier)
    X_tfidf_subset = tfidf_vect.transform(X_subset)
    model.fit(X_tfidf_subset, y_subset)
    print('Train complete')
    return model

def predict_bagging(clfs, X_test):
    """Predict by averaging class probabilities over several fitted models.

    Parameters
    ----------
    clfs : iterable
        Fitted models exposing `predict_proba(X)` and `classes_`. The models
        may know different sets of classes.
    X_test : array-like or sparse matrix
        Feature matrix passed unchanged to every model's `predict_proba`.

    Returns
    -------
    pandas.Series
        For each row of `X_test`, the class whose mean probability is highest.
        The mean for a class is taken over the models that know that class.
        The series has a 0..n-1 index.
    """
    # One probability table per model, columns named by that model's classes.
    per_model = [
        pd.DataFrame(model.predict_proba(X_test), columns=model.classes_)
        for model in clfs
    ]
    probas = pd.concat(per_model, axis=1)
    # Average the columns that share a class label. Grouping is done on the
    # transposed frame because `groupby(..., axis=1)` is deprecated in pandas.
    probas = probas.T.groupby(level=0).mean().T
    # Get the label with highest value
    y_pred = probas.idxmax(axis=1)
    return y_pred

def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or an array-like."""
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return np.asarray(data)[positions]


# Define function to train and evaluate model
def train_model_bagging(classifier, X_train, y_train, X_test, y_test, svd=False,
                        num_subsets=8, n_jobs=2, shuffle=True, random_state=42):
    """Train one model per data subset, average their predictions and report scores.

    Parameters
    ----------
    classifier : scikit-learn estimator
        Estimator handed to `train_on_subset` for every subset.
    X_train, y_train : pandas object or array-like
        Training inputs and targets of equal length.
    X_test, y_test : array-like
        Evaluation features (already vectorized) and targets.
    svd : bool, default False
        Only changes the label in the printed report.
    num_subsets : int, default 8
        Number of subsets (and therefore models) to train.
    n_jobs : int, default 2
        Number of joblib workers.
    shuffle : bool, default True
        Shuffle row positions before splitting. The training frame is built by
        stacking sources that are ordered by label, so contiguous slices give
        some models only a narrow band of labels. Pass False to get the former
        contiguous split.
    random_state : int, default 42
        Seed for the shuffle.

    Returns
    -------
    tuple
        `(y_pred, classifiers)`: the ensemble predictions for `X_test` and the
        list of fitted models.
    """
    n_samples = len(X_train)
    if len(y_train) != n_samples:
        raise ValueError('X_train and y_train must have the same length')

    # Work on integer positions so pandas objects and numpy arrays are split
    # the same way, independent of their index labels.
    if shuffle:
        positions = np.random.RandomState(random_state).permutation(n_samples)
    else:
        positions = np.arange(n_samples)
    # Skip empty chunks (fewer rows than subsets) instead of fitting on nothing.
    chunks = [chunk for chunk in np.array_split(positions, num_subsets) if len(chunk)]

    # Train models for each subset of data
    classifiers = Parallel(n_jobs=n_jobs)(
        delayed(train_on_subset)(classifier, _take_rows(X_train, chunk), _take_rows(y_train, chunk))
        for chunk in chunks
    )

    y_pred = predict_bagging(classifiers, X_test)

    print(f"""
Model {classifier.__class__.__name__} with {"svd" if svd else "no svd"}
    Accuracy: {accuracy_score(y_test, y_pred)}
    Macro F1: {f1_score(y_test, y_pred, average='macro')}
====================================================
    """)

    return y_pred, classifiers

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The tree depth is capped because, per the author's observation, training
# without max_depth resulted in trees about 900 levels deep.
rf = RandomForestClassifier(
    max_depth=64,
    max_features='sqrt',
    n_estimators=50,
    random_state=42,
)
y_pred, rfs = train_model_bagging(rf, X_train, y_train_n, X_tfidf_test, y_test_n)


Model RandomForestClassifier with no svd
    Accuracy: 0.7355769230769231
    Macro F1: 0.49079457890374845    
    


In [None]:
import os
import pickle
from datetime import datetime

# Define a function to save trained model for future prediction
def save_model(model, i, directory='data'):
    """Pickle `model` to '<directory>/model<ClassName><i>.pkl'.

    Parameters
    ----------
    model : object
        Any picklable object; its class name becomes part of the file name.
    i : int or str
        Suffix that distinguishes several models of the same class.
    directory : str, default 'data'
        Target folder. It is created, including parents, when missing.
    """
    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, f'model{model.__class__.__name__}{i}.pkl')
    with open(path, 'wb') as handle:
        pickle.dump(model, handle)

# Persist every model of the ensemble, numbered by its position in the list.
for i, fitted_model in enumerate(rfs):
    save_model(fitted_model, i)

In [None]:
# Read the unlabeled descriptions, one per line.
with open('./product_test_unlabeled.txt', 'r', encoding='utf-8') as file:
    X = file.read().split('\n')
# A file that ends with a newline yields one empty trailing entry after the
# split; drop it so no phantom sample is predicted and written out. Blank lines
# inside the file are kept so the output stays aligned with the input lines.
if X and X[-1] == '':
    X.pop()
res = pd.DataFrame({'Description': X})
# Clean the descriptions (the 'Description' column of res is updated in place).
remove_unwanted_char(res)
X_valid = res['Description']

# X_tfidf_valid = tfidf_vect.transform(X_valid)

def batch_generator(data, batch_size):
    """Yield consecutive slices of `data`, each holding at most `batch_size` items.

    The last slice is shorter when `len(data)` is not a multiple of
    `batch_size`; nothing is yielded for empty input.
    """
    yield from (
        data[start:start + batch_size]
        for start in range(0, len(data), batch_size)
    )

# Vectorize and predict in batches of 1000 descriptions, one result per batch.
y_list = [
    predict_bagging(rfs, tfidf_vect.transform(batch))
    for batch in batch_generator(X_valid, batch_size=1000)
]
y_valid_n = pd.concat(y_list)
# Decode the integer predictions back to label strings and put them first.
res.insert(0, 'Label', encoder.inverse_transform(y_valid_n))
# Restore the raw descriptions that were read from the file.
res['Description'] = X

In [None]:
# Show the predicted labels next to the raw descriptions.
res

Unnamed: 0,Label,Description
0,__label__Chìa_lục_giác,Khuôn Lục Giác
1,__label__Tranh_gạch,Tranh gạch 3d phong cảnh thiên nhiên Pita 04
2,__label__Ghế_cafe_(_café),Ghế Nhựa Rosa
3,__label__Xe_quét_rác,Xe quét rác hút bụi Sweepmaster D1200
4,__label__Đầu_ghi_hình_cho_camera_giám_sát,Đầu ghi Goldeye AVR7104
...,...,...
134786,__label__Giày_thể_thao_nam,Giày Nike Zoom Pegasus 34 xám
134787,__label__Nước_hoa_nữ,Bodysuit hoa Elfindoll size 70
134788,"__label__Bao_đựng,_ốp_lưng_điện_thoại",Ốp lưng iphone 6 Plus ( 1807 )
134789,__label__Pallet,Pallet liền khối HITA 20 ( HPL20 - LK )


In [None]:
# Write one "<label> <description>" line per row of the result table.
with open('./res.txt', 'w', encoding='utf-8') as file:
    file.writelines(
        ' '.join(str(value) for value in row) + '\n'
        for row in res.itertuples(index=False)
    )