In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing as pre
from scipy.sparse import csr_matrix
import scipy as sp
from tqdm import tqdm_notebook as tqdm
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

%load_ext nb_black
%matplotlib inline

<IPython.core.display.Javascript object>

In [2]:
df_train = pd.read_csv("train.csv").set_index("item_id")
df_test = pd.read_csv("test.csv").set_index("item_id")
df_category = pd.read_csv("category.csv")
df_train.head(5)

Unnamed: 0_level_0,title,description,price,category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Картина,Гобелен. Размеры 139х84см.,1000.0,19
1,Стулья из прессованной кожи,Продам недорого 4 стула из светлой прессованно...,1250.0,22
2,Домашняя мини баня,"Мини баня МБ-1(мини сауна), предназначена для ...",13000.0,37
3,"Эксклюзивная коллекция книг ""Трансаэро"" + подарок","Продам эксклюзивную коллекцию книг, выпущенную...",4000.0,43
4,Ноутбук aser,Продаётся ноутбук ACER e5-511C2TA. Куплен в ко...,19000.0,1


<IPython.core.display.Javascript object>

In [25]:
y = df_train.category_id

X_train_title = df_train.title
X_train_desc = df_train.description
X_train_num = pre.normalize(pre.scale(df_train.price.values.reshape(-1, 1)))

X_test_title = df_test.title
X_test_desc = df_test.description
X_test_num = pre.normalize(pre.scale(df_test.price.values.reshape(-1, 1)))

<IPython.core.display.Javascript object>

In [6]:
mystem = Mystem()
russian_stopwords = stopwords.words("russian")


def preprocess_text(text):
    temp = []
    for i in text:
        tokens = mystem.lemmatize(str(i.lower()))
        tokens = [
            token
            for token in tokens
            if token not in russian_stopwords
            and token != " "
            and token.strip() not in punctuation
        ]

        i = " ".join(tokens)
        temp.append(i)
    return temp

<IPython.core.display.Javascript object>

In [7]:
%%time
train_clean_title = preprocess_text(tqdm(X_train_title))
train_clean_desc = preprocess_text(tqdm(X_train_desc))
test_clean_title = preprocess_text(tqdm(X_test_title))
test_clean_desc = preprocess_text(tqdm(X_test_desc))

HBox(children=(IntProgress(value=0, max=489517), HTML(value='')))




HBox(children=(IntProgress(value=0, max=489517), HTML(value='')))




HBox(children=(IntProgress(value=0, max=243166), HTML(value='')))




HBox(children=(IntProgress(value=0, max=243166), HTML(value='')))


CPU times: user 6min 18s, sys: 33.2 s, total: 6min 51s
Wall time: 34min 58s


<IPython.core.display.Javascript object>

In [8]:
from sklearn.feature_extraction.text import (
    TfidfTransformer,
    CountVectorizer,
    TfidfVectorizer,
)
from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

<IPython.core.display.Javascript object>

In [16]:
vectorizer = TfidfVectorizer()

<IPython.core.display.Javascript object>

In [17]:
train_vec_title = vectorizer.fit_transform(train_clean_title)
train_vec_desc = vectorizer.transform(train_clean_desc)

test_vec_title = vectorizer.transform(test_clean_title)
test_vec_desc = vectorizer.transform(test_clean_desc)

<IPython.core.display.Javascript object>

In [26]:
(text_train_desc, text_val_desc) = train_test_split(
    train_vec_desc, test_size=0.3, random_state=0
)
(text_train_title, text_val_title) = train_test_split(
    train_vec_title, test_size=0.3, random_state=0
)
(num_train, num_val) = train_test_split(X_train_num, test_size=0.3, random_state=0)

(y_train, y_val) = train_test_split(y, test_size=0.3, random_state=0)

<IPython.core.display.Javascript object>

In [19]:
clean_text_train = sp.sparse.hstack((text_train_title, text_train_desc))
X_prepared_train = pre.normalize(
    sp.sparse.hstack((clean_text_train, csr_matrix(num_train)))
)

<IPython.core.display.Javascript object>

In [20]:
clean_text_val = sp.sparse.hstack((text_val_title, text_val_desc))
X_prepared_val = pre.normalize(sp.sparse.hstack((clean_text_val, csr_matrix(num_val))))

<IPython.core.display.Javascript object>

In [21]:
clean_text_test = sp.sparse.hstack((test_vec_title, test_vec_desc))
X_prepared_test = pre.normalize(
    sp.sparse.hstack((clean_text_test, csr_matrix(X_test_num)))
)

<IPython.core.display.Javascript object>

In [27]:
%%time
for clf in [LinearSVC, SGDClassifier, LogisticRegression]:
    print(clf)
    print(cross_val_score(clf(), X_prepared_train, y_train, scoring="accuracy").mean())

<class 'sklearn.svm.classes.LinearSVC'>




0.8869087060963411
<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>




0.8476658652455846
<class 'sklearn.linear_model.logistic.LogisticRegression'>




0.8701633198680092
CPU times: user 34min 39s, sys: 18.9 s, total: 34min 58s
Wall time: 21min 10s


<IPython.core.display.Javascript object>