In [90]:
import codecs
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from imblearn.under_sampling import RandomUnderSampler

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Egor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Egor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Смотрим на тестовые данные

In [91]:
with codecs.open('test.csv', 'r', encoding='utf-8') as f:
    test_text = f.read()
    soup = BeautifulSoup(test_text, 'html.parser')
    test_reviews = soup.findAll('review')
    test_list = [test_review.text for test_review in test_reviews]

In [92]:
test = pd.DataFrame(test_list, columns=['text'])
test

Unnamed: 0,text
0,"Ужасно слабый аккумулятор, это основной минус ..."
1,ценанадежность-неубиваемостьдолго держит батар...
2,"подробнее в комментариях\nК сожалению, факт по..."
3,я любительница громкой музыки. Тише телефона у...
4,"Дата выпуска - 2011 г, емкость - 1430 mAh, тех..."
...,...
95,"Нет передней камеры, внутренняя память очень м..."
96,"Звук при прослушивание музыки хороший,не глючи..."
97,Очень маленькая память забита вшитыми и соверш...
98,"Удобный корпус,стандартное меню нокиа,камера д..."


# Загружаем данные для обучения

In [93]:
X_test = test['text'].values

In [94]:
train = pd.read_json('train_yandex_market.json', lines=True, encoding="utf8")

In [95]:
train['rating'].value_counts()

5    46100
4     7957
3     3850
1     3564
2     2017
Name: rating, dtype: int64

# Удаляем лишнее из отзывов

In [96]:
def cleaning(text):
    try:
        text = re.sub(r'Достоинства', ' ', text)
        text = re.sub(r'Недостатки', ' ', text)
        text = re.sub(r'Комментарий', ' ', text)
        text = re.sub(r'менее месяца', ' ', text)
        text = re.sub(r'более года', ' ', text)
        text = re.sub(r'несколько месяцев', ' ', text)
        text = ' '.join(text.split())
        return text
    except:
        return text

In [97]:
train['text'] = train.apply(lambda x: cleaning(x['text']), axis=1)
train['label'] = train['rating'].apply(lambda x: int(x == 5))

# Балансируем выборку

In [98]:
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(pd.DataFrame(train['text']), train['label'])
X_resampled, y_resampled = X_resampled.values.flatten(), y_resampled.values.flatten()

# Производим подбор оптимальных гиперпараметров и модели

In [99]:
def make_pipeline(vectorizer, transformer, classifier):
    return Pipeline([
        ('vectorizer', vectorizer),
        ('transformer', transformer),
        ('classifier', classifier)
    ])

In [100]:
def make_estimator(classifier, params_grid, scorer, data, labels):
    pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), classifier)
    grid_cv = RandomizedSearchCV(pipeline, params_grid, scoring=scorer, cv=5, random_state=42, n_iter=100, verbose=True, n_jobs=-2)
    grid_cv.fit(data, labels)
    return grid_cv

In [101]:
parameters_grid_vectorizer = {
    'vectorizer__ngram_range': [(1, 2), (1, 3)],
    'vectorizer__min_df': [1, 2, 3],
    'vectorizer__max_df': [0.95, 1.0],
}

In [102]:
parameters_grid_transformer = {
    'transformer__use_idf': [False, True],
    'transformer__sublinear_tf': [True, False],
    'transformer__smooth_idf': [False, True],
    'transformer__norm': ['l2', 'l1']
}

In [103]:
#LinearSVC
parameters_grid_lsvc = {
    'classifier__loss': ['hinge', 'squared_hinge'],
    'classifier__max_iter': np.arange(200, 1000, 200),
    'classifier__tol': [1e-5, 1e-4, 1e-3],
    'classifier__C': [0.7, 0.9, 1.1, 1.3]
}
parameters_grid_lr = {
    'classifier__C' : [0.8, 0.9, 1.1, 1.2],
    'classifier__max_iter' : [45, 50, 75, 90],
    'classifier__solver' : ['newton-cg', 'lbfgs','sag'],
    'classifier__penalty' : ['l2'],
}

In [104]:
%%time
LR_grid_search = make_estimator(LogisticRegression(random_state=42),
                                {**parameters_grid_vectorizer, **parameters_grid_transformer, **parameters_grid_lr},
                                'accuracy',
                                np.concatenate([X_resampled[:900], X_resampled[18000:18900]]),
                                np.concatenate([y_resampled[:900], y_resampled[18000:18900]]))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
CPU times: total: 3.98 s
Wall time: 1min 3s


In [105]:
print(LR_grid_search.best_score_, LR_grid_search.best_params_)

0.78 {'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 2, 'vectorizer__max_df': 1.0, 'transformer__use_idf': True, 'transformer__sublinear_tf': True, 'transformer__smooth_idf': False, 'transformer__norm': 'l2', 'classifier__solver': 'sag', 'classifier__penalty': 'l2', 'classifier__max_iter': 90, 'classifier__C': 1.1}


# Обучаем модель

In [106]:
%%time
LR = make_pipeline(CountVectorizer(ngram_range=(1, 3), min_df=3, max_df=0.95),
                   TfidfTransformer(smooth_idf=False),
                   LogisticRegression(solver='lbfgs', max_iter=50, C=1.1))
LR.fit(X_resampled, y_resampled)

CPU times: total: 12.9 s
Wall time: 12.9 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(max_df=0.95, min_df=3, ngram_range=(1, 3))),
                ('transformer', TfidfTransformer(smooth_idf=False)),
                ('classifier', LogisticRegression(C=1.1, max_iter=50))])

In [107]:
pred = LR.predict(X_test)

In [108]:
Id = range(0, 100)
submission = pd.DataFrame({"Id": Id, "y": pd.Series(pred).apply(lambda x: 'neg' if x == 0 else 'pos')})
submission.to_csv('submission.csv', index=False)

# Сохраняем модель

In [109]:
import pickle
with open('my_LR_classifier.pkl', 'wb') as f:
    pickle.dump(LR, f)