In [67]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
import requests
import bs4
import scrapy
import random
import time
from nltk.corpus import movie_reviews, stopwords
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

In [66]:
import eli5

Using TensorFlow backend.


In [69]:
from eli5.lime import TextExplainer

Loading the sample submission table:

In [2]:
# Sample submission: one row per test review, to be filled with our predictions.
SAMPLE_SUBMISSION_PATH = 'sample_submission_6.csv'
tags = pd.read_csv(SAMPLE_SUBMISSION_PATH)

In [3]:
# Preview the first rows of the sample submission.
tags.head()

Unnamed: 0,Id,y
0,0,neg
1,1,neg
2,2,neg
3,3,neg
4,4,neg


In [4]:
# (rows, columns) of the sample submission.
tags.shape

(100, 2)

Loading the test reviews

In [5]:
# Read the raw test file line by line. The context manager guarantees the
# handle is closed even if reading fails part-way through.
with open('test_6.csv', encoding='utf-8') as file_obj:
    data_list = list(file_obj)

Cleaning text 

In [9]:
# Re-assemble the file into one string and split it into individual reviews.
# Everything is derived from `data_list`, so re-running this cell always gives
# the same result (no in-place slicing of an already-cleaned list).
raw_text = ''.join(data_list)
text_list = re.split(r'\n</review>\n\n<review>', raw_text)

# The split consumes the tags *between* reviews only: the opening tag is still
# on the first review and the closing tag is still on the last one.
text_list[0] = re.sub(r'^\s*<review>', '', text_list[0])
# Clean the last review from its OWN text (it was previously overwritten with
# a slice of the first review) and do not hard-code the number of reviews.
text_list[-1] = re.sub(r'\s*</review>\s*$', '', text_list[-1])

# Flatten multi-line reviews into a single line each.
text_list = [review.replace('\n', ' ') for review in text_list]

Let's check what we got

We have 100 reviews without any sentiment label

In [15]:
# Number of parsed test reviews.
len(text_list)

100

In [16]:
# One row per test review, in a single unnamed (0) column.
test_reviews = pd.DataFrame(data=text_list)

In [17]:
# Preview the first test reviews.
test_reviews.head()

Unnamed: 0,0
0,"Ужасно слабый аккумулятор, это основной минус ..."
1,ценанадежность-неубиваемостьдолго держит батар...
2,"подробнее в комментариях К сожалению, факт пол..."
3,я любительница громкой музыки. Тише телефона у...
4,"Дата выпуска - 2011 г, емкость - 1430 mAh, тех..."


After reading several reviews, it became clear that they relate to mobile phones, mostly HTC and Nokia models. Therefore, we will scrape phone reviews from Yandex.Market to build a training set.

In [18]:
# Review pages 1..21 for the Nokia Asha 200.
NOKIA_ASHA_200_REVIEWS_URL = 'https://market.yandex.ru/product--telefon-nokia-asha-200/7835734/reviews?hid=91491&page='
links = [NOKIA_ASHA_200_REVIEWS_URL + str(page) for page in range(1, 22)]

In [20]:
# Review pages 1..69 for the HTC Wildfire S.
HTC_WILDFIRE_S_REVIEWS_URL = 'https://market.yandex.ru/product--smartfon-htc-wildfire-s/7023082/reviews?hid=91491&page='
links2 = [HTC_WILDFIRE_S_REVIEWS_URL + str(page) for page in range(1, 70)]

In [22]:
# Review pages 1..30 for the HTC Sensation.
HTC_SENSATION_REVIEWS_URL = 'https://market.yandex.ru/product--smartfon-htc-sensation/7163330/reviews?hid=91491&page='
links3 = [HTC_SENSATION_REVIEWS_URL + str(page) for page in range(1, 31)]

Using BeautifulSoup with some random delay for stability

In [24]:
# CSS class string that marks one review block on a product review page.
REVIEW_ITEM_CLASS = 'n-product-review-item i-bem n-product-review-item_collapsed_yes'


def scrape_reviews(page_links, min_delay, max_delay):
    """Download review pages and extract (rating, text) pairs.

    Parameters
    ----------
    page_links : iterable of str
        URLs of the review pages to fetch.
    min_delay, max_delay : float
        Bounds (seconds) of the random pause made before every request.

    Returns
    -------
    (list of str, list of str)
        Parallel lists: the ``ratingValue`` and ``description`` meta contents
        of every review found. A review missing either value is skipped so
        the two lists always stay aligned.
    """
    page_ratings, page_reviews = [], []
    for link in page_links:
        # Random pause between requests, for stability.
        time.sleep(random.uniform(min_delay, max_delay))
        response = requests.get(link, timeout=30)
        if not response.ok:
            # Previously an error page was parsed silently and yielded nothing.
            print('Skipping {} (HTTP {})'.format(link, response.status_code))
            continue
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        for item in soup.find_all('div', attrs={'class': REVIEW_ITEM_CLASS}):
            rating_tag = item.find('meta', attrs={'itemprop': 'ratingValue'})
            text_tag = item.find('meta', attrs={'itemprop': 'description'})
            if rating_tag is None or text_tag is None:
                continue
            rating, text = rating_tag.attrs.get('content'), text_tag.attrs.get('content')
            if rating is None or text is None:
                continue
            page_ratings.append(rating)
            page_reviews.append(text)
    return page_ratings, page_reviews


ratings = []
train_reviews = []
# (links, min delay, max delay) per product; delays match the original cells.
for page_links, min_delay, max_delay in [(links, 0.5, 2.0), (links2, 0.3, 1.5), (links3, 0.3, 1.5)]:
    product_ratings, product_reviews = scrape_reviews(page_links, min_delay, max_delay)
    ratings.extend(product_ratings)
    train_reviews.extend(product_reviews)

Let's strip from the scraped reviews the section labels ("Достоинства:", "Недостатки:", "Комментарий:") that never appear in the test sample

In [27]:
# Section labels present in the scraped text ("Advantages:", "Disadvantages:",
# "Comment:") that are removed before training.
SECTION_LABELS = ('Достоинства:', 'Недостатки:', 'Комментарий:')


def strip_section_labels(review):
    """Return `review` with every section label removed.

    Labels are removed one after another, in the order of SECTION_LABELS,
    which is the order the three separate passes used before.
    """
    for label in SECTION_LABELS:
        review = review.replace(label, '')
    return review


# Single pass instead of three copy-pasted loops over the same list.
train_reviews = [strip_section_labels(review) for review in train_reviews]

In [30]:
# Training frame: column 0 holds the cleaned review text.
train_df = pd.DataFrame(data=train_reviews)

In [31]:
# Column 1: the scraped rating strings, aligned with the reviews by position.
train_df[1] = ratings

We collect all of this into one dataset. We assume that ratings of 5 and 4 indicate a positive review, and ratings of 3 and below a negative one.

In [32]:
# Binary sentiment label: ratings 4-5 -> 1 (positive), 1-3 -> 0 (negative).
RATING_TO_LABEL = {'5': 1, '4': 1, '3': 0, '2': 0, '1': 0}
# Map from the raw `ratings` list rather than from the column itself, so that
# re-running this cell does not turn already-mapped labels into NaN.
train_df[1] = pd.Series(ratings, index=train_df.index).map(RATING_TO_LABEL)
# A rating outside the mapping would silently become NaN and break training.
assert train_df[1].notna().all(), 'unexpected rating value in scraped data'

In [33]:
# Preview the labelled training data (column 0: text, column 1: label).
train_df.head()

Unnamed: 0,0,1
0,Качественные материалы корпуса и привлекатель...,1
1,"Приятный дизайн,хороший экран,поддержка Java(...",1
2,- Удобная Клавиатура и русская раскладка - 2 ...,1
3,1. 2 симки; 2. НЕ сенсорный (для меня это при...,1
4,1.Кверти-клава 2.Две сим-карты 3.Громкий дина...,1


In [42]:
# Persist the scraped training set so it can be reused without re-scraping.
train_df.to_csv('week6_kaggle.csv')

Now we proceed to building a model, building a pipeline, and selecting parameters using a gridsearch. We use logistic regression and CountVectorizer

In [37]:
def text_classifier2(vectorizer, classifier):
    """Build a two-step pipeline: text vectorizer followed by a classifier.

    The steps are named "vectorizer" and "classifier".
    """
    steps = [
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

In [35]:
# Features (review text) and targets (0/1 sentiment) for model fitting.
data, labels = train_df[0], train_df[1]

In [38]:
# Baseline: unigram+bigram counts with logistic regression, 5-fold CV accuracy.
baseline_pipeline = text_classifier2(
    CountVectorizer(min_df=1, ngram_range=(1, 2)),
    LogisticRegression(class_weight=None),
)
baseline_scores = cross_val_score(baseline_pipeline, data, labels, cv=5, scoring='accuracy')
baseline_scores.mean()



0.7733678086619262

In [39]:
# Pipeline tuned by the grid search below. The grid tries both 'l1' and 'l2'
# penalties, so the solver is pinned to 'liblinear', which supports both;
# relying on the library default is fragile because that default is
# version-dependent (the fitted-model repr in this notebook shows
# solver='warn').
pipeline2 = Pipeline(
    [
        ('vectorizer', CountVectorizer()),
        ('classifier', LogisticRegression(solver='liblinear')),
    ]
)

In [40]:
# Russian stop-word list from NLTK (presumably needs the 'stopwords' corpus
# to be downloaded beforehand — TODO confirm in a fresh environment).
stopWords = stopwords.words('russian')

In [41]:
# Search space for the grid search; keys follow the "<step>__<param>" naming
# of the pipeline steps.
parameters = {
    'vectorizer__min_df': range(1, 10),
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
    'vectorizer__stop_words': [stopWords, None],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__penalty': ['l1', 'l2'],
}

In [159]:
# Exhaustive search over `parameters`, scored by accuracy.
CV2 = GridSearchCV(estimator=pipeline2, param_grid=parameters, scoring='accuracy')

In [160]:
# Run the grid search, then report the best mean CV score and its parameters.
CV2.fit(data, labels)

print(
    'Best score and parameter combination = ',
    CV2.best_score_,
    CV2.best_params_,
    sep='\n',
)



Best score and parameter combination = 
0.7826825127334465
{'classifier__class_weight': None, 'classifier__penalty': 'l2', 'vectorizer__min_df': 8, 'vectorizer__ngram_range': (1, 2), 'vectorizer__stop_words': ['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем',

Train the final model and make a prediction for our test dataset. Note that the model below keeps min_df=1, whereas the grid search above selected min_df=8.

In [43]:
# Final classifier: unigram+bigram counts without Russian stop words,
# followed by logistic regression.
clf = text_classifier2(
    vectorizer=CountVectorizer(min_df=1, ngram_range=(1, 2), stop_words=stopWords),
    classifier=LogisticRegression(class_weight=None),
)

In [44]:
# Fit the final pipeline on the whole scraped training set.
clf.fit(data, labels)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['и',...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [48]:
# Stand-alone vectorizer with the same settings as the one inside `clf`,
# kept separate so it can be pickled on its own.
vectorizer = CountVectorizer(
    min_df=1,
    ngram_range=(1, 2),
    stop_words=stopWords,
)

In [49]:
# Learn the vocabulary from the training reviews.
vectorizer.fit(data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', '...гда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [50]:
# Stand-alone classifier matching the one inside `clf`.
model = LogisticRegression(
    class_weight=None,
)

In [51]:
# Train the stand-alone classifier on the vectorized training reviews.
model.fit(vectorizer.transform(data), labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [52]:
# Serialize the fitted vectorizer; the context manager closes (and flushes)
# the file, which the bare open() call never did.
with open('vectorizer6.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [53]:
# Serialize the fitted classifier; the context manager closes (and flushes)
# the file, which the bare open() call never did.
with open('model6.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [45]:
# Overwrite the placeholder 'y' column with the numeric predictions (0/1)
# for the test reviews.
tags['y'] = clf.predict(text_list)

In [58]:
# Vectorize a single test review (wrapped in a list: transform expects an
# iterable of documents).
vectorized = vectorizer.transform([text_list[0]])

In [59]:
# Predicted class of that single review.
model.predict(vectorized)[0]

0

In [68]:
# Wrap the stand-alone vectorizer and classifier into one pipeline so that it
# exposes predict_proba on raw text (used by the explainer below).
pipe = make_pipeline(vectorizer, model)
# NOTE(review): this presumably refits the same `vectorizer` and `model`
# objects that were fitted and pickled above — confirm that is intended.
pipe.fit(data, labels)



Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [93]:
# Serialize the fitted pipeline; the context manager closes (and flushes)
# the file, which the bare open() call never did.
with open('pipe6.pkl', 'wb') as pipe_file:
    pickle.dump(pipe, pipe_file)

Finally, let's try to visualize the words that contribute the most to the model

In [84]:
# Explain the pipeline's prediction for one test review with eli5's
# TextExplainer (seeded for repeatability).
explained_doc = text_list[4]
te = TextExplainer(random_state=42)
te.fit(explained_doc, pipe.predict_proba)
res = te.show_prediction(top=25, target_names=['negative', 'positive'])

In [91]:
# Check what kind of object show_prediction returned.
type(res)

IPython.core.display.HTML

In [96]:
# Render the explanation: top 25 contributing features per class.
te.show_prediction(top=25, target_names=['negative', 'positive'])

Contribution?,Feature
+4.437,Highlighted in text (sum)
… 59 more positive …,… 59 more positive …
… 35 more negative …,… 35 more negative …


In [None]:
def print_prediction(doc, target_names=('negative', 'positive')):
    """Print the pipeline's class probabilities for one document.

    Parameters
    ----------
    doc : str
        Raw review text.
    target_names : sequence of str
        Display names paired, in order, with the columns of
        ``pipe.predict_proba``. The default matches the names used for the
        explainer in this notebook.
    """
    y_pred = pipe.predict_proba([doc])[0]
    for target, prob in zip(target_names, y_pred):
        print("{:.3f} {}".format(prob, target))

# The previous version read `twenty_train` / `twenty_test`, which are never
# defined in this notebook; use one of our own test reviews instead.
doc = text_list[0]
print_prediction(doc)

In [61]:
# First row of the fitted coefficient matrix (one weight per vocabulary
# feature).
coeffs = model.coef_[0]

In [64]:
# Five features with the most negative coefficients.
# Newer scikit-learn releases replace get_feature_names with
# get_feature_names_out; support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# argsort yields distinct positions, so tied coefficient values no longer
# resolve to the same (first) feature as list.index() did. A stable sort keeps
# the lower index first among ties.
most_negative = np.argsort(coeffs, kind='mergesort')[:5]
[feature_names[idx] for idx in most_negative]

['глючит', 'постоянно', 'сенсор', 'часто', 'использования']

In [57]:
# Raw numeric predictions (0/1) for all test reviews.
clf.predict(text_list)

array([0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0], dtype=int64)

In [46]:
# Convert numeric predictions to the submission labels. Mapping fresh
# predictions (instead of the column's current contents) keeps this cell
# idempotent: re-running it no longer turns 'neg'/'pos' into NaN.
tags['y'] = pd.Series(clf.predict(text_list), index=tags.index).map({0: 'neg', 1: 'pos'})

In [47]:
# Preview the submission with the predicted labels.
tags.head()

Unnamed: 0,Id,y
0,0,neg
1,1,pos
2,2,neg
3,3,neg
4,4,pos


In [191]:
# Write the submission file; index=False keeps only the Id and y columns.
tags.to_csv('sen_comp_sub6.csv', index=False)

Несмотря на то, что спарсили не так много данных (меньше 600 отзывов) и модель была достаточно простой, получаем качество в 88% на кегле. Если бы не получили, можно было бы попробовать спарсить больше данных и/или обучить более сложную модель, например, рекуррентную нейросеть. Но этого оказалось достаточно в рамках поставленной задачи, поэтому остановимся.