In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreychubin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Подготавливаю кастомный токенизатор и очистку текста от артефактов HTML

In [2]:
# Splits on every single non-word character; the capturing group makes
# re.split() return the separators as well, so punctuation survives as tokens.
GROUPING_SPACE_REGEX = re.compile(r'([^\w]|[+])', re.UNICODE)

def simple_word_tokenizer(text, _split=GROUPING_SPACE_REGEX.split):
    """Lower-case ``text`` and split it into word and punctuation tokens.

    Args:
        text: the string to tokenize.
        _split: splitting callable, bound once at definition time to
            ``GROUPING_SPACE_REGEX.split``; not meant to be overridden.

    Returns:
        A list of non-empty, non-whitespace tokens in their original order.
        Each non-word, non-whitespace character is returned as its own token.
    """
    # re.split() yields empty strings between adjacent separators and the
    # whitespace separators themselves; both are discarded here.
    return [t for t in _split(text.lower()) if t and not t.isspace()]

# Lemmatize with the morphological analyzer (reduce each word to its normal
# form) and drop short tokens up front.

def token_r(text):
    """Tokenize ``text`` and return the normal form of every token of length >= 4.

    Tokens come from ``simple_word_tokenizer``; for each kept token the
    ``normal_form`` of the first parse returned by ``morph.parse`` is used.
    """
    lemmas = []
    for token in simple_word_tokenizer(text):
        # Tokens shorter than four characters are skipped before parsing.
        if len(token) < 4:
            continue
        lemmas.append(morph.parse(token)[0].normal_form)
    return lemmas


def souping(text):
    """Return the text content of an HTML fragment with the markup removed.

    Args:
        text: a string that may contain HTML tags (e.g. ``<br />``).

    Returns:
        The text extracted by ``BeautifulSoup.get_text()``.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    # 'html.parser' ships with Python.
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

In [3]:
# Read the tab-separated training file, then replace every review
# with its HTML-stripped text.
df_train = pd.read_csv('labeledTrainData.tsv', sep='\t')
df_train['review'] = df_train['review'].map(souping)

In [4]:
# Show the first rows of the training frame as a quick check of the loaded data.
df_train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# Read the tab-separated test file, then replace every review
# with its HTML-stripped text.
df_test = pd.read_csv('testData.tsv', sep='\t')
df_test['review'] = df_test['review'].map(souping)

#### Использую TF-IDF вместе с кастомным токенизатором + стоп-слова

In [6]:
# TF-IDF features built with the custom lemmatizing tokenizer.
# stop_words is passed as a list because scikit-learn documents this parameter
# as 'english', a list or None; sorting keeps the argument deterministic.
cv = TfidfVectorizer(tokenizer=token_r, stop_words=sorted(stops), sublinear_tf=True)

# Keep the matrices in the sparse form returned by the vectorizer: densifying
# a documents-by-vocabulary matrix with .toarray() costs a lot of memory,
# while train_test_split and LinearSVC both accept sparse input.
matrix = cv.fit_transform(df_train.review)

# Only transform the test reviews, so they share the vocabulary and IDF
# weights fitted on the training reviews.
matrix_test = cv.transform(df_test.review)

#### Так как данные теперь довольно сильно разрежены, использую линейную ML-модель, в данном случае — метод опорных векторов

In [7]:
# Labels for the training reviews (copied so df_train itself is not touched).
target = df_train.sentiment.copy()

# Hold out part of the labelled data for validation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(matrix, target, random_state=0)

# Seed the classifier as well: LinearSVC shuffles the data in its dual solver,
# so without random_state the score can change between runs.
svc = svm.LinearSVC(max_iter=2000, random_state=0)

svc.fit(X_train, y_train)

# Predictions on the held-out part, scored in the next cell.
y_pred = svc.predict(X_test)

#### Проверяю модель по accuracy score, так как наивысшая accuracy - цель задания

In [8]:
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# does not change, but the call now matches the documented argument order.
accsc = accuracy_score(y_test, y_pred)

print(accsc)

0.88912


#### Применяю модель к тестовым данным и записываю результат в требуемом формате

In [9]:
# Predict a label for every test review and store it next to the review id.
y_pred_test = svc.predict(matrix_test)
df_test['sentiment'] = y_pred_test

# Write only the id and the predicted sentiment, without the index column.
df_test[['id', 'sentiment']].to_csv('submission.csv', index=False)

### Итог: 0.86468 на public leaderboard без использования "тяжёлых" алгоритмов