In [1]:
import json 
import glob
import re
from typing import Dict, Tuple, List, Callable, Union

import cloudpickle
import numpy as np
import pandas as pd 
from scipy.stats import ttest_rel
from scipy.sparse import csr_matrix, hstack
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier

from catboost import CatBoostClassifier

from nltk import ngrams, word_tokenize
from nltk.stem import SnowballStemmer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm



from warnings import filterwarnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and pandas
# chained-assignment warnings — consider narrowing the filter.
filterwarnings('ignore')

# Seed passed as `random_state` to the SGD classifiers below.
SEED = 42

# Load and clean data

In [2]:
# Dataset roots. The trailing slash matters: `load_data` appends the
# sentiment sub-folder name directly to these strings.
train_path = 'data/train/'
test_path = 'data/test/'

def txt_to_df(folder: str) -> pd.DataFrame:
    """Read every ``*.txt`` review in ``folder`` into a DataFrame.

    The review id and rating are the two integers found in the file name
    (e.g. ``123_7.txt`` -> id 123, rating 7).

    Parameters
    ----------
    folder : str
        Directory that directly contains the review files.

    Returns
    -------
    pd.DataFrame
        Indexed by ``id`` with columns ``text`` (file content) and ``rating``.

    Raises
    ------
    ValueError
        If a file name does not contain exactly two integers.
    """
    digits = re.compile(r'\d+')
    # glob gives no ordering guarantee; sort so repeated runs read the files
    # in the same order.
    file_paths = sorted(glob.glob(folder + '/*.txt'))
    data = []
    for path in tqdm(file_paths):
        # Look at the file name only: digits in a parent directory
        # (e.g. ``aclImdb_v1/``) must not be mistaken for the id or rating.
        file_name = re.split(r'[\\/]', path)[-1]
        numbers = digits.findall(file_name)
        if len(numbers) != 2:
            raise ValueError(f'Cannot parse review id and rating from {path!r}')
        id_review, rating = map(int, numbers)
        with open(path, encoding='utf-8') as file:
            review = file.read()
        data.append((id_review, review, rating))

    return pd.DataFrame(data=data, columns=['id', 'text', 'rating']).set_index('id')
    
def load_data(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the train and test reviews from their sentiment sub-folders.

    Parameters
    ----------
    train_path : str
        Directory with the ``neg``, ``pos`` and ``unsup`` sub-folders.
    test_path : str
        Directory with the ``neg`` and ``pos`` sub-folders.

    Both paths must end with a path separator, because the sub-folder name
    is appended to them directly.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        ``(train, test)``, each sorted by review id, with the columns produced
        by ``txt_to_df`` plus a ``sentiment`` column holding the sub-folder name.
    """
    def load_split(path: str, sentiments: List[str]) -> pd.DataFrame:
        # Collect one frame per sentiment folder and concatenate once:
        # `DataFrame.append` no longer exists in pandas >= 2.0 and copied the
        # accumulated frame on every call.
        frames = []
        for sentiment in sentiments:
            res = txt_to_df(path + sentiment)
            res['sentiment'] = sentiment
            frames.append(res)
        return pd.concat(frames).sort_index()

    print('Load train')
    train = load_split(train_path, ['neg', 'pos', 'unsup'])

    print('Load test')
    test = load_split(test_path, ['neg', 'pos'])

    return train, test

In [3]:
# Read all review files from disk into the train and test frames.
train, test = load_data(train_path, test_path)

Load train


100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:01<00:00, 8194.93it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:01<00:00, 8322.00it/s]
100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [00:04<00:00, 12304.00it/s]


Load test


100%|█████████████████████████████████████████████████████████████████████████| 12500/12500 [00:00<00:00, 13781.81it/s]
100%|█████████████████████████████████████████████████████████████████████████| 12500/12500 [00:00<00:00, 13339.96it/s]


In [4]:
# Dimensions of the training frame, followed by a preview of its first rows.
print(train.shape)
train.head()

(75000, 3)


Unnamed: 0_level_0,text,rating,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Story of a man who has unnatural feelings for ...,3,neg
0,Bromwell High is a cartoon comedy. It ran at t...,9,pos
0,"I admit, the great majority of films released ...",0,unsup
1,"If you like adult comedy cartoons, like South ...",7,pos
1,This is a very strange film that was long thou...,0,unsup


In [5]:
# Dimensions of the test frame, followed by a preview of its first rows.
print(test.shape)
test.head()

(25000, 3)


Unnamed: 0_level_0,text,rating,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Once again Mr. Costner has dragged out a movie...,2,neg
0,I went and saw this movie last night after bei...,10,pos
1,My boyfriend and I went to watch The Guardian....,10,pos
1,This is a pale imitation of 'Officer and a Gen...,3,neg
2,My yardstick for measuring a movie's watch-abi...,7,pos


In [6]:
# Number of training reviews per sentiment folder, including the unlabelled 'unsup' group.
train.sentiment.value_counts()

unsup    50000
neg      12500
pos      12500
Name: sentiment, dtype: int64

In [7]:
# Rating distribution among the labelled ('neg' / 'pos') training reviews only.
train[train.sentiment != 'unsup'].rating.value_counts()

1     5100
10    4732
8     3009
4     2696
7     2496
3     2420
2     2284
9     2263
Name: rating, dtype: int64

In [8]:
# Remove duplicate rows.
# NOTE(review): rows are compared on their column values (text, rating,
# sentiment); the `id` index does not take part in the comparison.
train.drop_duplicates(inplace=True)

In [9]:
# Очистка текста
def clean_text(documents: np.ndarray, show_progress: bool = False) -> np.ndarray:
    """Normalise raw review texts: letters only, lower-case, Snowball-stemmed.

    Parameters
    ----------
    documents : np.ndarray
        One-dimensional array of strings. It is not modified.
    show_progress : bool
        Show a tqdm progress bar while processing.

    Returns
    -------
    np.ndarray
        Object array of the same length; each item is the stemmed tokens of
        the corresponding document joined by single spaces.
    """
    snowball = SnowballStemmer(language='english')
    non_letters = re.compile(r'[^a-zA-Z]')
    # Write into a fresh object array. Assigning into a copy of a fixed-width
    # string array (dtype '<U..') would silently truncate any cleaned text
    # longer than the original item width.
    cleaned = np.empty(len(documents), dtype=object)
    indices = range(len(documents))
    if show_progress:
        indices = tqdm(indices)
    for i in indices:
        # For tweet sentiment analysis emoticons were an important feature, so
        # keeping letters only was not an option there. In this task only
        # letters are kept, because the quality does not change.
        text = non_letters.sub(' ', documents[i])
        # Lower-case and split into word tokens.
        doc = word_tokenize(text.lower())
        # Stemming. Removing stop words was also tried, but it made the
        # quality worse.
        tokens = [snowball.stem(token) for token in doc]
        # Back to a single string.
        cleaned[i] = " ".join(tokens)
    return cleaned

In [11]:
# Store the cleaned (letters-only, stemmed) version of every review in a new column; progress bar enabled.
train['clean_text'] = clean_text(train.text.values, True)

100%|███████████████████████████████████████████████████████████████████████████| 74412/74412 [03:20<00:00, 372.03it/s]


In [13]:
# Labelled reviews. `.copy()` makes this an independent frame, so replacing
# its `sentiment` column later neither triggers pandas' SettingWithCopyWarning
# nor depends on view/copy semantics of the filtered selection.
train_sup = train[train.sentiment != 'unsup'].copy()
# Unlabelled reviews; only read afterwards.
train_unsup = train[train.sentiment == 'unsup']

In [14]:
# Encode the label as 1 = 'pos', 0 = 'neg'.
# The guard makes the cell safe to re-run: once the column is numeric,
# comparing it with 'pos' again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(train_sup.sentiment):
    # `assign` builds a new frame instead of writing into a filtered slice of `train`.
    train_sup = train_sup.assign(sentiment=(train_sup.sentiment == 'pos').astype(int))

# TF-IDF + Linear Model

In [15]:
# Learn the 1- to 3-gram vocabulary and IDF weights from all cleaned reviews,
# unlabelled and labelled alike, then report the vocabulary size.
tf_idf = TfidfVectorizer(
    min_df=100,
    max_df=0.7,
    ngram_range=(1, 3),
    decode_error='ignore',
).fit(pd.concat([train_unsup['clean_text'], train_sup['clean_text']]))
len(tf_idf.vocabulary_)

32436

In [16]:
# TF-IDF features and both targets for the labelled reviews.
X_tf_idf = tf_idf.transform(train_sup['clean_text'])
y_sentiment, y_rating = train_sup['sentiment'], train_sup['rating']

In [17]:
# Linear baseline: elastic-net regularised SGD classifier with balanced class weights.
lin_model = SGDClassifier(penalty='elasticnet', random_state=SEED, class_weight='balanced')

# Search the regularisation strength on a log scale from 1e-10 to 1e1.
params = {'alpha': np.logspace(-10, 1)}
grid = GridSearchCV(lin_model, param_grid=params, scoring='f1_weighted',
                   n_jobs=-1, cv=5, verbose=True)
# First search: binary sentiment target.
grid.fit(X_tf_idf, y_sentiment)
print('Best for sentiment:')
print(grid.best_params_, grid.best_score_)

# Second search with the same grid object: rating target.
grid.fit(X_tf_idf, y_rating)
print('Best for rating')
print(grid.best_params_, grid.best_score_)
# alpha hardly differs from the default value, so 1e-4 will be used

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best for sentiment:
{'alpha': 6.866488450042999e-05} 0.8830966164549062
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best for rating
{'alpha': 4.094915062380427e-05} 0.3637562810865326


# Doc2Vec + CatBoost

In [18]:
vector_size = 300
# One TaggedDocument per labelled review, tagged with its position in `train_sup`.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_sup.clean_text.str.split())]
# Pass the notebook-wide SEED so the embedding initialisation is fixed, as it
# already is for the SGD models.
# NOTE: per the gensim documentation, fully deterministic training additionally
# requires `workers=1` and a fixed PYTHONHASHSEED.
doc2vec = Doc2Vec(documents=documents,
                   vector_size=vector_size, window=5, min_count=5, seed=SEED)

# Number of words kept in the vocabulary.
len(doc2vec.wv.index_to_key)

19523

In [19]:
def documents_to_vectors(documents: pd.Series, show_progress: bool = False) -> np.array:
    """Infer one Doc2Vec vector per document with the global ``doc2vec`` model.

    Each document is split on whitespace before inference. With
    ``show_progress=True`` a tqdm bar is displayed. The result is an array
    holding one inferred vector per input document, in input order.
    """
    token_lists = documents.str.split()
    if show_progress:
        token_lists = tqdm(token_lists)
    return np.array([doc2vec.infer_vector(tokens) for tokens in token_lists])

In [20]:
# Doc2Vec feature matrix for the labelled reviews (one inferred vector per review); progress bar enabled.
X_doc2vec = documents_to_vectors(train_sup.clean_text, True)

100%|███████████████████████████████████████████████████████████████████████████| 24905/24905 [01:44<00:00, 237.46it/s]


In [25]:
# Gradient-boosting model for the Doc2Vec features; only `verbose=False` is set, everything else is left at CatBoost's defaults.
boost_model = CatBoostClassifier(verbose=False)

# Compare Models

In [None]:
# So, we have two models — let's compare them with cross-validation.
def cross_val_score(
    model: Callable,
    X: np.ndarray,
    y: np.ndarray,
    cv: Union[int, Tuple[int, int]],
    random_state: int = 42,
    show_progress: bool = False,
) -> np.ndarray:
    """
    Weighted-F1 scores of ``model`` over stratified cross-validation folds.

    Parameters
    ----------
    model: Callable :
        estimator exposing ``fit(X, y)`` and ``predict(X)``
        (e.g. SGDClassifier). The same instance is re-fitted on every fold,
        so it is left fitted on the last training split.

    X: np.ndarray :
        feature matrix supporting row selection with an integer index array
        (a NumPy array or a SciPy CSR matrix)

    y: np.ndarray :
        class labels, one per row of ``X``

    cv Union[int, Tuple[int, int]]:
        number of folds or (n_folds, n_repeats)
        if int, then shuffled StratifiedKFold is used
        if tuple, then RepeatedStratifiedKFold is used

    random_state: int :
        (Default value = 42)
        random state for cross-validation

    show_progress: bool :
        (Default value = False)
        show a tqdm progress bar over the folds

    Returns
    -------
    np.ndarray :
        cross-validation scores, one per fold
        (n_folds, or n_folds * n_repeats for repeated CV)

    """
    # The fold indices are positional. Converting to an array keeps a pandas
    # Series from interpreting them as index labels.
    y = np.asarray(y)

    # isinstance also accepts NumPy integers, which `type(cv) == int` rejected.
    if isinstance(cv, (int, np.integer)):
        kf = StratifiedKFold(n_splits=cv, random_state=random_state, shuffle=True)
    else:
        kf = RepeatedStratifiedKFold(n_splits=cv[0], n_repeats=cv[1], random_state=random_state)

    splits = kf.split(X, y)
    if show_progress:
        # `split` is a generator; give tqdm the fold count so it can draw a
        # real progress bar instead of a bare counter.
        splits = tqdm(splits, total=kf.get_n_splits(X, y))

    scores = []
    for train_ind, test_ind in splits:
        model.fit(X[train_ind], y[train_ind])
        y_pred = model.predict(X[test_ind])
        scores.append(f1_score(y[test_ind], y_pred, average='weighted'))

    return np.array(scores)

In [None]:
def compare_two_models(models: Dict[str, Callable],
                   X_data: Dict[str, np.ndarray], y: np.ndarray, cv: Tuple[int, int] = (3, 10),
                   alpha: float = 0.05) -> Dict[str, np.ndarray]:
    """Cross-validate two models and compare their scores with a paired t-test.

    Parameters
    ----------
    models : Dict[str, Callable]
        Exactly two estimators keyed by a display name.
    X_data : Dict[str, np.ndarray]
        Feature matrix for each model, keyed by the same names as ``models``.
    y : np.ndarray
        Target shared by both models.
    cv : Tuple[int, int]
        ``(n_folds, n_repeats)`` forwarded to ``cross_val_score``.
    alpha : float
        Significance level for the t-test.

    Returns
    -------
    Dict[str, np.ndarray]
        Per-fold scores for each model name.

    Raises
    ------
    ValueError
        If ``models`` does not contain exactly two entries.
    KeyError
        If a model has no matching entry in ``X_data``.

    Notes
    -----
    Both models are scored with the same ``y``, ``cv`` and default
    ``random_state``, which is what the fold-by-fold pairing of
    ``ttest_rel`` relies on.
    NOTE(review): scores from repeated CV share training data, so the
    p-value is presumably optimistic — treat it as indicative only.
    """
    # Validate before the (potentially very long) training loop: ttest_rel
    # takes exactly two samples, so any other count can only fail at the end.
    if len(models) != 2:
        raise ValueError(f'compare_two_models expects exactly 2 models, got {len(models)}')
    missing = [name for name in models if name not in X_data]
    if missing:
        raise KeyError(f'No feature matrix in X_data for: {missing}')

    all_scores = {}
    for name, model in models.items():
        all_scores[name] = cross_val_score(model=model, X=X_data[name],
                                           y=y, cv=cv, show_progress=True)

    _, p_value = ttest_rel(*all_scores.values())

    if p_value < alpha:
        print('Модели значимо различаются между собой!!!')
    else:
        print('Модели значимо НЕ различаются между собой')

    print('Средние значения на кроссвалидации...')
    print(*[f'{k}: {round(v.mean(), 4)}' for k, v in all_scores.items()], sep='\n')

    return all_scores

# Display names double as the keys that tie each estimator to its feature matrix.
name_models = ['tf-idf +  sgd', 'doc2vec + catboost']
models = {
    name_models[0]: lin_model,
    name_models[1]: boost_model,
}
X_data = {
    name_models[0]: X_tf_idf,
    name_models[1]: X_doc2vec,
}

In [32]:
# Compare both pipelines on the binary sentiment target using the default cv=(3, 10).
scores_sentiment = compare_two_models(models, X_data, y_sentiment.values)

30it [00:28,  1.06it/s]
30it [15:00, 30.00s/it]

Модели значимо различаются между собой!!!
Средние значения на кроссвалидации...
tf-idf +  sgd: 0.8932
doc2vec + catboost: 0.8322





In [36]:
# Fit both models on the sentiment target so their predictions can serve as a feature.
models[name_models[0]].fit(X_data[name_models[0]], y_sentiment)
models[name_models[1]].fit(X_data[name_models[1]], y_sentiment)

# The predicted sentiment is used as an extra feature for rating prediction.
pred_lin = csr_matrix(models[name_models[0]].predict(X_data[name_models[0]]).reshape(-1, 1))
pred_boost = csr_matrix(models[name_models[1]].predict(X_data[name_models[1]]).reshape(-1, 1))

# Keep the augmented matrices in a separate dict. Overwriting `X_data` in place
# made this cell append one more column on every re-run and left the sentiment
# comparison above with the wrong inputs if it was executed again.
X_data_rating = {
    name_models[0]: csr_matrix(hstack((X_data[name_models[0]], pred_lin))),
    name_models[1]: csr_matrix(hstack((X_data[name_models[1]], pred_boost))),
}

# NOTE(review): the sentiment feature is predicted by models fitted on these
# very rows, so the cross-validated rating scores below are likely optimistic.
# Out-of-fold sentiment predictions would avoid that.
scores_rating = compare_two_models(models, X_data_rating, y_rating.values, cv=(3, 2))

6it [00:31,  5.20s/it]
6it [21:07, 211.25s/it]

Модели значимо различаются между собой!!!
Средние значения на кроссвалидации...
tf-idf +  sgd: 0.3973
doc2vec + catboost: 0.3835





# Create Pipeline

In [22]:
# Обучим модели 
# --- TF-IDF + SGD: sentiment model first, then a rating model that also sees the predicted sentiment ---
lin_model_sentiment = SGDClassifier(penalty='elasticnet', random_state=SEED, class_weight='balanced')
lin_model_sentiment.fit(X_tf_idf, y_sentiment)
# Sentiment predicted on the training rows themselves.
# NOTE(review): at inference time this feature comes from unseen texts and is
# presumably noisier than these in-sample predictions — consider out-of-fold predictions.
lin_pred_sentiment = lin_model_sentiment.predict(X_tf_idf)
lin_model_rating = SGDClassifier(penalty='elasticnet', random_state=SEED, class_weight='balanced')
# The predicted sentiment is appended as one extra sparse column.
lin_model_rating.fit(csr_matrix(hstack((X_tf_idf, csr_matrix(lin_pred_sentiment.reshape(-1, 1))))),
                     y_rating)

# --- Doc2Vec + CatBoost: same two-stage scheme on the dense document vectors ---
boost_model_sentiment = CatBoostClassifier(verbose=False)
boost_model_sentiment.fit(X_doc2vec, y_sentiment)
boost_pred_sentiment = boost_model_sentiment.predict(X_doc2vec)
boost_model_rating = CatBoostClassifier(verbose=False)
# The predicted sentiment is appended as one extra dense column.
boost_model_rating.fit(np.concatenate((X_doc2vec, boost_pred_sentiment.reshape(-1, 1)), axis=1),
                       y_rating)

<catboost.core.CatBoostClassifier at 0x1f59b91b4f0>

In [26]:
def predict_tf_idf_sgd(texts: pd.Series) -> Dict[str, np.array]:
    """Predict sentiment and rating with the TF-IDF + SGD models.

    The texts are cleaned and vectorised with the fitted ``tf_idf``. The
    predicted sentiment is then appended as one extra sparse column before
    the rating model is applied. Returns a dict with the keys
    ``'sentiment'`` and ``'rating'``.
    """
    cleaned = pd.Series(clean_text(texts.copy().values))
    features = tf_idf.transform(cleaned)
    sentiment = lin_model_sentiment.predict(features)
    sentiment_column = csr_matrix(sentiment.reshape(-1, 1))
    rating_features = csr_matrix(hstack((features, sentiment_column)))
    rating = lin_model_rating.predict(rating_features)
    return {'sentiment': sentiment, 'rating': rating}
                                      
def predict_doc2vec_catboost(texts: pd.Series) -> Dict[str, np.array]:
    """Predict sentiment and rating with the Doc2Vec + CatBoost models.

    The texts are cleaned and converted to inferred document vectors. The
    predicted sentiment is then appended as one extra dense column before
    the rating model is applied. Returns a dict with the keys
    ``'sentiment'`` and ``'rating'``.
    """
    cleaned = pd.Series(clean_text(texts.copy().values))
    doc_vectors = documents_to_vectors(cleaned)
    sentiment = boost_model_sentiment.predict(doc_vectors)
    sentiment_column = sentiment.reshape(-1, 1)
    rating_features = np.concatenate((doc_vectors, sentiment_column), axis=1)
    rating = boost_model_rating.predict(rating_features)
    return {'sentiment': sentiment, 'rating': rating}

def predict(texts: pd.Series, is_tf_idf: bool = True) -> Dict[str, np.array]:
    """Predict sentiment and rating for raw review texts.

    Parameters
    ----------
    texts: pd.Series :
        texts for classification

    is_tf_idf: bool = True
        if true, use the TF-IDF + SGD pipeline;
        otherwise use the Doc2Vec + CatBoost pipeline

    Returns
    -------
    Dict[str, np.array] :
        predictions under the keys 'sentiment' and 'rating'
    """
    predictor = predict_tf_idf_sgd if is_tf_idf else predict_doc2vec_catboost
    return predictor(texts)

In [27]:
# Persist the `predict` entry point to disk.
# NOTE(review): cloudpickle is presumably expected to bundle the helper
# functions and fitted models that `predict` reaches through globals — verify
# by loading the file in a fresh kernel.
with open('data/predict.bin', 'wb') as file:
    cloudpickle.dump(predict, file)

# Test Score

In [28]:
# Load the serialised prediction function back from disk.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('data/predict.bin', 'rb') as file:
    predict_func = cloudpickle.load(file)

In [29]:
# Ground truth for the test set: 1 = 'pos', 0 = 'neg', plus the rating.
y_test_sentiment = (test.sentiment == 'pos').astype(int)
y_test_rating = test.rating

# Run both pipelines on the raw test texts; each call returns a dict with 'sentiment' and 'rating'.
y_pred_tf_idf = predict_func(test.text, is_tf_idf=True)
y_pred_doc2vec = predict_func(test.text, is_tf_idf=False)

In [32]:
# Sentiment: no `average` argument is passed, so f1_score's default averaging applies.
# NOTE(review): the cross-validation above used average='weighted' for sentiment,
# so these numbers are not directly comparable with the CV means.
print('Sentiment')
print('F1 Score tf_idf: ', round(f1_score(y_test_sentiment, y_pred_tf_idf['sentiment']), 4))
print('F1 Score doc2vec: ', round(f1_score(y_test_sentiment, y_pred_doc2vec['sentiment']), 4))
print()

# Rating: multi-class target, scored with the support-weighted F1.
print('Rating')
print('F1 Score tf_idf: ', round(f1_score(y_test_rating, y_pred_tf_idf['rating'], average='weighted'), 4))
print('F1 Score doc2vec: ', round(f1_score(y_test_rating, y_pred_doc2vec['rating'], average='weighted'), 4))

Sentiment
F1 Score tf_idf:  0.8992
F1 Score doc2vec:  0.8158

Rating
F1 Score tf_idf:  0.3829
F1 Score doc2vec:  0.3297
