In [2]:
import numpy as np
import pandas as pd

In [3]:
df_selected = pd.read_csv('data/rusentiment/rusentiment_preselected_posts.csv')
df_random = pd.read_csv('data/rusentiment/rusentiment_random_posts.csv')
df_test = pd.read_csv('data/rusentiment/rusentiment_test.csv')

In [4]:
df_train = pd.concat((df_selected, df_random))#available other ways to concatenate
df_train.reset_index(inplace=True, drop=True)

In [12]:
from ufal.udpipe import Model, Pipeline

In [13]:
# Original code https://github.com/akutuzov/webvectors/blob/master/preprocessing/rus_preprocessing_udpipe.py

def tag_ud(pipeline, text='Текст нужно передать функции в виде строки!', pos=True):
    # если частеречные тэги не нужны (например, их нет в модели), выставьте pos=False
    # в этом случае на выход будут поданы только леммы

    # обрабатываем текст, получаем результат в формате conllu:
    processed = pipeline.process(text)

    # пропускаем строки со служебной информацией:
    content = [l for l in processed.split('\n') if not l.startswith('#')]

    # извлекаем из обработанного текста лемму и тэг
    tagged = [w.split('\t')[2].lower() + '_' + w.split('\t')[3] for w in content if w]

    tagged_propn = []
    propn = []
    for t in tagged:
        if t.endswith('PROPN'):
            if propn:
                propn.append(t)
            else:
                propn = [t]
        elif t.endswith('PUNCT'):
            propn = []
            continue  # я здесь пропускаю знаки препинания, но вы можете поступить по-другому
        else:
            if len(propn) > 1:
                name = '::'.join([x.split('_')[0] for x in propn]) + '_PROPN'
                tagged_propn.append(name)
            elif len(propn) == 1:
                tagged_propn.append(propn[0])
            tagged_propn.append(t)
            propn = []
    if not pos:
        tagged_propn = [t.split('_')[0] for t in tagged_propn]
    return tagged_propn

In [14]:
model = Model.load('udpipe_syntagrus.model')
process_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') 

In [15]:
df_train['lemmatized_tokens'] = df_train.apply(lambda x: tag_ud(process_pipeline, x['text'], pos=False), axis=1)
df_test['lemmatized_tokens'] = df_test.apply(lambda x: tag_ud(process_pipeline, x['text'], pos=False), axis=1)

In [16]:
from sklearn import preprocessing

In [17]:
le = preprocessing.LabelEncoder()

In [20]:
df_train['lemmatized_text'] = df_train['lemmatized_tokens'].str.join(' ')
df_test['lemmatized_text'] = df_test['lemmatized_tokens'].str.join(' ')

In [38]:
X_train = df_train['lemmatized_text'].values
X_test = df_test['lemmatized_text'].values

Y_train = le.fit_transform(df_train['label'].values)
Y_test = le.transform(df_test['label'].values)

### Using TfidfVectorizer

In [125]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

### Split train dataset

In [142]:
X_train_split, X_valid, Y_train_split, Y_valid= train_test_split(X_train_vect, Y_train, test_size=0.33, shuffle=True)

In [115]:
import lightgbm as lgb

In [143]:
dataset_train = lgb.Dataset(X_train_split, label = Y_train_split)

In [144]:
dataset_valid = lgb.Dataset(X_valid, label = Y_valid)

In [174]:
test_data = dataset_train.create_valid(dataset_valid)

### Metric functions

In [181]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

def score_model(y_pred, y_true):
    average = 'weighted'
    f1 = f1_score(y_true, y_pred, average=average)
    precision = precision_score(y_true, y_pred, average=average)
    recall = recall_score(y_true, y_pred, average=average)
    return f1, precision, recall

def check_model(y_pred,Y_test):
    f1, precision, recall = score_model(y_pred,Y_test)
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'f1-score: {f1:.4f}')


### Default gradient-boosting classifier

In [154]:
default_lgbc = lgb.LGBMClassifier()

#### Fitting with validation set

In [180]:
default_lgbc.fit(X = X_train_split, y = Y_train_split, eval_set=[(X_valid,Y_valid)]);

[1]	valid_0's multi_logloss: 1.56036
[2]	valid_0's multi_logloss: 1.5197
[3]	valid_0's multi_logloss: 1.48501
[4]	valid_0's multi_logloss: 1.4553
[5]	valid_0's multi_logloss: 1.42921
[6]	valid_0's multi_logloss: 1.4064
[7]	valid_0's multi_logloss: 1.38647
[8]	valid_0's multi_logloss: 1.36835
[9]	valid_0's multi_logloss: 1.35248
[10]	valid_0's multi_logloss: 1.33785
[11]	valid_0's multi_logloss: 1.32524
[12]	valid_0's multi_logloss: 1.31393
[13]	valid_0's multi_logloss: 1.30379
[14]	valid_0's multi_logloss: 1.29416
[15]	valid_0's multi_logloss: 1.28528
[16]	valid_0's multi_logloss: 1.27744
[17]	valid_0's multi_logloss: 1.26994
[18]	valid_0's multi_logloss: 1.26333
[19]	valid_0's multi_logloss: 1.25698
[20]	valid_0's multi_logloss: 1.25123
[21]	valid_0's multi_logloss: 1.24566
[22]	valid_0's multi_logloss: 1.24062
[23]	valid_0's multi_logloss: 1.23574
[24]	valid_0's multi_logloss: 1.23159
[25]	valid_0's multi_logloss: 1.22725
[26]	valid_0's multi_logloss: 1.22349
[27]	valid_0's multi_log

In [183]:
check_model(default_lgbc.predict(X_test_vect), Y_test)

Precision: 0.6238
Recall: 0.6380
f1-score: 0.5929


#### Fitting without validation set

In [185]:
default_lgbc.fit(X = X_train_vect, y = Y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [186]:
check_model(default_lgbc.predict(X_test_vect),Y_test)

Precision: 0.6314
Recall: 0.6481
f1-score: 0.6082


### Parameters tuning

In [None]:
#learning rate (default = 0.1) - small with large num_iterations
#n_estimators (default =100) - large


In [189]:
from  sklearn.model_selection import GridSearchCV

In [191]:
param_grid = {'learning_rate':[0.1, 0.05, 0.01],
              'n_estimators':[100,200,300]}

In [201]:
grid = GridSearchCV(estimator=default_lgbc, param_grid = param_grid )

In [None]:
#grid = RandomizedSearchCV(svc_pipeline, param_grid, n_iter=10, scoring='f1_macro', cv=skf, n_jobs=-1)

In [205]:
grid_model = grid.fit(X = X_train_vect, y = Y_train)



In [206]:
grid_model.best_params_

{'learning_rate': 0.05, 'n_estimators': 200}

In [210]:

check_model(grid_model.predict(X_test_vect),Y_test)

Precision: 0.6253
Recall: 0.6461
f1-score: 0.6049
