## scikit-learn gradient boosting classifier

This notebook uses the already-created lemmatization. The whole pipeline is in the `gradient_boosting_classifier_lightgbm` notebook.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Text preprocessing

In [2]:
# Load the pre-lemmatized train and test splits (paths are relative to the
# notebook's working directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/rusentiment/train_lemmatized.csv',
        'data/rusentiment/test_lemmatized.csv',
    )
)

In [58]:
def make_lemms_usable_again(a):
    """Strip list-literal punctuation from every string of a Series.

    The characters ``,``, ``'``, ``]`` and ``[`` are deleted from each
    element; all other characters (including whitespace) are kept as they are.

    Parameters
    ----------
    a : pandas.Series
        Series whose elements are strings.

    Returns
    -------
    pandas.Series
        A new Series with the same index and the cleaned strings.
    """
    # Translation table that deletes the four unwanted characters in one pass.
    strip_table = str.maketrans("", "", ",'][")
    return a.apply(lambda text: text.translate(strip_table))

In [30]:
# Add a plain-text column to both splits by cleaning the stored token strings
# with make_lemms_usable_again.
for frame in (df_train, df_test):
    frame['lemmatized_text'] = make_lemms_usable_again(frame['lemmatized_tokens'])

### Encoding class labels

In [41]:
from sklearn import preprocessing

In [42]:
# Label encoder: fitted on the training labels below and reused later to
# decode predictions back into the original class names.
le = preprocessing.LabelEncoder()

In [44]:
# Model inputs: the cleaned text column of each split as a NumPy array.
X_train, X_test = (
    frame['lemmatized_text'].values for frame in (df_train, df_test)
)

# Targets: the encoder is fitted on the training labels only; the test labels
# are transformed with that same fitted mapping.
Y_train = le.fit_transform(df_train['label'].values)
Y_test = le.transform(df_test['label'].values)

### Training the model

In [53]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [1]:
# Helper functions for quick model scoring

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

def score_model(y_pred, y_true, average='weighted'):
    """Compute F1, precision and recall for a set of predictions.

    Note the argument order: predictions come first and the ground truth
    second. The scikit-learn metric functions are called with the arguments
    swapped back into ``(y_true, y_pred)`` order.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_true : array-like
        Ground-truth labels.
    average : str, default 'weighted'
        Averaging strategy forwarded unchanged to ``f1_score``,
        ``precision_score`` and ``recall_score``. Pass ``'macro'`` to get
        numbers comparable with the ``f1_macro`` scoring used by the grid
        search in this notebook.

    Returns
    -------
    tuple
        ``(f1, precision, recall)`` in that order.
    """
    metric_kwargs = {'average': average}
    f1 = f1_score(y_true, y_pred, **metric_kwargs)
    precision = precision_score(y_true, y_pred, **metric_kwargs)
    recall = recall_score(y_true, y_pred, **metric_kwargs)
    return f1, precision, recall

def check_model(y_pred,Y_test):
    """Print precision, recall and F1 of the predictions, four decimals each.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    Y_test : array-like
        Ground-truth labels.

    The values come from ``score_model`` called with its default settings;
    nothing is returned.
    """
    f1, precision, recall = score_model(y_pred, Y_test)
    report_rows = (
        ('Precision', precision),
        ('Recall', recall),
        ('f1-score', f1),
    )
    for caption, value in report_rows:
        print(f'{caption}: {value:.4f}')


In [36]:
# Number of cross-validation folds.
k = 5
# shuffle=True is required for random_state to have any effect: without it the
# seed is ignored, and recent scikit-learn releases raise a ValueError when
# random_state is set while shuffle is left at its default of False.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [63]:
# TF-IDF over unigrams feeding a gradient boosting classifier.
# random_state is fixed so that repeated runs of the search give the same
# models (the classifier is otherwise seeded differently on every fit).
gbc_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=False)),
    ('gbc', GradientBoostingClassifier(random_state=42)),
])

# Hyper-parameter candidates; keys use the `<step name>__<parameter>` form so
# they are routed to the 'gbc' step of the pipeline.
param_grid = {
    'gbc__learning_rate': [0.1, 0.05, 0.01],
    'gbc__n_estimators': [100, 200, 250],
    'gbc__max_depth': [3, 5],
}

# NOTE(review): candidates are ranked by macro-averaged F1 here, while
# check_model reports weighted averages - keep that in mind when comparing
# the cross-validation score with the test-set numbers.
grid = GridSearchCV(
    estimator=gbc_pipeline,
    scoring='f1_macro',
    param_grid=param_grid,
    cv=skf,
)

In [64]:
# Fit the grid search on the training data; the result is used below to read
# the best parameters and to predict on the test set.
grid_model = grid.fit(X_train,Y_train)

In [65]:
# Show the parameter combination selected by the grid search.
grid_model.best_params_

{'gbc__learning_rate': 0.1, 'gbc__max_depth': 5, 'gbc__n_estimators': 250}

In [None]:
# Best parameters recorded from the grid search run above, kept for reference:
# {'gbc__learning_rate': 0.1, 'gbc__max_depth': 5, 'gbc__n_estimators': 250}

In [67]:
# Predict encoded class labels for the test texts (decoded back to class names
# with `le.inverse_transform` further down).
Y_pred = grid_model.predict(X_test)

In [69]:
# Print precision, recall and F1 of the test-set predictions.
check_model(Y_pred,Y_test)

Precision: 0.6476
Recall: 0.6552
f1-score: 0.6175


### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Use every class known to the encoder, in encoded order, for both the matrix
# and the tick labels. Deriving the labels from the predictions alone breaks
# the alignment as soon as one class is never predicted.
class_ids = np.arange(len(le.classes_))
labels = list(le.classes_)

# Rows are actual classes, columns are predicted classes.
conf_mat = confusion_matrix(Y_test, Y_pred, labels=class_ids)

# Normalise each column by the number of samples predicted as that class.
# A class that was never predicted has a zero column total; its column is
# left at 0 instead of producing NaN and a division warning.
predicted_totals = conf_mat.sum(axis=0, keepdims=True)
conf_mat_ratios = np.round(
    np.divide(
        conf_mat,
        predicted_totals,
        out=np.zeros(conf_mat.shape, dtype=float),
        where=predicted_totals != 0,
    ),
    3,
)

fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(
    conf_mat_ratios,
    annot=True,
    center=0,
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

### Error analysis

In [None]:
# Decode the integer predictions and targets back into the original class names.
y_pred_labels = le.inverse_transform(Y_pred)
y_test_labels = le.inverse_transform(Y_test)

# One row per test sample: raw text, its stored tokens, and both labels.
results = pd.DataFrame({
    'text': df_test['text'],
    'lemmatized_tokens': df_test['lemmatized_tokens'],
    'predicted': y_pred_labels,
    'actual': y_test_labels,
})

# Keep only the rows where the prediction disagrees with the true label.
errors = results[results['predicted'].ne(results['actual'])]

In [None]:
# For each class, count how often it shows up among the misclassified rows as
# the predicted label and as the actual label.
freq = pd.DataFrame({
    'Predicted': errors['predicted'].value_counts(),
    'Actual': errors['actual'].value_counts(),
})

# Grouped bar chart, classes ordered by how often they were wrongly predicted.
fig, ax = plt.subplots(figsize=(12, 8))
freq.sort_values(by='Predicted', ascending=False).plot.bar(rot=0, ax=ax);

In [None]:
# Display the misclassified test rows for manual inspection.
errors