In [8]:
import pandas as pd
from plotly import express

In [9]:
data = pd.read_csv('/content/sample_data/Reddit_Combi.csv', sep=';', usecols=['Body_Title', 'label'])
data

Unnamed: 0,Body_Title,label
0,Envy to other is swallowing me Im from develop...,1
1,Nothin outta the ordinary. Paradise. Job stres...,1
2,Almost 49 and the chasm of emptiness has never...,1
3,I’m happy again After my closest friend left m...,0
4,Is it possible to recover from such a traumati...,1
...,...,...
3118,"Positive relief ideas with stress? Hi all, my ...",1
3119,Breakdown I really think I lost my mind last n...,1
3120,I feel as if I actually died a long time ago a...,1
3121,Is it wierd that i have imaginary friends at t...,1


In [10]:
# Анализ количества стрессовых и положительных твитов
data_copy = data.copy()['label'].replace({0: 'negative', 1: 'positive'})
express.pie(data_frame=data_copy, names='label', color='label')

# Предобработка текста

In [11]:
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_tags

CUSTOM_FILTERS = [lambda x: x.lower(),
                  remove_stopwords,
                  strip_multiple_whitespaces,
                  strip_numeric,
                  strip_punctuation,
                  strip_short,
                  strip_tags,
                 ]
# собираем все твиты в список
data_def = data['Body_Title'].values.tolist()

# применяем фильтрацию
texts = [preprocess_string(s=i, filters=CUSTOM_FILTERS) for i in data_def]

dictionary = Dictionary(texts)
print(dictionary)

Dictionary<14372 unique tokens: ['afford', 'age', 'beetwen', 'better', 'big']...>


# Doc2Vec

In [None]:
!pip install arrow

In [13]:
from arrow import now
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

doc2vec_start = now()
doc2vec_model = Doc2Vec(
                        vector_size=100,
                        min_count=15,
                        epochs=35
                        )
# собираем числа вхождений для слов
corpus = [dictionary.doc2bow(text) for text in texts]

# создаём документ (объект)
corpus_iterable = [TaggedDocument(item, [index]) for index, item in enumerate(corpus)]

# собираем слова
doc2vec_model.build_vocab(corpus_iterable=corpus_iterable)

doc2vec_model.train(corpus_iterable=corpus_iterable, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
data['vectors'] = doc2vec_model.dv.vectors.tolist()
print('doc2vec training time: {}'.format(now() - doc2vec_start))

doc2vec training time: 0:00:21.445201


In [14]:
# извлекаем вектора для нашего документа
doc_vectors = [doc2vec_model.dv[idx] for idx in range(len(corpus_iterable))]
doc_vectors[:1]

[array([-1.5097074e-01,  1.6235068e-01,  4.9681041e-02, -2.6586798e-01,
         7.4944898e-02, -5.8376056e-01, -2.7832624e-01,  6.9820547e-01,
        -4.4021952e-01, -1.2333795e-01, -4.6341008e-01, -6.1602926e-01,
         3.1054938e-02,  8.5828984e-03,  8.5356593e-02, -3.9617717e-01,
        -2.2702232e-01, -5.9474826e-01,  4.7497481e-01, -3.5802022e-01,
         2.7170143e-04,  3.7232019e-02,  1.1694281e-01, -6.1262183e-02,
        -1.4056741e-01,  4.3413702e-02, -2.2810701e-01, -2.3356004e-01,
        -3.8783312e-01, -1.2323368e-01,  5.3712815e-01, -3.2959323e-02,
        -2.2067893e-01,  5.0288099e-01,  1.0468564e-01,  5.6189144e-01,
         1.5504877e-01, -4.5154190e-01,  6.6181995e-02, -5.9639204e-01,
        -5.7715628e-02, -1.4038281e-01,  5.5011623e-02, -3.5406545e-01,
        -2.6003963e-01, -8.2574949e-02, -1.4731878e-02, -1.3189344e-01,
        -3.0407828e-01,  1.8160725e-02,  2.2962460e-01, -4.9665575e-03,
        -3.6831743e-01,  3.3474398e-01, -8.4942627e-01,  4.86905

# keyBERT

In [None]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert

In [16]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

MAX_DF = 1.0
MIN_DF = 4
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
DOCS = data['Body_Title'].values.tolist()

model_start = now()
bert = KeyBERT(model=MODEL)
bert.max_seq_length = 512
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF, max_df=MAX_DF)
document_embeddings, word_embeddings = bert.extract_embeddings(docs=DOCS, vectorizer=vectorizer)
data['embedding'] = document_embeddings.tolist()
print(f'Embedding time: {now() - model_start}')
print(f'1. {len(document_embeddings)} documents\n2. {len(word_embeddings)} words')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding time: 0:00:26.107331
1. 3123 documents
2. 4663 words


In [17]:
def show_info(n_scores_1, n_scores_2, time_1, time_2, y_test_1, y_test_2, y_pred_1, y_pred_2):
  print('\n-------------- with Doc2vec --------------')
  print('Accuracy: %.3f (%.3f)' % (mean(n_scores_1), std(n_scores_1)))
  print(f'LGBMClassifier done in {time_1}\n')
  print(classification_report(y_true=y_test_1, y_pred=y_pred_1))
  print('\n-------------- with keyBert --------------')
  print('Accuracy: %.3f (%.3f)' % (mean(n_scores_2), std(n_scores_2)))
  print(f'LGBMClassifier done in {time_2}\n')
  print(classification_report(y_true=y_test_2, y_pred=y_pred_2))

# LightGBMClassifier

In [18]:
from numpy import mean
from numpy import std
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

lgb_model = LGBMClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Используем данные после обаботки Doc2Vec
X_train, X_test, y_train, y_test = train_test_split(data['vectors'].apply(func=pd.Series),
                                                    data['label'],
                                                    test_size=0.25,
                                                    random_state=1,
                                                    stratify=data['label']
                                                    )
n_scores_doc2vec = cross_val_score(lgb_model, data['vectors'].apply(func=pd.Series), data['label'], scoring='accuracy', cv=cv, n_jobs=-1)
time_start = now()
lgb_model = lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)
time_stop = now() - time_start

# Используем данные после обаботки keyBERT
lgb_model = LGBMClassifier()
Xe_train, Xe_test, ye_train, ye_test = train_test_split(data['embedding'].apply(func=pd.Series),
                                                    data['label'],
                                                    test_size=0.25,
                                                    random_state=1,
                                                    stratify=data['label']
                                                    )
n_scores_keybert = cross_val_score(lgb_model, data['embedding'].apply(func=pd.Series), data['label'], scoring='accuracy', cv=cv, n_jobs=-1)
time_start = now()
lgb_model = lgb_model.fit(Xe_train,ye_train)
ye_pred_lgb = lgb_model.predict(Xe_test)
time_stop_2 = now() - time_start

show_info(n_scores_doc2vec,n_scores_keybert,time_stop,time_stop_2,y_test,ye_test,y_pred_lgb,ye_pred_lgb)

[LightGBM] [Info] Number of positive: 2059, number of negative: 283
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002943 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.879163 -> initscore=1.984529
[LightGBM] [Info] Start training from score 1.984529
[LightGBM] [Info] Number of positive: 2059, number of negative: 283
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.879163 -> initscore=1.984529
[LightGBM] [Info] Start training from score 1.984529

-------------- with

# XGBoostClassifier

In [19]:
from xgboost import XGBClassifier

params_xgb = {
    'objective': 'binary:logistic',
    'n_estimators': 250,
    'learning_rate': 0.10,
    'colsample_bytree': 0.70,
    'max_depth': 3,
    'n_jobs': 4
}

# Используем данные после обаботки Doc2Vec
xgb_model = XGBClassifier(**params_xgb)
n_scores_doc2vec = cross_val_score(xgb_model, data['vectors'].apply(func=pd.Series), data['label'], scoring='accuracy', cv=cv, n_jobs=-1)
time_start = now()
xgb_model = xgb_model.fit(X_train,y_train)
y_pred_xgb = xgb_model.predict(X_test)
time_stop = now() - time_start

# Используем данные после обаботки keyBERT
xgb_model = XGBClassifier(**params_xgb)
n_scores_keybert = cross_val_score(xgb_model, data['embedding'].apply(func=pd.Series), data['label'], scoring='accuracy', cv=cv, n_jobs=-1)
time_start = now()
xgb_model = xgb_model.fit(Xe_train,ye_train)
ye_pred_xgb = xgb_model.predict(Xe_test)
time_stop_2 = now() - time_start

show_info(n_scores_doc2vec,n_scores_keybert,time_stop,time_stop_2,y_test,ye_test,y_pred_xgb,ye_pred_xgb)


-------------- with Doc2vec --------------
Accuracy: 0.881 (0.009)
LGBMClassifier done in 0:00:01.962198

              precision    recall  f1-score   support

           0       0.33      0.05      0.09        95
           1       0.88      0.99      0.93       686

    accuracy                           0.87       781
   macro avg       0.61      0.52      0.51       781
weighted avg       0.82      0.87      0.83       781


-------------- with keyBert --------------
Accuracy: 0.940 (0.012)
LGBMClassifier done in 0:00:03.870904

              precision    recall  f1-score   support

           0       0.87      0.56      0.68        95
           1       0.94      0.99      0.96       686

    accuracy                           0.94       781
   macro avg       0.91      0.77      0.82       781
weighted avg       0.93      0.94      0.93       781



#CatBoostClassifier

In [None]:
!pip install catboost

In [22]:
from catboost import CatBoostClassifier

params_cb = {
    'iterations': 100,
    'random_seed': 42,
    'learning_rate': 0.5
}

# Используем данные после обаботки Doc2Vec
catboost_model = CatBoostClassifier(**params_cb)
n_scores_doc2vec = cross_val_score(catboost_model, data['vectors'].apply(func=pd.Series), data['label'], scoring='accuracy', cv=cv, n_jobs=-1)
time_start = now()
catboost_model = catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)
y_pred_catboost = catboost_model.predict(X_test)
time_stop = now() - time_start

# Используем данные после обаботки keyBERT
catboost_model = CatBoostClassifier(**params_cb)
n_scores_keybert = cross_val_score(catboost_model, data['embedding'].apply(func=pd.Series), data['label'], scoring='accuracy', cv=cv, n_jobs=-1)
time_start = now()
catboost_model = catboost_model.fit(Xe_train, ye_train, eval_set=(Xe_test, ye_test), verbose=False)
ye_pred_catboost = catboost_model.predict(Xe_test)
time_stop_2 = now() - time_start

show_info(n_scores_doc2vec,n_scores_keybert,time_stop,time_stop_2,y_test,ye_test,y_pred_catboost,ye_pred_catboost)


-------------- with Doc2vec --------------
Accuracy: 0.868 (0.010)
LGBMClassifier done in 0:00:02.478343

              precision    recall  f1-score   support

           0       0.50      0.01      0.02        95
           1       0.88      1.00      0.94       686

    accuracy                           0.88       781
   macro avg       0.69      0.50      0.48       781
weighted avg       0.83      0.88      0.82       781


-------------- with keyBert --------------
Accuracy: 0.934 (0.012)
LGBMClassifier done in 0:00:08.560602

              precision    recall  f1-score   support

           0       0.83      0.51      0.63        95
           1       0.93      0.99      0.96       686

    accuracy                           0.93       781
   macro avg       0.88      0.75      0.79       781
weighted avg       0.92      0.93      0.92       781

