In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy import sparse
from math import ceil
from textfab import Conveyer
import re
from scipy import sparse as sc 
from xgboost import XGBClassifier

In [2]:
from tqdm.auto import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
from sklearn.ensemble import IsolationForest 
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight

In [4]:
EXPERIMENT_NUM = 10

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
# model.cuda()  # uncomment it if you have a GPU

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
import random

def shuffle(matrix):
    index = np.arange(matrix.shape[0])   
    np.random.shuffle(index)    
    return matrix[index, :]

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

def preprocess(corp):
    con = Conveyer(['remove_punct', "lower_string", "swap_enter_to_space", "collapse_spaces"])
    corp = list(map(lambda x: re.sub(r"<emoji>.+</emoji>", "", x), corp))
    corp = list(filter(lambda x: "<no text>" not in x, corp))
    corp = list(map(lambda x:re.sub("[A-Za-z]+", '', x), corp))
    corp = con.start(corp)
    corp = list(filter(lambda x: len(x) > 2, corp))
    return corp

def block_dataset(negative_vectors, positive_vectors):
    train_sets = []
    start = 0
    out_num = positive_vectors.shape[0]
    norm_num = negative_vectors.shape[0]
    while norm_num != 0:
        end = norm_num % out_num
        if end != 0 :
            if (end / out_num) > 0.5:
                train_sets.append( (negative_vectors[start:start + end], positive_vectors[start:start + end]) )
            start += end
            norm_num -= end
        else:
            end += out_num
            train_sets.append( (negative_vectors[start:start + end], positive_vectors) )
            norm_num -= out_num
            start += end
    return train_sets

def get_metrics(y_true, y_pred):
    return precision_score(y_true, y_pred, average=None), recall_score(y_true, y_pred, average=None), f1_score(y_true, y_pred, average=None)

In [6]:
data = pd.read_csv("data/presuicidal_signals_dataset_twitter.csv", sep="|")
data.text = data.text.astype("str")

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# vectorizer.fit(data.text.to_list())
vectorizer.fit( preprocess(data.text.to_list()))

CountVectorizer()

In [8]:
normal_texts = preprocess( data[data.label == 5].text.to_list())
outlier_texts = preprocess(data[data.label.isin([1,2])].text.to_list())

In [8]:
normal_bert_vectors = [embed_bert_cls(x, model, tokenizer) for x in tqdm(normal_texts, desc="Normal vectorization")]
outlier_bert_vectors = [embed_bert_cls(x, model, tokenizer) for x in tqdm(outlier_texts, desc="Outlier vectorization")]
normal_bert_vectors = np.vstack(normal_bert_vectors)
outlier_bert_vectors = np.vstack(outlier_bert_vectors)


Normal vectorization:   0%|          | 0/26389 [00:00<?, ?it/s]

Outlier vectorization:   0%|          | 0/4934 [00:00<?, ?it/s]

In [11]:
normal_count_vectors = vectorizer.transform(normal_texts)
outlier_count_vectors = vectorizer.transform(outlier_texts)

In [12]:
report = []
f1_macro = []

# Outlier detection methods

## Count based

In [13]:
for i in tqdm(range(EXPERIMENT_NUM)):
    normal_count_vectors = shuffle(normal_count_vectors)

    normal_vectors_test = normal_count_vectors[: outlier_count_vectors.shape[0]]
    normal_vectors = normal_count_vectors[outlier_count_vectors.shape[0] : ]

    test_vectors = sc.vstack([normal_vectors_test, outlier_count_vectors])
    test_label = np.vstack([np.ones((normal_vectors_test.shape[0], 1)),
                           -1 * np.ones((normal_vectors_test.shape[0], 1))])

    clf_if = IsolationForest()
    clf_lof = LocalOutlierFactor(novelty=True)
    clf_svm = OneClassSVM(kernel="rbf", nu=outlier_count_vectors.shape[0] / normal_vectors.shape[0], gamma=1e-6)


    clf_if.fit(normal_vectors)
    m = get_metrics(test_label, clf_if.predict(test_vectors))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("Isolation Forest", "Count", "0", *zero_class))
    report.append(("Isolation Forest", "Count", "1", *first_calss))
    f1_macro.append( ("Isolation Forest", "Count", f1_score(test_label, clf_if.predict(test_vectors), average="macro")) )
    
    clf_lof.fit(normal_vectors)
    m = get_metrics(test_label, clf_lof.predict(test_vectors))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("Local Outlier Factor", "Count", "0", *zero_class))
    report.append(("Local Outlier Factor", "Count", "1", *first_calss))
    f1_macro.append( ("Local Outlier Factor", "Count", f1_score(test_label, clf_lof.predict(test_vectors), average="macro")))
                  
    clf_svm.fit(normal_vectors)
    m = get_metrics(test_label, clf_svm.predict(test_vectors))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("OneClassSVM", "Count", "0", *zero_class))
    report.append(("OneClassSVM", "Count", "1", *first_calss))
    f1_macro.append( ("OneClassSVM", "Count", f1_score(test_label, clf_svm.predict(test_vectors), average="macro"))) 
                  

  0%|          | 0/10 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## BERT based

In [14]:
for i in tqdm(range(EXPERIMENT_NUM)):

    np.random.shuffle(normal_bert_vectors)

    normal_vectors_test = normal_bert_vectors[: outlier_bert_vectors.shape[0]]
    normal_vectors = normal_bert_vectors[outlier_bert_vectors.shape[0] : ]

    test_vectors = np.vstack([normal_vectors_test, outlier_bert_vectors])
    test_label = np.vstack([np.ones((normal_vectors_test.shape[0], 1)),
                           -1 * np.ones((normal_vectors_test.shape[0], 1))])

    clf_if = IsolationForest()
    clf_lof = LocalOutlierFactor(novelty=True)
    clf_svm = OneClassSVM(kernel="rbf", nu=outlier_count_vectors.shape[0] / normal_vectors.shape[0], gamma=1e-6)


    clf_if.fit(normal_vectors)
    m = get_metrics(test_label, clf_if.predict(test_vectors))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("Isolation Forest", "BERT", "0", *zero_class))
    report.append(("Isolation Forest", "BERT", "1", *first_calss))
    f1_macro.append( ("Isolation Forest", "BERT", f1_score(test_label, clf_if.predict(test_vectors), average="macro")))
    
    clf_lof.fit(normal_vectors)
    m = get_metrics(test_label, clf_lof.predict(test_vectors))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("Local Outlier Factor", "BERT", "0", *zero_class))
    report.append(("Local Outlier Factor", "BERT", "1", *first_calss))
    f1_macro.append( ("Local Outlier Factor", "BERT", f1_score(test_label, clf_lof.predict(test_vectors), average="macro")))

    clf_svm.fit(normal_vectors)
    m = get_metrics(test_label, clf_svm.predict(test_vectors))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("OneClassSVM", "BERT", "0", *zero_class))
    report.append(("OneClassSVM", "BERT", "1", *first_calss))
    f1_macro.append( ("OneClassSVM", "BERT", f1_score(test_label, clf_svm.predict(test_vectors), average="macro")))

  0%|          | 0/10 [00:00<?, ?it/s]

# Traditional methods

## Count based

In [15]:
vectors = sc.vstack([normal_count_vectors, outlier_count_vectors])
label = np.vstack([np.zeros((normal_count_vectors.shape[0], 1)),
                    np.ones((outlier_count_vectors.shape[0], 1))]).flatten()

In [16]:
class_weights = compute_class_weight('balanced', classes=[0,1], y=label)

In [17]:
for i in tqdm(range(EXPERIMENT_NUM)):
    x_tr, x_ts, y_tr, y_ts = train_test_split(vectors, label)
    
    clf = LogisticRegression(max_iter=1000, C=1, class_weight={0:class_weights[0], 1:class_weights[1]})
    clf.fit(x_tr, y_tr, )
    m = get_metrics(y_ts, clf.predict(x_ts))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("Logistic Regression", "Count", "0", *zero_class))
    report.append(("Logistic Regression", "Count", "1", *first_calss))
    f1_macro.append( ("Logistic Regression", "Count", f1_score(y_ts, clf.predict(x_ts), average="macro")))
    
    clf = XGBClassifier(scale_pos_weight=class_weights[1])
    clf.fit(x_tr, y_tr,)
    m = get_metrics(y_ts, clf.predict(x_ts))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("XGBoost", "Count", "0", *zero_class))
    report.append(("XGBoost", "Count", "1", *first_calss))
    f1_macro.append( ("XGBoost", "Count", f1_score(y_ts, clf.predict(x_ts), average="macro")))
    
    clf = RandomForestClassifier(n_jobs=5, class_weight={0:class_weights[0], 1:class_weights[1]} )
    clf.fit(x_tr, y_tr, )
    m = get_metrics(y_ts, clf.predict(x_ts))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("Random Forest", "Count", "0", *zero_class))
    report.append(("Random Forest", "Count", "1", *first_calss))
    f1_macro.append( ("Random Forest", "Count", f1_score(y_ts, clf.predict(x_ts), average="macro")))

  0%|          | 0/10 [00:00<?, ?it/s]









































## BERT based

In [11]:
vectors = np.vstack([normal_bert_vectors, outlier_bert_vectors])
label = np.vstack([np.zeros((normal_bert_vectors.shape[0], 1)),
                    np.ones((outlier_bert_vectors.shape[0], 1))]).flatten()

In [12]:
class_weights = compute_class_weight('balanced', classes=[0,1], y=label)

In [20]:
for i in tqdm(range(EXPERIMENT_NUM)): 
    x_tr, x_ts, y_tr, y_ts = train_test_split(vectors, label)
    
    clf = LogisticRegression(max_iter=1000, C=1, class_weight={0:class_weights[0], 1:class_weights[1]})
    clf.fit(x_tr, y_tr, )
    m = get_metrics(y_ts, clf.predict(x_ts))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("Logistic Regression", "BERT", "0", *zero_class))
    report.append(("Logistic Regression", "BERT", "1", *first_calss))
    f1_macro.append( ("Logistic Regression", "BERT", f1_score(y_ts, clf.predict(x_ts), average="macro")))
    
    clf = XGBClassifier(scale_pos_weight=class_weights[1])
    clf.fit(x_tr, y_tr,)
    m = get_metrics(y_ts, clf.predict(x_ts))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("XGBoost", "BERT", "0", *zero_class))
    report.append(("XGBoost", "BERT", "1", *first_calss))
    f1_macro.append( ("XGBoost", "BERT", f1_score(y_ts, clf.predict(x_ts), average="macro")))
    
    clf = RandomForestClassifier(n_jobs=5, class_weight={0:class_weights[0], 1:class_weights[1]} )
    clf.fit(x_tr, y_tr, )
    m = get_metrics(y_ts, clf.predict(x_ts))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("Random Forest", "BERT", "0", *zero_class))
    report.append(("Random Forest", "BERT", "1", *first_calss))
    f1_macro.append( ("Random Forest", "BERT", f1_score(y_ts, clf.predict(x_ts), average="macro")))

  0%|          | 0/10 [00:00<?, ?it/s]









































# Ensable

## CountBased

In [21]:
for i in tqdm(range(EXPERIMENT_NUM)):
    outlier_count_vectors = shuffle(outlier_count_vectors)
    normal_count_vectors = shuffle(normal_count_vectors)

    outlier_vectors_count_train = outlier_count_vectors[:ceil(0.77 * outlier_count_vectors.shape[0])]
    outlier_vectors_count_test = outlier_count_vectors[ceil(0.77 * outlier_count_vectors.shape[0]):]

    normal_vectors_count_train = normal_count_vectors[:ceil(0.77 * normal_count_vectors.shape[0])]
    normal_vectors_count_test = normal_count_vectors[ceil(0.77 * normal_count_vectors.shape[0]):]

    models = []
    for normal, outlier  in block_dataset(normal_vectors_count_train, outlier_vectors_count_train):
        vectors = sparse.vstack([normal, outlier])#np
        label = np.hstack([np.zeros((normal.shape[0])),
                        np.ones((outlier.shape[0]))])
        x_tr, x_ts, y_tr, y_ts = train_test_split(vectors, label, train_size=0.95)
        clf = LogisticRegression(max_iter = 1000, C=1)
        clf.fit(x_tr, y_tr)
        #print(classification_report(y_ts,clf.predict(x_ts)))
        models.append(clf)

    vectors_train = sparse.vstack([normal_vectors_count_train, outlier_vectors_count_train]) # np
    features = np.hstack([x.predict_proba(vectors_train) for x in models])

    label = np.hstack([np.zeros((normal_vectors_count_train.shape[0])),
                        np.ones((outlier_vectors_count_train.shape[0]))])

    #x_tr, x_ts, y_tr, y_ts = train_test_split(features, label, train_size=0.77)

    class_weights = compute_class_weight('balanced', classes=[0,1], y=label)

    clf = LogisticRegression(max_iter = 1000, C=1, class_weight={0:class_weights[0], 1:class_weights[1]})
    clf.fit(features, label)
    
    vectors_test = sparse.vstack([normal_vectors_count_test, outlier_vectors_count_test])
    features = np.hstack([x.predict_proba(vectors_test) for x in models])

    label = np.hstack([np.zeros((normal_vectors_count_test.shape[0])),
                        np.ones((outlier_vectors_count_test.shape[0]))])
    
    m = get_metrics(label, clf.predict(features))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("LogReg Stack", "Count", "0", *zero_class))
    report.append(("LogReg Stack", "Count", "1", *first_calss))
    f1_macro.append( ("LogReg Stack", "Count", f1_score(label, clf.predict(features), average="macro")))

  0%|          | 0/10 [00:00<?, ?it/s]

## BERT based

In [22]:
for i in tqdm(range(EXPERIMENT_NUM)):
    np.random.shuffle(normal_bert_vectors)
    np.random.shuffle(outlier_bert_vectors)
    
    outlier_vectors_bert_train = outlier_bert_vectors[:ceil(0.77 * outlier_bert_vectors.shape[0])]
    outlier_vectors_bert_test = outlier_bert_vectors[ceil(0.77 * outlier_bert_vectors.shape[0]):]

    normal_vectors_bert_train = normal_bert_vectors[:ceil(0.77 * normal_bert_vectors.shape[0])]
    normal_vectors_bert_test = normal_bert_vectors[ceil(0.77 * normal_bert_vectors.shape[0]):]

    models = []
    for normal, outlier  in block_dataset(normal_vectors_bert_train, outlier_vectors_bert_train):
        vectors = np.vstack([normal, outlier])#np
        label = np.hstack([np.zeros((normal.shape[0])),
                        np.ones((outlier.shape[0]))])
        x_tr, x_ts, y_tr, y_ts = train_test_split(vectors, label, train_size=0.95)
        clf = LogisticRegression(max_iter = 1000, C=1)
        clf.fit(x_tr, y_tr)
        #print(classification_report(y_ts,clf.predict(x_ts)))
        models.append(clf)

    vectors_train = np.vstack([normal_vectors_bert_train, outlier_vectors_bert_train]) # np
    features = np.hstack([x.predict_proba(vectors_train) for x in models])

    label = np.hstack([np.zeros((normal_vectors_bert_train.shape[0])),
                        np.ones((outlier_vectors_bert_train.shape[0]))])

    #x_tr, x_ts, y_tr, y_ts = train_test_split(features, label, train_size=0.77)

    class_weights = compute_class_weight('balanced', classes=[0,1], y=label)

    clf = LogisticRegression(max_iter = 1000, C=1, class_weight={0:class_weights[0], 1:class_weights[1]})
    clf.fit(features, label)
    
    vectors_test = np.vstack([normal_vectors_bert_test, outlier_vectors_bert_test])
    features = np.hstack([x.predict_proba(vectors_test) for x in models])

    label = np.hstack([np.zeros((normal_vectors_bert_test.shape[0])),
                        np.ones((outlier_vectors_bert_test.shape[0]))])
    
    m = get_metrics(label, clf.predict(features))
    zero_class = tuple([x[0] for x in m])
    first_calss = tuple([x[1] for x in m])
    report.append(("LogReg Stack", "BERT", "0", *zero_class))
    report.append(("LogReg Stack", "BERT", "1", *first_calss))
    f1_macro.append( ("LogReg Stack", "BERT", f1_score(label, clf.predict(features), average="macro")))

  0%|          | 0/10 [00:00<?, ?it/s]

In [24]:
import json
json.dump(report, open("results.json", 'w'))

In [25]:
t = pd.DataFrame(report, columns=["method", "vec", "class", "precision", "recall", "f1"])

In [26]:
t

Unnamed: 0,method,vec,class,precision,recall,f1
0,Isolation Forest,Count,0,0.000000,0.000000,0.000000
1,Isolation Forest,Count,1,0.500000,1.000000,0.666667
2,Local Outlier Factor,Count,0,0.501734,0.996960,0.667526
3,Local Outlier Factor,Count,1,0.765625,0.009931,0.019608
4,OneClassSVM,Count,0,0.623984,0.373328,0.467157
...,...,...,...,...,...,...
275,LogReg Stack,BERT,1,0.377310,0.774250,0.507368
276,LogReg Stack,BERT,0,0.950473,0.762070,0.845908
277,LogReg Stack,BERT,1,0.382114,0.787478,0.514549
278,LogReg Stack,BERT,0,0.947562,0.762234,0.844854


In [27]:
 a = t.groupby(["method", "vec", "class"]).agg(["mean", "std"])

In [35]:
pd.options.display.float_format = '{:,.3f}'.format

In [36]:
a

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precision,precision,recall,recall,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std
method,vec,class,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Isolation Forest,BERT,0,0.558,0.351,0.0,0.0,0.001,0.0
Isolation Forest,BERT,1,0.5,0.0,1.0,0.0,0.667,0.0
Isolation Forest,Count,0,0.0,0.0,0.0,0.0,0.0,0.0
Isolation Forest,Count,1,0.5,0.0,1.0,0.0,0.667,0.0
Local Outlier Factor,BERT,0,0.301,0.022,0.009,0.001,0.017,0.001
Local Outlier Factor,BERT,1,0.497,0.001,0.979,0.003,0.659,0.001
Local Outlier Factor,Count,0,0.502,0.0,0.997,0.0,0.668,0.0
Local Outlier Factor,Count,1,0.768,0.033,0.011,0.002,0.021,0.004
LogReg Stack,BERT,0,0.948,0.002,0.765,0.005,0.847,0.003
LogReg Stack,BERT,1,0.382,0.004,0.777,0.011,0.512,0.004


In [30]:
f1m = pd.DataFrame(f1_macro, columns=["method", "vec", "f1"])

In [37]:
f1m.groupby(["method", "vec"]).agg(["mean", "std"])

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
method,vec,Unnamed: 2_level_2,Unnamed: 3_level_2
Isolation Forest,BERT,0.334,0.0
Isolation Forest,Count,0.333,0.0
Local Outlier Factor,BERT,0.338,0.001
Local Outlier Factor,Count,0.344,0.002
LogReg Stack,BERT,0.68,0.003
LogReg Stack,Count,0.617,0.018
Logistic Regression,BERT,0.685,0.005
Logistic Regression,Count,0.538,0.18
OneClassSVM,BERT,0.463,0.023
OneClassSVM,Count,0.555,0.002


In [None]:
a.to_markdown()