In [2]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModel
import torch
from scipy import spatial
from pandas import read_pickle
from sklearn.feature_extraction.text import TfidfVectorizer



# Load the posts, discard every row that has a missing value in ANY column,
# and only then narrow the frame down to the id and text columns.
data = (
    pd.read_csv('../data/post.csv')
    .dropna()
    [['telegram_id', 'text']]
)

In [3]:
def word_finder(text: str) -> int:
    """Count whitespace-separated tokens, ignoring tokens that contain a digit.

    Every run of word characters that includes at least one digit is blanked
    out first; whatever remains is split on whitespace and counted.
    """
    without_digit_tokens = re.sub(r'\w*\d+\w*', ' ', text)
    tokens = without_digit_tokens.split()
    return len(tokens)


def only_smile(text: str) -> int:
    """Count the whitespace-separated chunks left after stripping words and listed punctuation.

    All word characters are removed first, then the punctuation characters in
    the class below; the number of remaining chunks is returned.
    """
    non_word_chars = re.sub(r'\w', '', text)
    # NOTE: the two adjacent string literals are concatenated by Python, so the
    # double-quote character is NOT part of this character class.
    leftovers = re.sub(r"[~.,?!{}#%№+$^&*:""+/{};|]", '', non_word_chars)
    return len(leftovers.split())


# Per-post features, computed from the raw (not yet cleaned) text:
#   word_count  - tokens counted by word_finder
#   sign_count  - number of '!' characters
#   smile_count - chunks counted by only_smile
# The columns are built in one pass each instead of enlarging the frame row by
# row through .loc, which is slow and creates no columns at all for an empty frame.
# NOTE(review): the values are cast to float64 so later cells see float columns,
# which is presumably what the row-wise enlargement produced (a later cell casts
# sign_count to int) — confirm whether integer columns would be preferable.
data = data.assign(
    word_count=data['text'].apply(word_finder).astype('float64'),
    sign_count=data['text'].str.count('!').astype('float64'),
    smile_count=data['text'].apply(only_smile).astype('float64'),
)


def symbols_rm(text : str):
    """Normalise a post: strip URLs, punctuation and digit-bearing tokens.

    Steps, in order: remove http/https URLs, delete every character that is
    neither whitespace nor a word character, lowercase, blank out tokens that
    contain a digit, trim, then collapse whitespace runs to single spaces.

    Args:
        text: Raw post text.

    Returns:
        The cleaned text, or None when ``text`` is not a string (a message is
        printed in that case).
    """
    try:
        text = re.sub(r"https?://\S+", "", text)
        text = re.sub(r'[^\s\w]', '', text).lower()
        text = re.sub(r'\w*\d+\w*', ' ', text).strip()
        # Raw string: '\s' inside a plain literal is an invalid escape sequence.
        # Collapsing whitespace also swallows newlines, so no separate '\n'
        # removal pass is needed afterwards.
        return re.sub(r'\s+', ' ', text)
    except TypeError:
        print("Error with cleaning this text. Nan, perhaps?")
        return None


# Swap the raw text for its cleaned form, element by element.
data["text"] = data["text"].map(symbols_rm)

In [4]:
# Keep only the first 2000 rows.
data = data.head(2000)

In [5]:
# Placeholder column; it is overwritten with the real embeddings later in this cell.
data = data.assign(emb='')

# Tokenizer and encoder are both loaded from the local HF_cache directories.
tokenizer = AutoTokenizer.from_pretrained("../HF_cache/tokenizer/", max_len=512)
model = AutoModel.from_pretrained("../HF_cache/model/", num_labels=29)

def embed_bert_cls(text, model, tokenizer):
    """Return the L2-normalised first-token embedding of ``text`` as a numpy array.

    The text is tokenized (with truncation), run through ``model`` without
    gradient tracking, and the hidden state at sequence position 0 of the
    first batch item is normalised and moved to the CPU.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every tokenizer output onto the same device as the model.
    on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**on_device)
    first_token_states = output.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token_states)
    return unit_vectors[0].cpu().numpy()

# Embed every cleaned post; each cell of 'emb' holds one embedding vector.
data['emb'] = data["text"].apply(embed_bert_cls, args=(model, tokenizer))

In [None]:
def spatial_distance(vector1, vector2):
    """Cosine distance between two vectors, as computed by scipy."""
    cosine = spatial.distance.cosine
    return cosine(vector1, vector2)

# Near-duplicate removal: two posts whose embeddings are closer than 0.05 in
# cosine distance are treated as duplicates and one of them is dropped.
# Pull the columns out once so the O(n^2) loop does not pay for .iloc lookups.
embeddings = data['emb'].tolist()
labels = data.index.tolist()

to_rm = []

# Start at row 0 so that the first post is compared with the others as well.
for i in range(len(embeddings) - 1):
    for j in range(i + 1, len(embeddings)):
        if spatial_distance(embeddings[i], embeddings[j]) < 0.05:
            to_rm.append((labels[i], labels[j]))

# Decide which member of each pair goes. Labels are collected first and dropped
# in one call: DataFrame.drop returns a new frame, so its result must be kept,
# and a label that is already scheduled for removal must not be looked up again.
dropped = set()
for first, second in to_rm:
    if first in dropped or second in dropped:
        continue  # this duplicate pair is already resolved
    # NOTE(review): as originally written, the LONGER text of the pair is the
    # one removed (ties remove the first) — confirm this is the intended rule.
    if len(data.at[first, "text"]) >= len(data.at[second, "text"]):
        dropped.add(first)
    else:
        dropped.add(second)

data = data.drop(index=list(dropped))

data = data.dropna()
data["sign_count"] = data["sign_count"].astype('int64')


In [None]:
# Set the ids aside so the frame handed to the classifier has no id column;
# they are attached again after prediction.
t_id = data['telegram_id'].to_numpy()
data = data.drop(columns=['telegram_id'])

In [None]:
from catboost import CatBoostClassifier

# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code — only load a model file from a trusted source.
# The classifier gets its own name so the transformer `model` used by the
# embedding cell is not overwritten (re-running that cell would otherwise break).
classifier = read_pickle("../models/catboost_model.pkl")

# Predict on the remaining feature columns and store the result as 'label'.
preds = classifier.predict(data)
data['label'] = preds

# Keep only the text and label, then re-attach the ids set aside earlier.
data = data.drop(columns=['emb', 'word_count', 'sign_count', 'smile_count'])
data['telegram_id'] = t_id


In [None]:
# Display the labelled posts as this cell's output.
data

In [None]:
# Write the labelled posts (row index included) to predict.csv in the working directory.
data.to_csv(path_or_buf="predict.csv")