<a href="https://colab.research.google.com/github/AlisaKarpova/Automatic-detection-and-euthymization-of-clickbait-in-Russian-language/blob/main/gpt_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Модель

In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
import pandas as pd
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import gym
from gym import spaces
from gym.spaces.discrete import Discrete
import torch
from bert_score import score as bert_score
from transformers import GenerationConfig
import torch.nn as nn
from transformers import TFAutoModel, AutoTokenizer
from IPython.display import clear_output
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
import pickle

# Load the previously saved word2vec model; create_semantic_vector reads its
# `.wv` lookups to embed headline tokens.
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only load a pickle whose origin is trusted.
with open("word2vec (1).pickle", "rb") as f:
    loaded_model = pickle.load(f)

In [None]:
from tensorflow import keras
from keras.models import load_model
# Load the saved Keras classifier; TextGenerationEnv.step hands it to
# compute_reward, which queries it through predict_clickbait_probability.
clickbait_classifier = load_model('attention (2).keras')

In [None]:
!pip install spacy
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-sm==3.7.0)
  Downloading pymorphy3-2.0.3-py3-none-any.whl.metadata (1.9 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.3-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)
Downloading pymorphy3_dicts

In [None]:
import spacy
# spaCy pipeline that extract_features calls to obtain part-of-speech tags.
nlp = spacy.load("ru_core_news_sm")

In [None]:
from nltk.tokenize import WordPunctTokenizer

In [None]:
def create_semantic_vector(title, loaded_model, word_tokenizer=None):
    """Embed a headline as the mean word2vec vector of its known tokens.

    Args:
        title: Headline text; it is lower-cased before tokenization.
        loaded_model: Object exposing ``wv[token]`` lookups (raising
            ``KeyError`` for unknown tokens) and ``wv.vector_size``.
        word_tokenizer: Optional object with a ``tokenize(str)`` method that
            returns a list of word strings. Defaults to NLTK's
            ``WordPunctTokenizer``.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        a float32 zero vector of length ``wv.vector_size`` when no token of
        the title is in the vocabulary.
    """
    if word_tokenizer is None:
        # This function used to read the notebook-global `tokenizer`, which is
        # bound to the GPT-2 BPE tokenizer further down the notebook, so the
        # lookups were made with sub-word pieces instead of words.
        # NOTE(review): presumably the word2vec vocabulary is word-level and
        # was built with WordPunctTokenizer (imported above but never
        # instantiated) -- confirm against the notebook that trained it.
        from nltk.tokenize import WordPunctTokenizer
        word_tokenizer = WordPunctTokenizer()

    word_vectors = []
    for token in word_tokenizer.tokenize(title.lower()):
        try:
            word_vectors.append(loaded_model.wv[token])
        except KeyError:
            # Out-of-vocabulary tokens are deliberately skipped.
            continue

    if word_vectors:
        return np.mean(word_vectors, axis=0)
    return np.zeros(loaded_model.wv.vector_size, dtype=np.float32)

In [None]:
import re
def extract_features(text):
    """Build the classifier inputs for one headline.

    Args:
        text: Headline string.

    Returns:
        A ``(compressed_vector, features)`` tuple. ``compressed_vector`` is
        the first 12 components of ``create_semantic_vector(text,
        loaded_model)``. ``features`` is a dict whose keys are, in insertion
        order: ``noun_frequency``, ``verb_frequency``, ``quotes_flag``,
        ``story_flag``, ``question_word_flag``, ``superlative_flag``.
    """
    semantic_vector = create_semantic_vector(text, loaded_model)
    if semantic_vector is not None:
        # "Compression" is plain truncation to the first 12 dimensions.
        compressed_vector = semantic_vector[:12]
    else:
        compressed_vector = np.zeros(12)

    features = {}

    # Share of tokens tagged NOUN / VERB by the spaCy pipeline.
    pos_tags = [token.pos_ for token in nlp(text)]
    n_tokens = len(pos_tags)
    features['noun_frequency'] = pos_tags.count('NOUN') / n_tokens if n_tokens else 0
    features['verb_frequency'] = pos_tags.count('VERB') / n_tokens if n_tokens else 0

    # 1 when the text contains any quotation mark.
    features['quotes_flag'] = int(bool(re.search(r'[“”"\'‘’«»]', text)))

    # 1 when the text contains one of the two "story" stems.
    features['story_flag'] = int(bool(re.search(r'истори|рассказ', text, flags=re.IGNORECASE)))

    # 1 when any question word occurs as a whole word (one alternation
    # instead of one regex search per word).
    question_words = ['кто', 'что', 'где', 'когда', 'почему', 'как', 'какой', 'какая', 'какое', 'какие', 'зачем', 'сколько', 'куда', 'чей']
    question_pattern = r'\b(?:' + '|'.join(question_words) + r')\b'
    features['question_word_flag'] = int(bool(re.search(question_pattern, text, flags=re.IGNORECASE)))

    # 1 when the text has a superlative marker word or a superlative suffix.
    # The marker words used to be looked up among the pieces produced by the
    # notebook-global `tokenizer`, which is the GPT-2 BPE tokenizer; those are
    # sub-word pieces rather than words, so the check is done on plain
    # word-level tokens here instead.
    special_words = {'самый', 'самая', 'самое', 'самые'}
    words = re.findall(r'\w+', text.lower())
    special_word_flag = any(word in special_words for word in words)
    suffix_flag = bool(re.search(r'айш|ейш', text, flags=re.IGNORECASE))
    features['superlative_flag'] = int(special_word_flag or suffix_flag)

    return compressed_vector, features

In [None]:
from sklearn.preprocessing import StandardScaler

def preprocess_texts(texts):
    """Extract vectors and standardized hand-crafted features for a batch.

    Args:
        texts: An iterable of headline strings. A single string is treated as
            a batch of one headline (it used to be iterated character by
            character).

    Returns:
        A ``(vectors, df_features_scaled)`` tuple: the list of per-text
        vectors returned by ``extract_features`` and a DataFrame with one row
        per text holding the features after ``StandardScaler.fit_transform``.
    """
    if isinstance(texts, str):
        texts = [texts]

    vectors = []
    feature_rows = []
    for text in texts:
        vector, features = extract_features(text)
        vectors.append(vector)
        feature_rows.append(features)

    df_features = pd.DataFrame(feature_rows)
    # The scaler is fitted on the batch passed in, so the scaled values depend
    # on which texts are processed together.
    # NOTE(review): for a one-text batch this presumably yields all zeros;
    # to match training, the scaler fitted at training time should be reused.
    scaler = StandardScaler()
    df_features_scaled = pd.DataFrame(scaler.fit_transform(df_features), columns=df_features.columns)

    return vectors, df_features_scaled

In [None]:
def predict_clickbait_probability(vector, features, classifier):
    """Query the classifier for a single headline.

    Args:
        vector: 12 numbers (array or array-like, any shape with 12 elements),
            e.g. the compressed vector returned by ``extract_features``.
        features: The 6 hand-crafted features, either as a dict (values are
            taken in insertion order) or as any array-like with 6 elements,
            such as a one-row DataFrame.
        classifier: Object whose ``predict([X_vector, X_features])`` returns a
            2-D result; the value at ``[0][1]`` is returned.

    Returns:
        ``classifier.predict(...)[0][1]``.

    Raises:
        ValueError: If ``vector`` does not hold exactly 12 values or
            ``features`` does not hold exactly 6.
    """
    # np.asarray also accepts lists and DataFrames, which have no usable
    # .reshape / .values() and previously made this function crash.
    X_vector = np.asarray(vector).reshape(1, 1, 12)

    feature_values = list(features.values()) if isinstance(features, dict) else features
    X_features = np.asarray(feature_values).reshape(1, 6)

    probability = classifier.predict([X_vector, X_features])[0][1]

    return probability

In [None]:
import math

In [None]:
def compute_reward(input_text, generated_text, clickbait_classifier):
    """Score a generated token sequence with the clickbait classifier.

    Args:
        input_text: Token ids of the prompt. Currently unused by the reward;
            kept so existing callers keep working.
        generated_text: Token ids of the generated continuation (tensor or
            nested list; any shape, it is flattened).
        clickbait_classifier: Classifier passed through to
            ``predict_clickbait_probability``.

    Returns:
        A float32 scalar tensor on ``device`` equal to ``exp(p) ** 2`` where
        ``p`` is the value returned by ``predict_clickbait_probability``,
        minus 1 when the decoded generation is empty or whitespace only.
    """

    def tokens_to_text(tokens):
        # Decode the whole sequence in one call. Decoding token by token and
        # joining with spaces (the previous behaviour) inserts a space between
        # every BPE piece.
        token_ids = torch.as_tensor(tokens).reshape(-1).tolist()
        return tokenizer.decode(token_ids, skip_special_tokens=True)

    generated_text = tokens_to_text(generated_text)

    # Empty generations are penalised but still scored.
    penalty = 0 if generated_text.strip() else -1

    # extract_features returns exactly the (vector, feature dict) pair that
    # predict_clickbait_probability consumes. The previous call to
    # preprocess_texts received a bare string and returned a list and a
    # DataFrame, which the predictor cannot reshape.
    # NOTE(review): the features are passed unscaled; if the classifier was
    # trained on standardized features, apply the training-time scaler here.
    compressed_vector, features = extract_features(generated_text)

    # NOTE(review): the reward grows with the classifier output at index 1;
    # confirm which class that index denotes.
    clickbait_probability = predict_clickbait_probability(compressed_vector, features, clickbait_classifier)
    clickbait_probability = math.exp(clickbait_probability) ** 2
    reward = clickbait_probability + penalty
    return torch.tensor(reward, dtype=torch.float32).to(device)

class TextGenerationEnv(gym.Env):
    """Episodic environment in which every action appends one token to a prompt.

    An episode starts from the tokenized text ``texts[num_episode]`` and ends
    after ``max_length`` actions. The reward is ``0.0`` on every step except
    the last one, where ``compute_reward`` scores the generated tokens with
    the notebook-global ``clickbait_classifier``.
    """

    def __init__(self, tokenizer, texts, max_length=10):
        """Store the prompts and declare the action/observation spaces.

        Args:
            tokenizer: Tokenizer providing ``encode`` and ``vocab_size``.
            texts: Sequence of prompt strings, indexed by episode number.
            max_length: Number of tokens generated per episode.
        """
        super(TextGenerationEnv, self).__init__()
        self.texts = texts  # Prompts to train on
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Action space: one discrete action per vocabulary token. This used to
        # be a bare int, which is not a gym Space.
        self.action_space = spaces.Discrete(tokenizer.vocab_size)

        # Observation space: token ids.
        # NOTE(review): reset/step return token-id tensors whose length
        # depends on the prompt, so this fixed-shape Box is only nominal.
        self.observation_space = spaces.Box(
            low=0, high=tokenizer.vocab_size, shape=(max_length,), dtype=np.int32
        )

        self.current_text = None
        self.current_step = 0
        self.generated_text = []

    def reset(self, num_episode):
        """Start the episode for ``texts[num_episode]`` and return its token ids."""
        # The prompt is selected by index, not at random. The dtype is given
        # explicitly so that an empty prompt still yields an integer tensor.
        token_ids = self.tokenizer.encode(self.texts[num_episode])
        self.current_text = torch.tensor(token_ids, dtype=torch.long).to(device)
        self.current_step = 0
        self.generated_text = []
        return self.current_text

    def step(self, action):
        """Append one generated token and report the new state.

        Args:
            action: The sampled token id, either as an int or as a
                one-element list.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the prompt
            followed by all tokens generated so far, ``reward`` is ``0.0``
            until the final step (then the tensor from ``compute_reward``),
            ``done`` is True once ``max_length`` tokens have been generated
            and ``info`` is an empty dict.
        """
        self.generated_text.append(action)
        self.current_step += 1

        # The episode ends after max_length generated tokens.
        done = self.current_step >= self.max_length

        # The state must contain the tokens generated so far; returning only
        # the prompt (the previous behaviour) made every token be sampled from
        # the same prompt-only context.
        generated = torch.as_tensor(
            self.generated_text, dtype=torch.long, device=self.current_text.device
        ).reshape(-1)
        state = torch.cat([self.current_text, generated])

        # The reward is computed only at the end of the episode.
        reward = 0.0
        if done:
            self.generated_text = torch.tensor(self.generated_text).to(device)
            reward = compute_reward(self.current_text, self.generated_text, clickbait_classifier)

        return state, reward, done, {}

In [None]:
# Load the tokenizer and causal language model for the checkpoint below.
model_name = "ai-forever/rugpt3small_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
# NOTE(review): this resizes the embedding matrix to one row fewer than
# len(tokenizer); the usual call is resize_token_embeddings(len(tokenizer)).
# Confirm the "- 1" is intentional, since the highest token id would then
# have no embedding row.
model.resize_token_embeddings(len(tokenizer) - 1)
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

In [None]:
# Headlines used as prompts during fine-tuning, taken from the 'Заголовок'
# column of the sampled CSV.
df = pd.read_csv('random_sample.csv', encoding='utf8')
texts = df['Заголовок'].tolist()

In [None]:
import warnings
# Silence every Python warning raised from here on.
warnings.filterwarnings('ignore')
import logging
# Only let the transformers logger emit records at ERROR level or above.
logging.getLogger("transformers").setLevel(logging.ERROR)

In [None]:
# Policy-gradient (REINFORCE) fine-tuning: for every prompt, sample tokens from
# the model, score the finished episode with the environment's reward and
# raise the log-probability of the sampled tokens in proportion to it.
env = TextGenerationEnv(tokenizer, texts)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 10
for epoch in range(num_epochs):
    epoch_rewards = []
    for episode in tqdm(range(len(texts))):
        done = False
        state = env.reset(episode)
        log_probs = []  # log pi(action | state) of every sampled token, with grad
        while not done:
            state_tensor = torch.as_tensor(state, device=device).unsqueeze(0)
            # The forward pass runs with gradients enabled: the loss below
            # needs the graph from the sampled tokens back to the weights.
            logits = model(state_tensor).logits
            step_log_probs = torch.log_softmax(logits[:, -1, :], dim=-1)
            # Sampling itself is not differentiated.
            action = torch.multinomial(step_log_probs.detach().exp(), num_samples=1).item()
            log_probs.append(step_log_probs[0, action])
            next_state, reward, done, _ = env.step([action])
            state = next_state
        epoch_rewards.append(reward.item())
        optimizer.zero_grad()
        # The previous `loss = -reward` with requires_grad=True had no graph
        # to the model, so optimizer.step() never changed a single weight.
        # NOTE(review): no baseline is subtracted from the reward; a running
        # mean baseline would reduce the variance of this estimator.
        loss = -(reward.detach() * torch.stack(log_probs).sum())
        loss.backward()
        optimizer.step()
    epoch_reward = np.mean(epoch_rewards)
    # Checkpoint after every epoch; save_pretrained creates the directory.
    model.save_pretrained(f"content/trained_model_epoch_{epoch + 1}")


    with open(f'content/reward_episode_{epoch + 1}.txt', 'w') as rewards_file:
        rewards_file.write(f'Mean Reward for epoch {epoch + 1}: {epoch_reward}\n')

    print(f'Epoch {epoch + 1}: Mean Reward = {epoch_reward}')

model.save_pretrained("content/final_trained_model")
print("Модель полностью обучена и сохранена!")

In [None]:
import pandas as pd

file1_df = pd.read_csv('data.csv')
file2_df = pd.read_csv('random_sample.csv')

# Whole rows of random_sample.csv, as tuples, for membership tests.
file2_set = set(map(tuple, file2_df.values.tolist()))

# Keep the rows of data.csv that do not occur in random_sample.csv. Filtering
# data.csv in place of taking a set difference keeps the original row order;
# iterating a set of tuples gave an order that changed from run to run.
is_unique_to_file1 = [
    row not in file2_set for row in map(tuple, file1_df.values.tolist())
]
unique_in_file1 = file1_df[is_unique_to_file1].drop_duplicates()

# NOTE: this rebinds `df`, which earlier held the training sample.
df = unique_in_file1.reset_index(drop=True)

df.to_csv('output.csv', index=False)

In [None]:
# Select the rows whose 'Кликбейт/не кликбейт' label equals 1 and display them.
is_clickbait = df['Кликбейт/не кликбейт'].eq(1)
filtered_df = df.loc[is_clickbait]
filtered_df

Unnamed: 0,Заголовок,Кликбейт/не кликбейт
1,"Советник посла Казахстана, избивавший жену, хо...",1
4,Сиамские близнецы Люпита и Кармен шокировали м...,1
9,«Будет резкое ощущение позитива»: экономист Ми...,1
23,Какой бизнес оказался самым востребованным пос...,1
25,Будущее 5-летней малышки разбилось о скалы… Де...,1
...,...,...
6206,"Три буквы, а сколько ужаса: как ЕГЭ превратилс...",1
6208,Больше никаких дедлайнов и лайков? Госдума бор...,1
6211,Приложила руки к асфальту с запекшейся кровью:...,1
6213,Как распознать афериста — три правила от Марка...,1


In [None]:
# Headlines to rewrite at inference time (this rebinds `texts`, which earlier
# held the training prompts).
texts = filtered_df['Заголовок'].tolist()

In [None]:
# Ad-hoc check of how the tokenizer encodes the HTML entity '&nbsp;'.
tokenizer.encode_plus('&nbsp;')

{'input_ids': [10, 38512, 31], 'attention_mask': [1, 1, 1]}

In [None]:
from transformers import AutoModelForCausalLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCausalLM.from_pretrained("content/final_trained_model").to(device)

MAX_PROMPT_TOKENS = 150
# Same generation budget as before (total length 256 with prompts padded to 150).
MAX_NEW_TOKENS = 256 - MAX_PROMPT_TOKENS

for input_text in texts:
    # One prompt at a time needs no padding. Right-padding a decoder-only
    # model to a fixed length (the previous behaviour) makes it continue
    # after a run of pad tokens instead of after the headline.
    inputs = tokenizer.encode_plus(input_text, return_tensors='pt', truncation=True, max_length=MAX_PROMPT_TOKENS)
    # The tokenizer already returns tensors; move them without re-wrapping.
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    if input_ids.shape[1] == 0:
        # Nothing to condition on for an empty headline.
        continue

    outputs = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=MAX_NEW_TOKENS, pad_token_id=tokenizer.eos_token_id)
    # Decode only the continuation; slicing off the prompt tokens replaces the
    # earlier string replace, which relied on the decoded prompt matching the
    # input text character for character.
    new_tokens = outputs[0][input_ids.shape[1]:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    generated_text = generated_text.replace('\n', ' ')
    print(f"Исходный текст: {input_text}")
    print(f"Сгенерированный заголовок: {generated_text}\n")