In [1]:
!pip install --quiet transformers sacremoses


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import nltk
import re
import sklearn

from tqdm import tqdm
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from sklearn.model_selection import StratifiedKFold
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize

In [3]:
!export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
!export TOKENIZERS_PARALLELISM=false

In [4]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# if torch.backends.mps.is_available():
#     device = torch.device('mps')
#     x = torch.ones(1, device=device)
#     print(x)
# else:
#     print("MPS device not found.")

device = "cpu"
    
print(f"Current device: {device}")

Current device: cpu


In [5]:
train_df = pd.read_csv('../datasets/train.csv')
test_df = pd.read_csv('../datasets/test.csv')

### Processing

In [6]:
# Download the Snowball stemmer for Russian language
nltk.download('stopwords')
nltk.download('punkt')

# Create a Snowball stemmer for Russian
stemmer = SnowballStemmer("russian")

def collapse_dots(input):
    # Collapse sequential dots
    input = re.sub("\.+", ".", input)
    # Collapse dots separated by whitespaces
    all_collapsed = False
    while not all_collapsed:
        output = re.sub(r"\.(( )*)\.", ".", input)
        all_collapsed = input == output
        input = output
    return output

def process_text(input):
    if isinstance(input, str):
        input = " ".join(tokenize.sent_tokenize(input))
        input = re.sub(r"http\S+", "", input)
        input = re.sub(r"\n+", ". ", input)
        for symb in ["!", ",", ":", ";", "?"]:
            input = re.sub(rf"\{symb}\.", symb, input)
        input = re.sub("[^а-яА-Яa-zA-Z0-9!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ё]+", " ", input)
        input = re.sub(r"#\S+", "", input)
        input = collapse_dots(input)
        input = input.strip()
        # input = input.lower()
    return input

train_df["Content_processed"] = train_df["Content"].apply(process_text)
test_df["Content_processed"] = test_df["Content"].apply(process_text)

# Tokenize the text using NLTK for Russian language
train_df['Content_tokenized'] = train_df['Content_processed'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x, language='russian')]))
test_df['Content_tokenized'] = test_df['Content_processed'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x, language='russian')]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danorel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/danorel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Translator & Fake detector

In [7]:
mname = "Helsinki-NLP/opus-mt-ru-en"
translation_tokenizer = AutoTokenizer.from_pretrained(mname)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(mname)
translation_model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,),

### Fake detection

In [8]:
fake_detection_tokenizer = AutoTokenizer.from_pretrained("vikram71198/distilroberta-base-finetuned-fake-news-detection")
fake_detection_model = AutoModelForSequenceClassification.from_pretrained("vikram71198/distilroberta-base-finetuned-fake-news-detection")
fake_detection_model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [9]:
def translate_and_predict_fake(X_train):
    y_pred = np.array([])
    
    for x_train in tqdm(X_train):
        x_train_translated_ids = translation_tokenizer(
            x_train,
            max_length=512,
            truncation=True,
            return_tensors="pt"
        ).to(device).input_ids

        x_train_output = translation_model.generate(input_ids=x_train_translated_ids)

        x_train_translated = translation_tokenizer.batch_decode(
            x_train_output, 
            skip_special_tokens=True
        )

        x_train_fake_detection_input = fake_detection_tokenizer(
            x_train_translated, 
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(device)

        x_train_fake_detection_output = fake_detection_model(**x_train_fake_detection_input)["logits"]
        x_train_fake_detection_detached_output = x_train_fake_detection_output.detach()

        fake_detection_softmax = nn.Softmax(dim = 1)
        x_train_fake_detection_prediction_probabilities = list(fake_detection_softmax(x_train_fake_detection_detached_output))

        x, y = x_train_fake_detection_prediction_probabilities[0]
        print(x_train_fake_detection_prediction_probabilities)
        
        y_sample = 1 if x < y else 3
        y_pred = np.append(y_pred, y_sample)
        
        print(f"Prediction for sentence {x_train_translated} is {y_sample}")
        
    return y_pred

### Fine-tuning

In [10]:
!pip install --quiet accelerate -U
!pip install --quiet 'transformers[torch]' datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [11]:
from torch.utils.data import Dataset
from datasets import load_metric

In [12]:
class NewsDataset(Dataset):
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    # convert to pytorch tensors
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item 
    
    def __len__(self):
        return len(self.labels)

In [13]:
acc_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

# we'll log several evaluation metrics as accuracy alone does not show the whole picture 
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = acc_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    return {"acc": acc, "precision": precision, "recall": recall, "f1 score": f1}

  acc_metric = load_metric("accuracy")


In [14]:
seed = 18

# separate data into data and labels 
features = train_df['Content_processed'].tolist()
targets = train_df['Suspicious_Level'].tolist()

# splitting data into training, testing, and validation sets 
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(features, targets, test_size=0.20, random_state=seed)
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X_train, y_train, test_size=0.10, random_state=seed)

In [15]:
train_encodings = fake_detection_tokenizer(X_train, truncation=True, padding=True)         
test_encodings = fake_detection_tokenizer(X_test, truncation=True, padding=True)      
val_encodings = fake_detection_tokenizer(X_val, truncation=True, padding=True)

In [16]:
train_dataset = NewsDataset(train_encodings, y_train)
test_dataset = NewsDataset(test_encodings, y_test)
val_dataset = NewsDataset(val_encodings, y_val)

In [22]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
)

In [23]:
trainer = Trainer(
    model=fake_detection_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [24]:
trainer.train()

Step,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 2.49 GB, other allocations: 4.29 GB, max allowed: 6.80 GB). Tried to allocate 48.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
trainer.evaluate(test_dataset)

### Evaluation

In [None]:
X_test = test_df['Content_processed'].tolist()
y_test = translate_and_predict_fake(X_test)

In [None]:
test_df['Suspicious_Level'] = y_test

In [None]:
test_df[['MessageId', 'Suspicious_Level']].to_csv('translation_submission.csv', index=False)