In [1]:
!pip install --quiet transformers sacremoses


KeyboardInterrupt



In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import nltk
import re
import sklearn

from tqdm import tqdm
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from sklearn.model_selection import StratifiedKFold
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize

In [None]:
# Select the compute device once for the whole notebook.
# Preference order: CUDA GPU, then Apple-silicon MPS, then CPU.
# `torch.backends.mps` is missing on older PyTorch builds, so it is probed with
# getattr instead of being accessed directly (which would raise AttributeError).
if torch.cuda.is_available():
    device = "cuda:0"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = "cpu"
    # Only report a missing accelerator when we really fall back to the CPU.
    print("No CUDA or MPS device found, falling back to CPU.")

print(f"Current device: {device}")

In [None]:
# Read the train and test CSV files from the sibling `datasets` directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train.csv', '../datasets/test.csv')
)

In [None]:
# First 10 rows only: texts as plain Python lists, labels as a pandas Series.
X_train = train_df['Content'].head(10).values.tolist()
y_train = train_df['Suspicious_Level'].head(10)
X_test = test_df['Content'].head(10).values.tolist()

### Processing

In [None]:
# Download the NLTK data packages this notebook relies on: the stop-word corpus
# and the Punkt models behind `sent_tokenize` / `word_tokenize`.
# (The Snowball stemmer itself ships with NLTK and needs no download.)
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load Punkt from the 'punkt_tab' package instead of the
# pickled 'punkt' one; fetching both keeps the tokenizers working on old and
# new NLTK versions alike.
nltk.download('punkt_tab')

# Create a Snowball stemmer for Russian
stemmer = SnowballStemmer("russian")

def collapse_dots(input):
    """Collapse every run of dots into a single dot.

    A "run" is a dot followed by any number of further dots, each optionally
    preceded by space characters, so both ``"..."`` and ``". . ."`` become
    ``"."``. Spaces that are not between two dots are left untouched.

    Args:
        input: The string to clean.

    Returns:
        The string with each dot run replaced by exactly one ``"."``.
    """
    # A single greedy pass is equivalent to first squeezing consecutive dots and
    # then repeatedly merging space-separated dots until nothing changes.
    # The raw string avoids the invalid "\." escape of a plain string literal.
    return re.sub(r"\.(?: *\.)*", ".", input)

def process_text(input):
    """Clean one raw message; non-string values are returned unchanged.

    Steps, in order:
      1. Split into sentences with NLTK and re-join them with single spaces.
      2. Remove URLs (anything starting with ``http`` up to the next whitespace).
      3. Turn newline runs into ``". "``.
      4. Drop a dot that directly follows ``! , : ; ?``.
      5. Replace every run of characters outside the whitelist (Cyrillic and
         Latin letters, digits, ASCII punctuation except the backslash) with
         one space.
      6. Remove hashtags (``#`` followed by non-whitespace characters).
      7. Collapse dot runs via ``collapse_dots`` and strip the ends.

    Args:
        input: The raw text, or any non-string value (e.g. NaN from pandas).

    Returns:
        The cleaned string, or ``input`` itself when it is not a ``str``.
    """
    if not isinstance(input, str):
        return input

    text = " ".join(tokenize.sent_tokenize(input))
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\n+", ". ", text)
    for symb in ["!", ",", ":", ";", "?"]:
        # re.escape keeps "?" (a regex metacharacter) literal.
        text = re.sub(re.escape(symb) + r"\.", symb, text)
    # "а-яА-Я" does not cover "ё"/"Ё", so both are listed explicitly; previously
    # only the lowercase letter was whitelisted and capital "Ё" was blanked out.
    # The backslash stays excluded, exactly as in the original pattern.
    text = re.sub(
        r"""[^а-яА-ЯёЁa-zA-Z0-9!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]+""",
        " ",
        text,
    )
    text = re.sub(r"#\S+", "", text)
    text = collapse_dots(text)
    return text.strip()

def stem_text(text):
    """Tokenize `text` with NLTK (Russian) and join the Snowball stems with spaces.

    Non-string values are returned unchanged, mirroring `process_text`, so a
    missing message does not crash the tokenizer.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(stemmer.stem(word) for word in word_tokenize(text, language='russian'))


# Cleaned text for both splits.
train_df["Content_processed"] = train_df["Content"].apply(process_text)
test_df["Content_processed"] = test_df["Content"].apply(process_text)

# Stemmed, space-joined tokens of the cleaned text for both splits.
train_df['Content_tokenized'] = train_df['Content_processed'].apply(stem_text)
test_df['Content_tokenized'] = test_df['Content_processed'].apply(stem_text)

### Translator & Fake detector

In [None]:
# Load the tokenizer and seq2seq model of the `Helsinki-NLP/opus-mt-ru-en`
# checkpoint (presumably Russian -> English, going by its name).
mname = "Helsinki-NLP/opus-mt-ru-en"
translation_tokenizer = AutoTokenizer.from_pretrained(mname)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(mname)
# Move the model to the device chosen earlier; as the last expression of the
# cell this also displays the model.
translation_model.to(device)

### Fake detection

In [None]:
# Tokenizer and sequence-classification model come from the same checkpoint,
# so name it once instead of repeating the identifier.
fake_detection_mname = "vikram71198/distilroberta-base-finetuned-fake-news-detection"
fake_detection_tokenizer = AutoTokenizer.from_pretrained(fake_detection_mname)
fake_detection_model = AutoModelForSequenceClassification.from_pretrained(fake_detection_mname)
# Place the classifier on the selected device (also echoes the model in the cell output).
fake_detection_model.to(device)

In [None]:
def translate_and_predict_fake(X_train, verbose=True):
    """Translate each text with the translation model and classify the result.

    Every input text is translated by `translation_model`, the translation is
    fed to `fake_detection_model`, and the two class probabilities are turned
    into a suspicion level.

    Args:
        X_train: Iterable of text strings. Despite the name, any split can be
            passed; the notebook also calls this on the test texts.
        verbose: When True (the default, matching the previous behaviour) the
            class probabilities and the translated sentence are printed for
            every sample.

    Returns:
        A 1-D float64 ``np.ndarray`` with one value per input: ``1`` when the
        probability at index 1 is larger than the one at index 0, else ``3``.
        NOTE(review): which classifier index means "fake" is not visible here;
        confirm the 1/3 mapping against the model card.
    """
    # The classifier may still be in training mode (e.g. right after
    # fine-tuning); active dropout would make predictions non-deterministic.
    translation_model.eval()
    fake_detection_model.eval()

    predictions = []

    # Inference only: no autograd graph is needed, which saves memory and time.
    with torch.no_grad():
        for text in tqdm(X_train):
            # Send inputs to the device each model actually lives on, which may
            # differ from the global `device` if a model was moved afterwards.
            translation_inputs = translation_tokenizer(
                text,
                max_length=512,
                truncation=True,
                return_tensors="pt",
            ).to(translation_model.device)

            generated_ids = translation_model.generate(**translation_inputs)

            # List with a single translated string.
            translated = translation_tokenizer.batch_decode(
                generated_ids,
                skip_special_tokens=True,
            )

            classifier_inputs = fake_detection_tokenizer(
                translated,
                truncation=True,
                max_length=512,
                return_tensors='pt',
            ).to(fake_detection_model.device)

            logits = fake_detection_model(**classifier_inputs)["logits"]
            # One row per input sentence; only one sentence is processed at a time.
            prob_first, prob_second = torch.softmax(logits, dim=1)[0].tolist()

            y_sample = 1 if prob_first < prob_second else 3
            predictions.append(y_sample)

            if verbose:
                print([prob_first, prob_second])
                print(f"Prediction for sentence {translated} is {y_sample}")

    # Collect in a list and convert once (np.append copies the array each time);
    # float dtype keeps the array type the function has always returned.
    return np.array(predictions, dtype=float)

### Fine-tuning

In [None]:
!pip install --quiet accelerate -U
!pip install --quiet 'transformers[torch]' datasets

In [None]:
from torch.utils.data import Dataset
from datasets import load_metric

In [None]:
class NewsDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence and
    `labels` is an indexable sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the sample field by field, converting each entry to a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# we'll log several evaluation metrics as accuracy alone does not show the whole picture
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for a Trainer evaluation.

    The metrics are computed directly with NumPy, so nothing has to be
    downloaded and the cell does not depend on `datasets.load_metric`.

    Args:
        eval_pred: A ``(predictions, labels)`` pair where ``predictions`` is a
            2-D array of per-class scores and ``labels`` holds the true class ids.

    Returns:
        Dict with the float entries ``"acc"``, ``"precision"``, ``"recall"``
        and ``"f1 score"``. When all observed class ids are 0/1 the last three
        describe the positive class (1), as binary averaging did before;
        otherwise they are macro-averaged over every observed class instead of
        raising on multiclass targets. Undefined ratios count as 0.0.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)
    labels = np.asarray(labels)

    acc = float(np.mean(predicted == labels)) if labels.size else 0.0

    observed = np.union1d(labels, predicted)
    if np.isin(observed, [0, 1]).all():
        # Binary problem: report the positive class only.
        classes = [1]
    else:
        classes = observed.tolist()

    precisions, recalls, f1s = [], [], []
    for cls in classes:
        true_pos = int(np.sum((predicted == cls) & (labels == cls)))
        false_pos = int(np.sum((predicted == cls) & (labels != cls)))
        false_neg = int(np.sum((predicted != cls) & (labels == cls)))

        cls_precision = true_pos / (true_pos + false_pos) if true_pos + false_pos else 0.0
        cls_recall = true_pos / (true_pos + false_neg) if true_pos + false_neg else 0.0
        denominator = cls_precision + cls_recall
        cls_f1 = 2 * cls_precision * cls_recall / denominator if denominator else 0.0

        precisions.append(cls_precision)
        recalls.append(cls_recall)
        f1s.append(cls_f1)

    precision = float(np.mean(precisions))
    recall = float(np.mean(recalls))
    f1 = float(np.mean(f1s))
    return {"acc": acc, "precision": precision, "recall": recall, "f1 score": f1}

In [None]:
# Fixed random state so both splits below are reproducible.
seed = 18

# Separate the cleaned texts (features) from their labels (targets).
# NOTE(review): the raw `Suspicious_Level` values are used as class ids as-is;
# confirm they fall in the range the classifier head expects.
features = train_df['Content_processed'].tolist()
targets = train_df['Suspicious_Level'].tolist()

# Split into training, validation and held-out test sets: 20% goes to test,
# then 10% of the remaining 80% to validation (about 72% / 8% / 20% overall).
# These assignments overwrite the 10-row X_train / y_train / X_test samples
# created earlier in the notebook.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(features, targets, test_size=0.20, random_state=seed)
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X_train, y_train, test_size=0.10, random_state=seed)

In [None]:
# Encode every split with the classifier's tokenizer, using the same
# truncation and padding options for all three.
train_encodings, test_encodings, val_encodings = (
    fake_detection_tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (X_train, X_test, X_val)
)

In [None]:
# Wrap each (encodings, labels) pair in a NewsDataset for the Trainer.
train_dataset, test_dataset, val_dataset = (
    NewsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (test_encodings, y_test),
        (val_encodings, y_val),
    )
)

In [None]:
# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; the notebook
# installs transformers unpinned, so pick whichever name this version defines.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Fine-tuning configuration: one epoch, batches of 32, evaluation and logging
# every 100 steps.
# NOTE(review): with a single epoch, 500 warm-up steps may exceed the total
# number of optimisation steps on a small dataset; confirm against its size.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    eval_steps=100,
    logging_steps=100,
    **{eval_strategy_arg: "steps"},
)

In [None]:
# Build the Trainer around the already-loaded classifier. It trains
# `fake_detection_model` itself (the same object that
# `translate_and_predict_fake` uses), validates on `val_dataset` and reports
# the values returned by `compute_metrics`.
trainer = Trainer(
    model=fake_detection_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the arguments and datasets given to the Trainer.
trainer.train()

In [None]:
# Score the model on the held-out 20% split of the training data
# (`test_dataset`), not on the rows of test.csv.
trainer.evaluate(test_dataset)

### Evaluation

In [None]:
# Predict on the cleaned texts of test.csv. From here on X_test / y_test no
# longer refer to the held-out split created for fine-tuning.
X_test = test_df['Content_processed'].tolist()
y_test = translate_and_predict_fake(X_test)

In [None]:
# Store the predictions as the label column of the test frame.
test_df['Suspicious_Level'] = y_test

In [None]:
# Write only the id and predicted-level columns, without the index column.
test_df[['MessageId', 'Suspicious_Level']].to_csv('translation_submission.csv', index=False)