In [1]:
!pip install --quiet transformers sacremoses


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import nltk
import re

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from sklearn.model_selection import StratifiedKFold
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [4]:
train_df = pd.read_csv('../datasets/train.csv')
test_df = pd.read_csv('../datasets/test.csv')

In [5]:
X_train, y_train = train_df.iloc[:10]['Content'].values.tolist(), train_df.iloc[:10]['Suspicious_Level']
X_test = test_df.iloc[:10]['Content'].values.tolist()

### Processing

In [6]:
# import spacy

# nlp_ru = spacy.load('ru_core_news_md', disable=["parser", "ner"])

# def text_processing(texts):
#     processed_texts = []
#     for text in texts:
#         doc = nlp_ru(text)
#         processed_text = " ".join([token.lemma_ for token in doc if not token.is_stop])
#         processed_texts.append(processed_text)
#     return processed_texts

In [7]:
# Download the Snowball stemmer for Russian language
nltk.download('stopwords')
nltk.download('punkt')

# Create a Snowball stemmer for Russian
stemmer = SnowballStemmer("russian")

def collapse_dots(input):
    # Collapse sequential dots
    input = re.sub("\.+", ".", input)
    # Collapse dots separated by whitespaces
    all_collapsed = False
    while not all_collapsed:
        output = re.sub(r"\.(( )*)\.", ".", input)
        all_collapsed = input == output
        input = output
    return output

def process_text(input):
    if isinstance(input, str):
        input = " ".join(tokenize.sent_tokenize(input))
        input = re.sub(r"http\S+", "", input)
        input = re.sub(r"\n+", ". ", input)
        for symb in ["!", ",", ":", ";", "?"]:
            input = re.sub(rf"\{symb}\.", symb, input)
        input = re.sub("[^а-яА-Яa-zA-Z0-9!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ё]+", " ", input)
        input = re.sub(r"#\S+", "", input)
        input = collapse_dots(input)
        input = input.strip()
        # input = input.lower()
    return input

train_df["Content_processed"] = train_df["Content"].apply(process_text)
test_df["Content_processed"] = test_df["Content"].apply(process_text)

# Tokenize the text using NLTK for Russian language
train_df['Content_tokenized'] = train_df['Content_processed'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x, language='russian')]))
test_df['Content_tokenized'] = test_df['Content_processed'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x, language='russian')]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danorel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/danorel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Translator & Fake detector

In [8]:
mname = "Helsinki-NLP/opus-mt-ru-en"
translation_tokenizer = AutoTokenizer.from_pretrained(mname)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(mname)

# mname = "facebook/wmt19-en-ru"
# translation_tokenizer = FSMTTokenizer.from_pretrained(mname)
# translation_model = FSMTForConditionalGeneration.from_pretrained(mname)

### Fake detection

In [9]:
fake_detection_tokenizer = AutoTokenizer.from_pretrained("vikram71198/distilroberta-base-finetuned-fake-news-detection")
fake_detection_model = AutoModelForSequenceClassification.from_pretrained("vikram71198/distilroberta-base-finetuned-fake-news-detection")

In [10]:
def translate_and_predict_fake(X_train):
    y_pred = np.array([])
    
    for x_train in tqdm(X_train):
        x_train_translated_ids = translation_tokenizer(
            x_train,
            max_length=512,
            truncation=True,
            return_tensors="pt"
        ).input_ids

        x_train_output = translation_model.generate(input_ids=x_train_translated_ids)

        x_train_translated = translation_tokenizer.batch_decode(
            x_train_output, 
            skip_special_tokens=True
        )

        x_train_fake_detection_input = fake_detection_tokenizer(
            x_train_translated, 
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        x_train_fake_detection_output = fake_detection_model(**x_train_fake_detection_input)["logits"]
        x_train_fake_detection_detached_output = x_train_fake_detection_output.detach()

        fake_detection_softmax = nn.Softmax(dim = 1)
        x_train_fake_detection_prediction_probabilities = list(fake_detection_softmax(x_train_fake_detection_detached_output).detach().numpy())

        x, y = x_train_fake_detection_prediction_probabilities[0]
        print(x_train_fake_detection_prediction_probabilities)
        
        y_sample = 1 if x < y else 3
        y_pred = np.append(y_pred, y_sample)
        
        print(f"Prediction for sentence {x_train_translated} is {y_sample}")
        
    return y_pred

### Cross-validation

In [11]:
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

In [12]:
X = train_df.iloc[50:100]['Content_processed']
y = train_df.iloc[50:100]['Suspicious_Level']

In [13]:
fold_number = 1
f1_score_folds = []

for train_index, val_index in skf.split(X, y):
    X_train, X_val = X.iloc[train_index].values.tolist(), X.iloc[val_index].values.tolist()
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    y_pred = translate_and_predict_fake(X_train)
    
    f1_score = sklearn.metrics.f1_score(y_train, y_pred, average='macro')
    f1_score_folds.append(f1_score)
    
    print(f"Fold {fold_number} - F1 Score: {f1_score}")
    print(sklearn.metrics.classification_report(y_pred, y_val))

    fold_number += 1
    
f1_mean_score = sum(f1_score_folds) / num_folds
print(f"F1 Score: {f1_mean_score}")

  2%|██▋                                                                                                        | 1/40 [00:01<00:41,  1.05s/it]

Prediction for sentence ['Vladimir Rogov reports that in the Rostov region, air defences have successfully struck two air targets.'] is 1


  5%|█████▎                                                                                                     | 2/40 [00:02<00:47,  1.24s/it]

Prediction for sentence ['Kreminna fights: the big ones destroy the enemy with C-60 anti-aircrafts. Fighters travel several times a day to perform tasks and destroy targets that are known by drone operators / TASS/.'] is 1


  8%|████████                                                                                                   | 3/40 [00:03<00:42,  1.14s/it]

Prediction for sentence ['The Kremlin claimed that Ankara had broken the agreements by releasing the heads of Azov to Ukraine, but no one informed Russia, declared the RIA of Peskov News.'] is 1


 10%|██████████▋                                                                                                | 4/40 [00:06<01:08,  1.92s/it]

Prediction for sentence ["Why is it important to support the Russian army? The events of the last few days could seriously undermine the approval of the country's military leadership, but the army is not only the superiors, it's the ordinary guys who were taken out of their families, and the real patriots of the country who were sent by volunteers. The collapse of the army could have catastrophic consequences, which we can see from our history. It's not acceptable. So I decided to make cards that explain this position in an easy way. Help the Russians!"] is 1


 12%|█████████████▍                                                                                             | 5/40 [00:13<02:04,  3.56s/it]

Prediction for sentence ['The large-scale exodus of the population of Ukraine will lead to problems in restoring the economy after the end of hostilities in Ukraine. For example, according to information from the public organization EasyBusines and the Centre for Economic Recovery, the failure to return forced migrants to Ukraine will cost only one third of the forced migrants to return to the country in 10 years (to mention also the $45 billion lost to Ukraine). Moreover, according to preliminary estimates, only one third of the displaced migrants will return to Ukraine today (from 3.8 million to 4.7 million forced migrants, of whom approximately 1.4 million are of working age). At best, more and more Ukrainians are finding employment abroad, sending their children to local schools and kindergartens, receiving vocational training to find work there. However, even if most migrants return to grow at a rate of 7 per cent a year, Ukraine needs to attract between 3.1 and 4.5 million worker

 15%|████████████████                                                                                           | 6/40 [00:23<03:20,  5.91s/it]

Prediction for sentence ['The report of the Ministry of Defence of the Russian Federation on the progress of the special military operation (as of 9 July 2023) Part 2. On the Kupjan track, active actions by the Western Group of Forces, fire by operational tactical and army aircraft, fire by artillery, damage to the enemy &apos; s living force and equipment in the areas of Novomlysk, Kamenka, Olshana, Kislovan, Timkovka and Borškiv Kharkiv region; destroyed up to 20 Ukrainian military personnel, two vehicles, a self-propelled artillery installation by Gvazdik, as well as an ammunition depot of the 127th territorial defence brigade in the area of the city of Storica, Harkiv region; on the Herson direction of fire per day destroyed up to 60 Ukrainian soldiers, one tank, defence posts of eight cars, self-propelled Gaubica Gordika, as well as an artillery unit by K777 United States production; on the operational and military air defence brigade, and on the basis of fire by the forces of the

 18%|██████████████████▋                                                                                        | 7/40 [00:36<04:32,  8.27s/it]

Prediction for sentence ["Military expert, author of Pancerwafli/Panzerwaffle. The story of the 155-mm supply of cluster munitions to Ukraine is politically more interesting than the military. Indeed, the fundamentally new military capabilities are not giving, and in fact this is such a time, which Americans are making quite openly. So the US President's National Security Adviser, Jake Sullivan, said that the ammunition would serve as a bridge again until the production of conventional munitions can be built up. The problem of the depletion of conventional ammunition stocks has also been confirmed by Byden. At a press conference and in an interview with CNN, the President of the United States said: this war is based on ammunition. And they [the Ukraine] are running out. And they are running away. At the same time, the fact that it is not too pleasant, forced and even dangerous for the troops to re-enact it."] is 1


 20%|█████████████████████▍                                                                                     | 8/40 [00:37<03:08,  5.90s/it]

Prediction for sentence ["Friends, help us! We're going to publish this message and hope that at least a few people will find themselves concerned!"] is 1


 22%|████████████████████████                                                                                   | 9/40 [00:46<03:30,  6.80s/it]

Prediction for sentence ["Savva Fedoseev/Plurinational: In Chechnya, police officers severely tortured and robbed military servicemen under contract. Application to open a criminal case that merely described the heinous details of the incident at the end of May 2023. The torturers were subjected to at least three HF 65384. All of them would not retrace (read the creaks), bring a small passage: the Lieutenant Colonel entered the room, who came to the military unit and presented himself to the station chief. When he came in, he said that he would show me 95 years, took me behind the back of the head, put me away so that my body was under 90 degrees. Then another hand would hit me hard in the back of the head. When he had finished, he would say to his staff that he had gone into the room, that he had gone to prayer, and if there were no results, he would take us back. We think it's perfectly understandable that it's all human."] is 1


 25%|██████████████████████████▌                                                                               | 10/40 [00:46<02:23,  4.79s/it]

Prediction for sentence ['A minute of humor.'] is 1


 28%|█████████████████████████████▏                                                                            | 11/40 [00:57<03:16,  6.79s/it]

Prediction for sentence ["It's clear that our task is to kill their personnel. But I don't see them before the end of the 2nd decade of August. I think they're gonna go to that place until 15-20 August. Then they're gonna fall down, and by autumn they're gonna die. Someone's gonna wait that the West is finally gonna talk to us and with the bar hand is gonna let us go? Who's gonna let us go? Who's gonna let us go? Who's go? Who's gonna do it? Who's gonna do it? Who's gonna do it? Who's gonna do it? Who's gonna do it? Who's gonna do it? Who's gonna do it? Who's gonna do it? Who's gonna do it? Who's gonna do it?"] is 1


 30%|███████████████████████████████▊                                                                          | 12/40 [01:00<02:32,  5.44s/it]

Prediction for sentence ['On 8 July, as at 8 p.m., additional information was received on the operational lines of the PRC on civilian casualties in the cities of the Republic: Pantelmonica: on the street of Stepnoy, 9 men were injured in 1981. The impact of the shelling continues to be reported.'] is 3


 32%|██████████████████████████████████▍                                                                       | 13/40 [01:04<02:17,  5.10s/it]

Prediction for sentence ['Ukraine launched a missile attack on the Crimean Bridge. Our air defences have not been damaged. Crimeans have reported several explosions in the air (from three to six in various sources). Sergei Aksenov has reported the shooting down of a cruise missile. The movement on the Crimean Bridge has been temporarily suspended.'] is 1


 35%|█████████████████████████████████████                                                                     | 14/40 [01:05<01:37,  3.75s/it]

Prediction for sentence ['Erdogan said he looked forward to meeting Putin in August.'] is 3


 38%|███████████████████████████████████████▊                                                                  | 15/40 [01:14<02:13,  5.36s/it]

Prediction for sentence ['Ukraine &apos; s allies continue to dispose of military equipment in order to rearrange the army with Western specimens. The Mi-24 helicopters that the Poles secretly assigned to the U.S.A. should have written off, as well as all weapons to them: it is old and does not fit NATO standards. The MI-24 in the Polish army should be replaced by American Apache, which should be equipped with nearly 100 grand. Although the Poles have not yet received a single copy. Even the eight used vehicles promised by Washington in May as a bonus. They are likely to be granted as a discount for a serious delay in the supply of F-35A Lightning II fighter aircraft. When the Apache still reaches Poland, they are scheduled to be handed over to the 18th Mechanized Division responsible for the defence of the Suvalk Corridor by the NATO Eastern Flang.'] is 1


 40%|██████████████████████████████████████████▍                                                               | 16/40 [01:34<03:54,  9.78s/it]

Prediction for sentence ['It\'s not less than ten minutes to prepare an Italian 120-mm mortar system for a 1963-style mortar system, but it\'s going to take at least ten minutes to prepare the Italian 120-mm mortar system for a 1963-style mortar system. It\'s going to take at least ten minutes to prepare the electrocutions of a 1963-type mortar. The infantry, which is in the trenches about three kilometres ahead, asks "to shoot fast because the Russians are attacking." Colonel Kohny Yandulski (Serhii Iandulski), 37 years, the commander of the first battalion of the 10-year-old mortar brigade to the north of the United States to the north of Bahmut in the direction of the disputed city of Soledar, thanks to the images transmitted in a continuous manner by the cameras of the two small mavic-3 drones, half an hour ago saw the arrival of the three armoured vehicles, from which the mobile groups of the enemy came forward, the number of the forces that immediately started the offensive. It\'

 42%|█████████████████████████████████████████████                                                             | 17/40 [01:48<04:15, 11.09s/it]

Prediction for sentence ['There are more and more new parts in the offensive that have been so carefully hoarded and prepared by the U.S.A. The offensives of our forces under Kupansk and Kremna have forced new brigades to be deployed there. Similarly, the futile fear of a bunch of brigades for two months around the small Blade has led to the sending of a fresh brigade there (but not to say that this changed the course of battle). Comparing the so-called leaked report (it is possible to look at the approximate equipment of the brigades), which allegedly revealed the U.S.S. plans for the offensive - all the new brigades are represented in the image and which of them are already engaged (the location can be seen). The offensive in the south. 33 meh. Brigade. 37 machetes. 47 meh. Brigade. 21 brigade. 47 art. brigade. 32 m. brigade. 31 m. brigade. 23 m. brigade.'] is 1


 45%|███████████████████████████████████████████████▋                                                          | 18/40 [01:50<03:04,  8.38s/it]

Prediction for sentence ['The cluster munitions that Washington sent to Ukraine do not explode in 14% of cases.'] is 1


 48%|██████████████████████████████████████████████████▎                                                       | 19/40 [01:53<02:23,  6.84s/it]

Prediction for sentence ['"Our grandfathers have taught us how to burn German tanks. The 70th regiment of the 58th Army, the BARS-1, which is part of this regiment, and the drone operators of the "Caribbean Wolfs" team meet the Germans," comments on the video by D. Rogozin.'] is 3


 50%|█████████████████████████████████████████████████████                                                     | 20/40 [01:55<01:47,  5.39s/it]

Prediction for sentence ['Dmitry Steshin laughs at Ukrainian air defences, and the idea is an interesting one that significantly increases the number of shootings.'] is 1


 52%|███████████████████████████████████████████████████████▋                                                  | 21/40 [02:07<02:18,  7.28s/it]

Prediction for sentence ["This is the first time we've had an autobot project that looks like a working version. We've given the guys money (and more specifically, one person has paid for them, receipts and scripts are the same for law enforcement). Five months these people (two people) tell us daily strange things, they don't have enough, they don't have enough, we don't have enough, we don't have enough, we don't have enough money to report from Zaporiza. 1. There's a report to the law enforcement agencies on fraud. 2. These guys had only 100 followers before we gave them information, and they're strongly advised not to give them money. 3. We're not gonna leave this situation alone, and we're gonna thank God we've got those people for the first time and hope for the last time. Alexander, who really liked this project and who paid them for the battalion, who served the same situation and supported our decision to the law enforcement agencies."] is 1


 55%|██████████████████████████████████████████████████████████▎                                               | 22/40 [02:16<02:19,  7.74s/it]

Prediction for sentence ["People, there's a request! A woman caring for an injured son at Burdenco Hospital, Resuscitation N 56, Hospital Square 3, Corp. 21, Moscow reported that there are boys lying next to her son, wounded, unconscious; Demin Alexey 1986, Gluško Victor, if I'm not mistaken 1996, Milehin Nikolai 1983 or 85 gr. Maybe someone is looking for these guys from different groups, chat rooms or personal contacts. Please forward this information to their contacts, Sarafan radio often works best and is very fast, thank you for your indifference, maybe it's someone's son, husband or brother. Telephone +7 916 601-67, Olga. +7 (499) 263-55-44 resuscitation, 24 hours a day. +7 (499) 263-55-55 resuscitation, 24 hours a day."] is 1


 57%|████████████████████████████████████████████████████████████▉                                             | 23/40 [02:22<02:04,  7.30s/it]

Prediction for sentence ["And you know who's a truly underrated organization that's just doing a very important job in the current war? Sisters of Mercy. Orthodox young girls, whom any soldier who's in the hospital speaks with love and admiration for having been shot in battle, are the ones who will feed you, give you clean clothes and, in a spiritual sense, wash your feet just for being breast-fed to protect your stepfather. If you're reading this, please accept sincere thanks from me and from every wounded person you've helped. You are truly beautiful girls and women whose beauty is not only of appearance but also of glorious deeds. Thank you, sisters, you are our pride."] is 1


 60%|███████████████████████████████████████████████████████████████▌                                          | 24/40 [02:24<01:33,  5.85s/it]

Prediction for sentence ['Western junk on the Zaporozhsky track. The U.S.U. is openly afraid to get into foreign technology (Leopards, Bradley). "As soon as the Russian Federation sees it, everything is coming" from the testimony of Ukrainian prisoners.'] is 3


 62%|██████████████████████████████████████████████████████████████████▎                                       | 25/40 [02:26<01:09,  4.63s/it]

Prediction for sentence ['Today is a very sad day for us.'] is 1


 65%|████████████████████████████████████████████████████████████████████▉                                     | 26/40 [02:27<00:49,  3.57s/it]

Prediction for sentence ["Friends, help us! We're going to publish this message and hope that at least a few people will find themselves concerned!"] is 1


 68%|███████████████████████████████████████████████████████████████████████▌                                  | 27/40 [02:44<01:35,  7.37s/it]

Prediction for sentence ["An attempt to strike the bridge could be linked to pressure on the Russian leadership to extend the grain deal, which ends on 17 July. But this option is unlikely, because Ukrainian units have been instructed not to work in this direction for the time being. In this situation, the way in which it is presented in the Ukrainian media, especially in the context of the return of the leaders of Azov to Ukraine: supposedly a new reality for Crimea and Russia as a whole. This attack, like in the Rostov region, is more like a part of a strategic information operation whose purpose is to raise the moral and psychological state of the members of the US before a new phase of the offensive. It is no secret that the loss of the U.S.U.'s links involved in the front-line fighting is enormous. Some of the ones on the Zaporizsk track have already been sent to pre-assed. These strikes should convince both Ukrainian units and mercenaries that they are winning. This is a plan to 

 70%|██████████████████████████████████████████████████████████████████████████▏                               | 28/40 [02:49<01:21,  6.83s/it]

Prediction for sentence ["It doesn't take much for us to survive and win our common cause. Our common cause doesn't ask for huge sums, we don't need anyone to work hard and make serious contributions. Great things always start small. Let your donation be at least 10 or 50 rubles. But if a few concerned people come together, it'll already be a tangible support. SBERBANK map: 4276160925483621. Don't get past it. A couple dozen rubles won't even buy bread, it won't make you poor, but it'll give you a lot of support for all of us."] is 1


 70%|██████████████████████████████████████████████████████████████████████████▏                               | 28/40 [02:52<01:14,  6.17s/it]


KeyboardInterrupt: 