In [1]:
!pip install --quiet scikit-learn sentence_transformers pandas wordcloud torch spacy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
import os
import pandas as pd
import spacy
import sklearn
import torch
import nltk
import re
import numpy as np

from matplotlib import pyplot as plt
from nltk import tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split
from wordcloud import WordCloud, STOPWORDS

In [4]:
train_df = pd.read_csv('datasets/train.csv')

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ChannelName       587 non-null    object
 1   ChannelId         587 non-null    int64 
 2   MessageId         587 non-null    int64 
 3   Date              587 non-null    object
 4   EditDate          573 non-null    object
 5   Content           587 non-null    object
 6   Suspicious_Level  587 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 32.2+ KB


In [6]:
test_df = pd.read_csv('datasets/test.csv')

In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1171 entries, 0 to 1170
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ChannelName  1171 non-null   object
 1   ChannelId    1171 non-null   int64 
 2   MessageId    1171 non-null   int64 
 3   Date         1171 non-null   object
 4   EditDate     884 non-null    object
 5   Content      1171 non-null   object
dtypes: int64(2), object(4)
memory usage: 55.0+ KB


In [8]:
train_df.head(n=3)

Unnamed: 0,ChannelName,ChannelId,MessageId,Date,EditDate,Content,Suspicious_Level
0,boris_rozhin,1101806611,91626,2023-07-08 16:11:34,2023-07-08 16:11:47,Работа наших бойцов к югу от Артемовска. Работ...,2
1,sashakots,1109403194,40853,2023-07-08 16:44:44,2023-07-08 16:44:58,"Анкара нарушила договорённости, отпустив глава...",1
2,swodki,1144180066,280668,2023-07-09 02:00:23,2023-07-09 02:05:53,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1


### Preprocessing

In [9]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer

model = SentenceTransformer("distiluse-base-multilingual-cased-v2")

nlp_ru = spacy.load('ru_core_news_md', disable=["parser", "ner"])

def text_processing_pipeline(texts):
    # Step 1: Tokenization and Lemmatization with spaCy (Russian)
    processed_texts = []
    for text in texts:
        doc = nlp_ru(text)
        processed_text = " ".join([token.lemma_ for token in doc if not token.is_stop])
        processed_texts.append(processed_text)

    # Step 2: TF-IDF Vectorization
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_texts)

    # Step 3: Normalize TF-IDF matrix
    tfidf_matrix_normalized = normalize(tfidf_matrix)

    # Step 4: Sentence Embeddings with SentenceTransformer
    sentence_embeddings = []
    for i in range(len(texts)):
        # Get TF-IDF weights for each word in the processed text
        tfidf_weights = tfidf_matrix_normalized[i].toarray().flatten()

        # Encode the sentence using SentenceTransformer
        sentence_embedding = model.encode([texts[i]])[0]

        # Weighted sum of word embeddings based on TF-IDF weights
        weighted_sum_embedding = np.sum(sentence_embedding * tfidf_weights[:, None], axis=0)
        sentence_embeddings.append(weighted_sum_embedding)

    return np.vstack(sentence_embeddings)

In [10]:
train_df.head(n=3)

Unnamed: 0,ChannelName,ChannelId,MessageId,Date,EditDate,Content,Suspicious_Level
0,boris_rozhin,1101806611,91626,2023-07-08 16:11:34,2023-07-08 16:11:47,Работа наших бойцов к югу от Артемовска. Работ...,2
1,sashakots,1109403194,40853,2023-07-08 16:44:44,2023-07-08 16:44:58,"Анкара нарушила договорённости, отпустив глава...",1
2,swodki,1144180066,280668,2023-07-09 02:00:23,2023-07-09 02:05:53,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1


### Visualizations

In [11]:
if os.path.exists('train_news_embeddings.npy'):
    train_news_embeddings = np.load('train_news_embeddings.npy')
else:
    train_news_embeddings = text_processing_pipeline(train_df['Content'])
    np.save('train_news_embeddings.npy', train_news_embeddings)

In [12]:
if os.path.exists('test_news_embeddings.npy'):
    test_news_embeddings = np.load('test_news_embeddings.npy')
else:
    test_news_embeddings = text_processing_pipeline(test_df['Content'])
    np.save('test_news_embeddings.npy', test_news_embeddings)

### Implementation

#### Similarity search

In [13]:
# Number of folds (K)
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

X = train_df.drop('Suspicious_Level', axis=1)  # Adjust 'target_column' to the actual column name containing your labels
y = train_df['Suspicious_Level']

# Initialize a counter for the fold number
fold_number = 1

f1_score_folds = []

for train_index, val_index in skf.split(X, y):
    X_train, X_val = (
        train_news_embeddings[train_index],
        train_news_embeddings[val_index]
    )    
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    y_pred = np.array([])
    for x_val in X_val:
        max_cosine_score, max_idx = None, None
        cosine_scores = torch.as_tensor(np.array([util.cos_sim(x_val, x_train)[0] for x_train in X_train])).reshape(-1)
        top_results = torch.topk(cosine_scores, k=1)
        max_idx = top_results[1].item()
        y_pred = np.append(y_pred, y.iloc[max_idx])
    
    # Evaluate the model (e.g., using f1 score)
    f1_score = sklearn.metrics.f1_score(y_val, y_pred, average='macro')
    f1_score_folds.append(f1_score)
    
    # Print the fold number and silhouette score
    print(f"Fold {fold_number} - F1 Score: {f1_score}")
    
    # Compute confusion matrix
    print(sklearn.metrics.classification_report(y_pred, y_val))

    # Increment the fold number
    fold_number += 1
    
f1_mean_score = sum(f1_score_folds) / num_folds
print(f"F1 Score: {f1_mean_score}")

Fold 1 - F1 Score: 0.34558395032242656
              precision    recall  f1-score   support

         1.0       0.70      0.67      0.68        81
         2.0       0.11      0.12      0.11        25
         3.0       0.23      0.25      0.24        12

    accuracy                           0.51       118
   macro avg       0.35      0.35      0.35       118
weighted avg       0.53      0.51      0.52       118

Fold 2 - F1 Score: 0.36275907830808346
              precision    recall  f1-score   support

         1.0       0.66      0.66      0.66        77
         2.0       0.36      0.32      0.34        31
         3.0       0.08      0.10      0.09        10

    accuracy                           0.53       118
   macro avg       0.37      0.36      0.36       118
weighted avg       0.53      0.53      0.53       118

Fold 3 - F1 Score: 0.34252873563218383
              precision    recall  f1-score   support

         1.0       0.59      0.65      0.62        69
         2.0

### Submission

#### Test

In [14]:
X_train, X_test = train_news_embeddings, test_news_embeddings
y_train = train_df['Suspicious_Level']

y_pred = np.array([])
for x_test in X_test:
    cosine_scores = torch.as_tensor(np.array([util.cos_sim(x_train, x_test)[0] for x_train in X_train])).reshape(-1)
    top_results = torch.topk(cosine_scores, k=1)
    max_idx = top_results[1].item()
    y_pred = np.append(y_pred, y_train.iloc[max_idx])

In [15]:
y_pred

array([3., 1., 1., ..., 1., 2., 2.])

In [16]:
test_df['Suspicious_Level'] = y_pred.astype(int)

In [17]:
test_df.head(n=25)

Unnamed: 0,ChannelName,ChannelId,MessageId,Date,EditDate,Content,Suspicious_Level
0,ukraina_ru,1175084215,168641,2023-09-11 16:57:36,2023-09-11 16:57:50,"⚡️Администрация Байдена близка к тому, чтобы о...",3
1,bear007,1505866568,36569,2023-09-11 14:52:01,2023-09-11 14:52:12,РИА Новости публикует первые кадры с места гиб...,1
2,zakharprilepin,1217080686,19697,2023-09-11 08:26:09,,Вице-спикер Госдумы Петр Толстой - о сроках ок...,1
3,denazi_ua,1783083983,49296,2023-09-11 12:30:37,2023-09-11 12:32:55,В Германии не осталось действующих специалисто...,3
4,spletnicca,1287596810,11784,2023-09-11 08:47:19,,Военные медики рассказали правду о войне и её ...,1
5,openukraine,1200595041,48960,2023-09-11 17:30:32,2023-09-11 17:30:38,"‼️🇺🇦""Боритесь и поборете"": Украинцы призывают ...",3
6,ukraina_ru,1175084215,168567,2023-09-11 07:12:09,2023-09-11 07:12:18,"⚡️Украинские женщины-медики, которые не встану...",1
7,nach_shtabu,1202009667,24176,2023-09-11 12:33:40,2023-09-11 12:35:53,"СБУ с поличным задержала двух чиновников, кото...",3
8,stranaua,1092413834,122178,2023-09-11 03:44:16,2023-09-11 03:53:09,Днепропетровскую область с утра атаковали раке...,1
9,solovievlive,1315735637,208223,2023-09-11 03:57:46,2023-09-11 03:58:01,❗️ВСУ за ночь выпустили 35 снарядов по населен...,2


In [18]:
test_df[['MessageId', 'Suspicious_Level']].to_csv('test_embeddings.csv', index=False)

#### Train

In [35]:
X_train = torch.as_tensor(train_news_embeddings)

y_pred = np.array([])
for x_test in X_train:
    cosine_scores = torch.as_tensor(
        np.array([
            util.cos_sim(x_train, x_test)[0] 
            for x_train in X_train 
            if not torch.all(x_train.eq(x_test)).item()
        ])
    ).reshape(-1)
    top_results = torch.topk(cosine_scores, k=1)
    max_idx = top_results[1].item()
    y_pred = np.append(y_pred, y_train.iloc[max_idx])
train_df['Suspicious_Level'] = y_pred.astype(int)

In [36]:
train_df[['MessageId', 'Suspicious_Level']].to_csv('train_embeddings.csv', index=False)