In [83]:
import os
import pandas as pd

import re
import time
import random
import string
from typing import Any

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder


import matplotlib.pyplot as plt
%matplotlib inline

In [106]:
# Seed passed to seed_all() and to train_test_split() so runs are repeatable.
RANDOM_SEED = 42
# Directory that holds train.csv and test.csv, relative to the working directory.
ROOT_DIR = '../datasets/'

In [107]:
def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and, when PyTorch is
    installed, PyTorch's global RNG (which drives ``DataLoader`` shuffling).

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    # torch is only imported in a later cell, so import it locally here.
    # Skipping silently keeps the function usable in torch-free environments.
    try:
        import torch
    except ImportError:
        return
    torch.manual_seed(seed_value)
# Seed the RNGs once, before any splitting or training happens.
seed_all(RANDOM_SEED)

In [108]:
# Load the training and test tables from ROOT_DIR.
train_path = os.path.join(ROOT_DIR, 'train.csv')
test_path = os.path.join(ROOT_DIR, 'test.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

## Naive Bayes

In [109]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize

# Hold out 20% of the training table as a development set.
# The explicit copies make train_data/dev_data independent frames, so adding
# columns to them later does not trigger pandas' SettingWithCopyWarning.
train_data, dev_data = train_test_split(train_df, test_size=0.2, random_state=RANDOM_SEED)
train_data = train_data.copy()
dev_data = dev_data.copy()
# test_data is an alias of test_df (not a copy): columns added to one appear on the other.
test_data = test_df

# Download the NLTK resources: stopword lists and the Punkt tokenizer models
nltk.download('stopwords')
nltk.download('punkt')

# Create a Snowball stemmer for Russian
stemmer = SnowballStemmer("russian")

def collapse_dots(input):
    """Collapse runs of dots into a single dot.

    A run is two or more dots that are adjacent or separated only by space
    characters, e.g. ``"..."``, ``". ."`` or ``".. . .."``; each run becomes
    one ``"."``. Other whitespace (tabs, newlines) between dots does not join
    them into a run.

    Args:
        input: Text to normalise.

    Returns:
        The text with every such run replaced by a single dot.
    """
    # A single pass suffices: the pattern consumes the whole run greedily,
    # so no fixed-point loop is needed.
    return re.sub(r"\.(?: *\.)+", ".", input)

def process_text(input):
    """Clean one raw message before tokenization.

    Non-string values are returned unchanged. For strings the steps are, in
    order:

    1. split into sentences with NLTK and re-join them with single spaces;
    2. remove URLs (anything starting with ``http`` up to the next whitespace);
    3. turn each run of newlines into ``". "``;
    4. drop a dot that directly follows one of ``! , : ; ?``;
    5. replace every run of characters outside the whitelist (Cyrillic and
       Latin letters, digits, ASCII punctuation except the backslash) with a
       single space;
    6. remove hashtags (``#`` followed by non-whitespace);
    7. collapse repeated dots and strip surrounding whitespace.

    Args:
        input: Raw message text, or a non-string placeholder.

    Returns:
        The cleaned string, or ``input`` itself when it is not a string.
    """
    if not isinstance(input, str):
        return input

    input = " ".join(tokenize.sent_tokenize(input))
    input = re.sub(r"http\S+", "", input)
    input = re.sub(r"\n+", ". ", input)
    # "!." -> "!", ",." -> ",", and so on, in a single substitution.
    input = re.sub(r"([!,:;?])\.", r"\1", input)
    # The ranges а-я / А-Я do not contain ё / Ё, so both are listed explicitly.
    # Without the upper-case Ё, words that start with it lose their first letter.
    input = re.sub(
        r"""[^а-яА-ЯёЁa-zA-Z0-9!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]+""",
        " ",
        input,
    )
    input = re.sub(r"#\S+", "", input)
    input = collapse_dots(input)
    return input.strip()

def _stem_tokens(text):
    """Tokenize text with NLTK's Russian tokenizer and stem every token."""
    return [stemmer.stem(token) for token in word_tokenize(text, language='russian')]


# Clean the raw text, then tokenize and stem it, for every split.
for frame in (train_data, dev_data, test_data):
    frame["Content_processed"] = frame["Content"].apply(process_text)
for frame in (train_data, dev_data, test_data):
    frame['tokenized_content'] = frame['Content_processed'].apply(_stem_tokens)

# Labels as tensors, shifted down by one so they lie in [0, num_classes - 1]
y_train = torch.tensor(train_data['Suspicious_Level'].values) - 1
y_dev = torch.tensor(dev_data['Suspicious_Level'].values) - 1

# Bag-of-words counts over the stemmed tokens; the vocabulary is fitted on the
# training split only and reused for the dev and test splits.
vectorizer = CountVectorizer()
train_docs = train_data['tokenized_content'].apply(' '.join)
dev_docs = dev_data['tokenized_content'].apply(' '.join)
test_docs = test_data['tokenized_content'].apply(' '.join)
X_train = vectorizer.fit_transform(train_docs)
X_dev = vectorizer.transform(dev_docs)
X_test = vectorizer.transform(test_docs)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danylovanin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danylovanin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [110]:
# Create a naive Bayes classifier
class NaiveBayes(nn.Module):
    """Linear bag-of-words classifier with additive (Laplace) smoothing.

    Despite the name, the parameters are not estimated from counts: ``theta``
    and ``bias`` are ordinary ``nn.Parameter`` tensors initialised to zero and
    are meant to be fitted by an optimizer.

    Args:
        num_classes: Number of output classes (rows of ``theta``).
        num_features: Size of the input feature vector (columns of ``theta``).
        alpha: Additive smoothing constant used by ``laplace_smoothing``.
    """

    def __init__(self, num_classes, num_features, alpha=1.0):
        super().__init__()
        self.num_classes = num_classes
        self.num_features = num_features
        self.alpha = alpha  # Laplace smoothing parameter

        # Per-class feature weights and per-class offsets
        self.theta = nn.Parameter(torch.zeros(num_classes, num_features))
        self.bias = nn.Parameter(torch.zeros(num_classes))

    def forward(self, x):
        """Return unnormalised class scores ``x @ theta.T + bias``.

        Args:
            x: Tensor whose last dimension has ``num_features`` entries.

        Returns:
            Tensor of scores with ``num_classes`` entries in the last dimension.
        """
        return x @ self.theta.t() + self.bias

    def laplace_smoothing(self, x):
        """Apply additive smoothing along dim 1 so every row sums to one.

        Computes ``(x + alpha) / (row_sum + alpha * K)`` where ``K`` is the
        number of columns of ``x``. Taking ``K`` from ``x`` (instead of always
        using ``num_features``) keeps the result a proper distribution when
        the method is applied to ``(batch, num_classes)`` scores. For inputs
        with ``num_features`` columns the result is unchanged. The per-row
        argmax and any row-normalised loss are unaffected, because the
        denominator is constant within a row.

        Args:
            x: 2-D tensor of non-negative values.

        Returns:
            Tensor of the same shape with smoothed, row-normalised values.
        """
        num_categories = x.size(1)
        return (x + self.alpha) / (x.sum(dim=1, keepdim=True) + self.alpha * num_categories)


# Instantiate the model, loss and optimizer, and set hyperparameters
num_classes = 3  # Three Suspicious_Level classes, encoded as 0..2 after the label shift
num_features = X_train.shape[1]  # vocabulary size of the fitted CountVectorizer
laplace_alpha = 1.0  # You can adjust this parameter
model = NaiveBayes(num_classes, num_features, alpha=laplace_alpha)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert the sparse count matrices to dense float32 tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_dev_tensor = torch.tensor(X_dev.toarray(), dtype=torch.float32)

# Create DataLoader for training. The explicitly seeded generator makes the
# shuffle order, and therefore the trained weights, reproducible across kernel
# restarts, independently of the global torch RNG state.
train_dataset = TensorDataset(X_train_tensor, y_train)
shuffle_generator = torch.Generator().manual_seed(RANDOM_SEED)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, generator=shuffle_generator)

# Training loop: mini-batch Adam updates over the whole training set, repeated
# num_epochs times.
num_epochs = 700
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # Exponentiate the linear scores and smooth them, then take the log so
        # the criterion receives log-domain values.
        # NOTE(review): torch.exp can overflow for large scores - confirm that
        # scores stay in a safe range for long documents.
        likelihoods = model.laplace_smoothing(torch.exp(outputs))
        log_likelihoods = torch.log(likelihoods)
        loss = criterion(log_likelihoods, labels)
        loss.backward()
        optimizer.step()

# Predict classes for the held-out development split
model.eval()
with torch.no_grad():
    outputs = model(X_dev_tensor)
    likelihoods = model.laplace_smoothing(torch.exp(outputs))
    _, predicted = torch.max(likelihoods, 1)


# Display how many dev examples were assigned to each class. minlength keeps
# classes that were never predicted in the report, with a count of 0.
class_counts = np.bincount(predicted.numpy(), minlength=num_classes)
for class_label, count in enumerate(class_counts):
    print(f"Class {class_label}: {count} occurrences")

# True dev labels, for a visual comparison with the counts above
print(y_dev.numpy())

Class 0: 86 occurrences
Class 1: 23 occurrences
Class 2: 9 occurrences
[1 1 0 2 2 1 0 1 0 0 0 1 0 0 2 1 0 1 0 0 0 2 0 1 2 0 1 0 0 0 0 2 0 1 0 0 1
 0 0 0 0 1 0 0 1 1 0 2 2 0 2 0 0 0 0 2 0 0 2 0 0 0 0 0 0 1 2 1 0 0 0 0 0 0
 1 0 0 0 0 0 2 1 0 1 0 0 0 2 0 0 2 1 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 2 0
 0 0 0 0 0 1 0]


In [111]:
# Check that labels and predictions have the same length, then score the dev split.
dev_labels = y_dev.numpy()
dev_predictions = predicted.numpy()
print(len(dev_labels))
print(len(dev_predictions))
# Calculate accuracy
accuracy = accuracy_score(dev_labels, dev_predictions)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

118
118
Test Accuracy: 77.97%


In [112]:
# Macro-averaged F1 of the current predictions against the dev labels.
# NOTE(review): despite its name, `f2_score` holds an F1 score (sklearn's f1_score).
f2_score = f1_score(y_dev.numpy(), predicted.numpy(), average='macro')
print(f'Test F1 Score: {f2_score * 100:.2f}%')

Test F1 Score: 63.15%


In [113]:
# Dense float32 feature matrix for the test messages
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)

# Predict a class for every test message.
# NOTE: this overwrites `predicted`, which previously held the dev-split
# predictions; the dev metric cells above must not be re-run after this cell.
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    likelihoods = model.laplace_smoothing(torch.exp(outputs))
    _, predicted = torch.max(likelihoods, 1)

In [114]:
# Undo the "- 1" shift applied to the training labels so predictions use the original label values.
back_to_normal = predicted + 1

In [115]:
# Store the predicted levels on the test frame; X_test was built from test_df, so row order matches.
test_df['Suspicious_Level'] = back_to_normal

In [116]:
# Distribution of the predicted levels across the test messages
test_df['Suspicious_Level'].value_counts()

Suspicious_Level
1    980
2    139
3     52
Name: count, dtype: int64

In [118]:
# Export message ids with their predicted levels to CSV (row index omitted).
test_df[['MessageId', 'Suspicious_Level']].to_csv('naive_bayes_laplace_normalised.csv', index=False)