In [14]:
import os
import pandas as pd

import re
import time
import random
import string
from typing import Any

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder


import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
# Seed value; passed to seed_all() and to train_test_split(random_state=...) below.
RANDOM_SEED = 42
# Directory containing train.csv and test.csv (a relative path).
ROOT_DIR = '../datasets/'

In [16]:
def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and,
    when PyTorch is installed, its global generator as well, so that shuffled
    mini-batches are repeatable from run to run.

    Args:
        seed_value: Integer seed applied to every generator.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    # torch is imported further down the notebook, so import it locally here;
    # DataLoader(shuffle=True) draws its permutation from torch's global RNG.
    try:
        import torch
    except ImportError:
        return
    torch.manual_seed(seed_value)
# Seed the global RNGs once, before any data is split or any model is trained.
seed_all(RANDOM_SEED)

In [17]:
# Read the train and test CSVs that live under ROOT_DIR.
train_df, test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

## Naive Bayes

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize

# Drop rows whose 'Content' repeats an earlier row, so that after the split the
# same text cannot appear in both the train and the dev part.
train_df = train_df.drop_duplicates(subset='Content', keep='first')

# Hold out 20% of the rows as a dev set. The code below adds columns to these
# frames, so take explicit copies: assigning into the slices returned by
# train_test_split triggers pandas' SettingWithCopyWarning.
train_data, dev_data = train_test_split(train_df, test_size=0.2, random_state=RANDOM_SEED)
train_data, dev_data = train_data.copy(), dev_data.copy()
# Work on a copy so the preprocessing columns are not written into test_df.
test_data = test_df.copy()

# Fetch the NLTK resources: the stop-word lists and the Punkt tokenizer models
nltk.download('stopwords')
nltk.download('punkt')

# Create a Snowball stemmer for Russian
stemmer = SnowballStemmer("russian")

def collapse_dots(input):
    """Collapse each chain of dots, including dots separated only by spaces, into one dot.

    Args:
        input: Text to normalise.

    Returns:
        The text with every maximal chain such as ``"..."`` or ``". . ."``
        replaced by a single ``"."``. Spaces that do not sit between two dots
        are left untouched.
    """
    # Raw string so that `\.` reaches the regex engine as a regex escape
    # instead of being an invalid Python string escape.
    # One dot followed by one or more "(spaces) dot" groups is a whole chain.
    return re.sub(r"\.(?: *\.)+", ".", input)

def process_text(input):
    """Normalise one raw message into lower-cased, lightly cleaned text.

    Steps, in order: re-join the sentences found by NLTK with single spaces,
    drop URLs, turn line breaks into sentence breaks, remove a dot that
    directly follows one of ``! , : ; ?``, replace every run of characters
    outside Cyrillic/Latin letters, digits and ASCII punctuation with one
    space, drop hashtags, collapse repeated dots, strip and lower-case.

    Args:
        input: The message text. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``input`` itself when it is not a ``str``.
    """
    if not isinstance(input, str):
        return input

    input = " ".join(tokenize.sent_tokenize(input))
    input = re.sub(r"http\S+", "", input)
    input = re.sub(r"\n+", ". ", input)
    # "!." -> "!", ",." -> "," and so on.
    input = re.sub(r"([!,:;?])\.", r"\1", input)
    # Allowed: Cyrillic а-я / А-Я plus ё / Ё (both sit outside those ranges),
    # Latin letters, digits and ASCII punctuation. Every other run of
    # characters, spaces included, is squeezed into a single space.
    input = re.sub(
        r"""[^а-яА-ЯёЁa-zA-Z0-9!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]+""",
        " ",
        input,
    )
    input = re.sub(r"#\S+", "", input)
    input = collapse_dots(input)
    return input.strip().lower()

# Lemmatization function
def lemmatize_text(text):
    """Return ``text`` with each NLTK word token replaced by its WordNet lemma.

    Args:
        text: String to split with ``word_tokenize``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in word_tokenize(text))

# Derive the text columns for every split in one pass.
# NOTE(review): 'Content_lemmatized' comes from a second process_text pass;
# lemmatize_text is never called here — confirm that this is intended.
for frame in (train_data, dev_data, test_data):
    frame["Content_processed"] = frame["Content"].apply(process_text)
    frame["Content_lemmatized"] = frame["Content_processed"].apply(process_text)
    # Russian word tokens from NLTK, each reduced to its Snowball stem
    frame['tokenized_content'] = frame['Content_lemmatized'].apply(
        lambda text: [stemmer.stem(token) for token in word_tokenize(text, language='russian')]
    )

# Labels as tensors, shifted to be in the range [0, num_classes - 1]
y_train = torch.tensor(train_data['Suspicious_Level'].values) - 1
y_dev = torch.tensor(dev_data['Suspicious_Level'].values) - 1

# Bag-of-words counts; the vocabulary is fitted on the training split only
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['tokenized_content'].apply(' '.join))
X_dev = vectorizer.transform(dev_data['tokenized_content'].apply(' '.join))
X_test = vectorizer.transform(test_data['tokenized_content'].apply(' '.join))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danorel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/danorel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Create a naive Bayes classifier
class NaiveBayes(nn.Module):
    """Linear per-class scorer with an additive-smoothing helper.

    ``forward`` computes ``x @ theta.T + bias``; ``laplace_smoothing`` adds
    ``alpha`` to every entry and rescales each row.

    Args:
        num_classes: Number of classes (rows of ``theta``, length of ``bias``).
        num_features: Number of input features (columns of ``theta``).
        alpha: Additive smoothing constant used by ``laplace_smoothing``.
    """

    def __init__(self, num_classes, num_features, alpha=1.0):
        super().__init__()
        self.num_classes = num_classes
        self.num_features = num_features
        self.alpha = alpha

        # Trainable weights and per-class offsets, both initialised to zero
        self.theta = nn.Parameter(torch.zeros(num_classes, num_features))
        self.bias = nn.Parameter(torch.zeros(num_classes))

    def forward(self, x):
        """Return one score per class for each row of ``x``."""
        weighted = torch.matmul(x, self.theta.t())
        return weighted + self.bias

    def laplace_smoothing(self, x):
        """Add ``alpha`` to each entry and divide each row by its smoothed total.

        The divisor for a row is that row's sum plus ``alpha * num_features``.
        """
        row_totals = x.sum(dim=1, keepdim=True)
        denominator = row_totals + self.alpha * self.num_features
        return (x + self.alpha) / denominator


# Model, loss and optimiser
num_classes = 3  # distinct Suspicious_Level values (labels were shifted to 0..2)
num_features = X_train.shape[1]
laplace_alpha = 1.0  # Laplace smoothing strength; adjust as needed
model = NaiveBayes(num_classes, num_features, alpha=laplace_alpha)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Densify the count matrices into float32 tensors
X_train_tensor, X_dev_tensor = (
    torch.tensor(matrix.toarray(), dtype=torch.float32) for matrix in (X_train, X_dev)
)

# Shuffled mini-batches of 32 training rows
train_dataset = TensorDataset(X_train_tensor, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop
# NOTE(review): torch.exp on the raw scores can overflow once scores grow
# large — confirm they stay in a safe range over all epochs.
num_epochs = 700
for epoch in range(num_epochs):
    model.train()
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        smoothed = model.laplace_smoothing(torch.exp(model(batch_features)))
        loss = criterion(torch.log(smoothed), batch_labels)
        loss.backward()
        optimizer.step()

# Predict on the held-out dev split (gradients are not needed for inference)
model.eval()
with torch.no_grad():
    outputs = model(X_dev_tensor)
    likelihoods = model.laplace_smoothing(torch.exp(outputs))
    _, predicted = torch.max(likelihoods, 1)


# Display how often each class was predicted. minlength keeps the highest
# class indices in the listing (with a count of 0) when they are never predicted.
class_counts = np.bincount(predicted.numpy(), minlength=num_classes)
for class_label, count in enumerate(class_counts):
    print(f"Class {class_label}: {count} occurrences")

print(y_dev.numpy())

In [None]:
# Check that labels and predictions have the same length, then score accuracy.
# NOTE: the printed label says "Test", but these are the dev-split labels.
dev_labels = y_dev.numpy()
dev_predictions = predicted.numpy()
print(len(dev_labels))
print(len(dev_predictions))
accuracy = accuracy_score(dev_labels, dev_predictions)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Macro-averaged F1 on the dev split.
# NOTE(review): the variable is named f2_score but holds an F1 score.
f2_score = f1_score(y_true=y_dev.numpy(), y_pred=predicted.numpy(), average='macro')
print(f'Test F1 Score: {f2_score * 100:.2f}%')

In [None]:
# Dense float32 features for the test set
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)

# Predict a class index for every test row
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    likelihoods = model.laplace_smoothing(torch.exp(outputs))
    predicted = likelihoods.argmax(dim=1)

In [None]:
# Undo the earlier "- 1" label shift so predictions are back on the original scale
back_to_normal = predicted + 1

In [None]:
# Convert to a NumPy array first so the column holds plain integers instead of
# depending on how pandas coerces a torch tensor.
test_df['Suspicious_Level'] = np.asarray(back_to_normal)

In [None]:
# How many test rows were assigned to each level by the Naive Bayes model
test_df['Suspicious_Level'].value_counts()

In [None]:
# Write the predictions: one row per MessageId with its predicted Suspicious_Level
test_df[['MessageId', 'Suspicious_Level']].to_csv('naive_bayes_laplace_normalised.csv', index=False)

In [None]:
# Load an earlier prediction file for comparison.
# NOTE(review): nothing in the visible notebook writes this file — it must
# already exist in the working directory for this cell to run.
prev_df = pd.read_csv('naive_bayes_laplace_normalised_prev.csv')

In [None]:
# Levels of the rows whose prediction differs from the previously saved file
changed = test_df['Suspicious_Level'] != prev_df['Suspicious_Level']
test_df.loc[changed, 'Suspicious_Level'].value_counts()

## SVM

In [50]:
# Re-read both CSVs so this section starts from freshly loaded frames.
train_df, test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Normalize the text with process_text
train_df["Content_processed"] = train_df["Content"].apply(process_text)
test_df["Content_processed"] = test_df["Content"].apply(process_text)

# X is the feature (cleaned 'Content'), y is the target ('Suspicious_Level')
X = train_df['Content_processed']
y = train_df['Suspicious_Level']

# Hold out 20% for evaluation, seeded with the notebook-wide RANDOM_SEED.
# NOTE: X_train / X_test / y_train overwrite the variables of the same name
# created in the Naive Bayes section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

# Convert text data to TF-IDF features (fitted on the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=999)  # You can adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create and train the Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the held-out split
y_pred = svm_model.predict(X_test_tfidf)




In [52]:
# Score the SVM predictions against the held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


f1_svm_score = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f'Test F1 Score: {f1_svm_score * 100:.2f}%')

# Per-class precision / recall / F1
svm_report = classification_report(y_test, y_pred)
print('Classification Report:\n', svm_report)

Test Accuracy: 76.27%
Test F1 Score: 62.56%
Classification Report:
               precision    recall  f1-score   support

           1       0.77      0.96      0.86        75
           2       0.68      0.48      0.57        27
           3       0.83      0.31      0.45        16

    accuracy                           0.76       118
   macro avg       0.76      0.58      0.63       118
weighted avg       0.76      0.76      0.74       118



In [53]:
# Show the SVM's predictions for the held-out split
y_pred

array([1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2,
       1, 2, 3, 1, 3, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 3,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 3,
       1, 1, 1, 1, 1, 1, 2, 1])

In [54]:
# Cleaned test text, produced by process_text in the SVM set-up cell
X_submit = test_df["Content_processed"]
# Reuse the already-fitted vectorizer: transform only, no refit
X_submit_tfidf = tfidf_vectorizer.transform(X_submit)
# SVM predictions for every test row
y_submit = svm_model.predict(X_submit_tfidf)

In [55]:
# Store the SVM predictions as the test frame's label column
test_df['Suspicious_Level'] = y_submit

In [56]:
# Summary statistics of the predicted levels
test_df['Suspicious_Level'].describe()

count    1171.000000
mean        1.073442
std         0.282970
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: Suspicious_Level, dtype: float64

In [57]:
# How many test rows the SVM assigned to each level
test_df['Suspicious_Level'].value_counts()

Suspicious_Level
1    1092
2      72
3       7
Name: count, dtype: int64

In [59]:
# Write the SVM predictions: one row per MessageId with its predicted Suspicious_Level
test_df[['MessageId', 'Suspicious_Level']].to_csv('svm_999.csv', index=False)

## LGBM

In [72]:
# Re-read both CSVs so this section starts from freshly loaded frames.
train_df, test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [73]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

# Raw 'Content' text is the feature; labels are shifted into [0, num_classes - 1]
X = train_df['Content']
y = train_df['Suspicious_Level'] - 1

# NOTE(review): no tokenization or text cleaning is applied in this section —
# the raw 'Content' column is vectorized directly.
X_accuracy = test_df['Content']

# Split the data into training and validation sets, seeded with RANDOM_SEED
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

# Convert text data to TF-IDF features (fitted on the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

X_accuracy_tfidf = tfidf_vectorizer.transform(X_accuracy)

# Define and train the LGBM model
lgbm_params = {
    'objective': 'multiclass',
    'num_class': 3,  # Number of classes
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
}

# NOTE: train_data is rebound here from the Naive Bayes DataFrame to an
# lgb.Dataset.
train_data = lgb.Dataset(X_train_tfidf, label=y_train)
val_data = lgb.Dataset(X_val_tfidf, label=y_val, reference=train_data)

num_round = 150  # You can adjust the number of boosting rounds

lgb_model = lgb.train(lgbm_params, train_data, num_round, valid_sets=[val_data])

# Make predictions on the validation set: one score per class for each row,
# then take the index of the largest score as the predicted class
y_pred = lgb_model.predict(X_val_tfidf, num_iteration=lgb_model.best_iteration)
y_pred_class = y_pred.argmax(axis=1).tolist()


# Evaluate the model
accuracy = accuracy_score(y_val, y_pred_class)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

f1_lgbm_score = f1_score(y_val, y_pred_class, average='macro')
print(f'Validation F1 Score: {f1_lgbm_score * 100:.2f}%')

# Display classification report
print('Classification Report:\n', classification_report(y_val, y_pred_class))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2778
[LightGBM] [Info] Number of data points in the train set: 469, number of used features: 182
[LightGBM] [Info] Start training from score -0.420503
[LightGBM] [Info] Start training from score -1.432104
[LightGBM] [Info] Start training from score -2.258782
Validation Accuracy: 76.27%
Validation F1 Score: 61.26%
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.96      0.87        75
           1       0.68      0.48      0.57        27
           2       0.56      0.31      0.40        16

    accuracy                           0.76       118
   macro avg       0.68      0.58      0.61       118
weighted avg       0.74      0.76      0.74       118



In [74]:

# Per-class scores for the test rows -> index of the best class -> add 1 to
# return to the original 1-based label scale
y_final = lgb_model.predict(X_accuracy_tfidf, num_iteration=lgb_model.best_iteration)
y_final_class = y_final.argmax(axis=1) + 1

In [75]:
# Store the LGBM predictions as the test frame's label column
test_df['Suspicious_Level'] = y_final_class

In [76]:
# How many test rows the LGBM model assigned to each level
test_df['Suspicious_Level'].value_counts()

Suspicious_Level
1    942
2    171
3     58
Name: count, dtype: int64

In [77]:
# NOTE(review): this repeats the previous cell's expression unchanged.
test_df['Suspicious_Level'].value_counts()

Suspicious_Level
1    942
2    171
3     58
Name: count, dtype: int64

In [78]:
# Write the LGBM predictions: one row per MessageId with its predicted Suspicious_Level
test_df[['MessageId', 'Suspicious_Level']].to_csv('lgbm_1000.csv', index=False)