In [None]:
import os
import random
import re
import string
import time
import unicodedata
from typing import Any

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Seed shared by the global RNGs and by every train/validation split below.
RANDOM_SEED = 42
# Directory that holds train.csv, test.csv and sample_submission.csv.
ROOT_DIR = '/kaggle/input/disinformation-detection/'

In [None]:
def seed_all(seed_value):
    """Seed the global ``random`` and NumPy generators with ``seed_value``.

    Args:
        seed_value: Seed passed unchanged to ``random.seed`` and
            ``np.random.seed``.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed_value)
# Seed once up front: the random baseline below draws from NumPy's global RNG.
seed_all(RANDOM_SEED)

### Let's look at the data

In [None]:
# Load the training messages and the test messages from ROOT_DIR.
train_df, test_df = (
    pd.read_csv(os.path.join(ROOT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Non-null cell count of every other column, per Suspicious_Level class.
train_df.groupby('Suspicious_Level').count()

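# The simplest possible solution :)
(Simplest, that is, after classifying everything as class 1.) We predict a random class for each message, sampled according to the class frequencies in the training data.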

In [None]:
# Computed from the data rather than typed in by hand, so the value stays
# correct if train.csv changes.
(train_df['Suspicious_Level'] == 1).mean()  # Frequentist probability of first class

In [None]:
# Computed from the data rather than typed in by hand, so the value stays
# correct if train.csv changes.
(train_df['Suspicious_Level'] == 2).mean()  # Frequentist probability of second class

In [None]:
# Computed from the data rather than typed in by hand, so the value stays
# correct if train.csv changes.
(train_df['Suspicious_Level'] == 3).mean()  # Frequentist probability of third class

In [None]:
# Stratified 80/20 hold-out of the raw texts and their labels.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['Content'],
    train_df['Suspicious_Level'],
    test_size=0.2,
    stratify=train_df['Suspicious_Level'],
    random_state=RANDOM_SEED,
)


Assign a probability to each class

In [None]:
# Class labels and their rounded shares of the 587 training rows
# (383/587, 139/587 and 65/587), used as sampling probabilities.
numbers = [1, 2, 3]
probabilities = [0.6525, 0.2368, 0.1107]

In [None]:
# One vectorized draw for all validation rows instead of a Python-level loop
# of single-sample np.random.choice calls.
X_val_preds = np.random.choice(numbers, size=len(X_val), p=probabilities)

In [None]:
# Per-class F1 of the random baseline on the validation split.
f1_score(y_true=y_val, y_pred=X_val_preds, labels=[1, 2, 3], average=None)

In [None]:
# Macro-averaged F1 of the random baseline on the validation split.
f1_score(y_true=y_val, y_pred=X_val_preds, labels=[1, 2, 3], average='macro')

### Create submission

In [None]:
# Start from the sample submission file shipped with the data.
sample_submission_path = os.path.join(ROOT_DIR, 'sample_submission.csv')
submission_df = pd.read_csv(sample_submission_path)

In [None]:
# Peek at the submission template before filling in the predictions.
# (A column-wise DataFrame.apply with the sampler would yield one value per
# column rather than one per row, so it is not used here.)
submission_df.head()

In [None]:
# Sample one class per submission row in a single vectorized draw.
submission_df['Suspicious_Level'] = np.random.choice(
    numbers, size=len(submission_df), p=probabilities
)

In [None]:
# Guarded, non-inplace re-index: re-running this cell after MessageId has
# already become the index must not raise a KeyError.
if 'MessageId' in submission_df.columns:
    submission_df = submission_df.set_index('MessageId')
submission_df.head()

In [None]:
# MessageId is the index at this point, so it is written out as the first column.
submission_df.to_csv('simplest-solution.csv')

## The simple solution [TF-IDF + LogReg]

### Really basic data preprocessing

In [None]:
# Download NLTK's stop-word corpus and keep the Russian list as a set,
# which preprocess_text uses for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))


def preprocess_text(text):
    """Normalize one raw message for bag-of-words vectorization.

    The text is lowercased, stripped of punctuation, split on whitespace,
    filtered against the module-level ``stop_words`` set and re-joined with
    single spaces.

    Punctuation covers every character of ``string.punctuation`` plus every
    character whose Unicode general category starts with ``P``. The latter
    catches marks that are common in Russian text but are not ASCII, such as
    « », — and …; without it a stop word glued to such a mark would slip past
    the stop-word filter.

    Args:
        text: Raw message. Any non-string value (for example the NaN pandas
            yields for an empty cell) is treated as an empty message.

    Returns:
        The cleaned text; an empty string when nothing is left.
    """
    # Missing cells arrive as NaN (a float), which has no .lower().
    if not isinstance(text, str):
        return ''

    # Lowercasing
    text = text.lower()

    # Removing punctuation (ASCII set first, then any other Unicode punctuation)
    text = ''.join(
        char for char in text
        if char not in string.punctuation
        and not unicodedata.category(char).startswith('P')
    )

    # Removing stop words
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Clean every training message with preprocess_text.
X_prep = list(map(preprocess_text, train_df['Content'].tolist()))

### TF-IDF Vectorization and Data Split

In [None]:
# NOTE(review): the vectorizer is fitted on every training row before the
# train/validation split, so its vocabulary and IDF weights also see the
# validation texts.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,  # You can adjust max_features as needed
)
X = tfidf_vectorizer.fit_transform(X_prep)

In [None]:
# Stratified 80/20 train/validation split of the TF-IDF matrix
X_train, X_val, y_train, y_val = train_test_split(
    X,
    train_df['Suspicious_Level'].tolist(),
    test_size=0.2,
    stratify=train_df['Suspicious_Level'],
    random_state=RANDOM_SEED,
)

### Train Logistic Regression model

In [None]:
# Unweighted logistic regression on the TF-IDF training split.
classifier = LogisticRegression(max_iter=1000)  # You can adjust hyperparameters as needed
classifier.fit(X_train, y_train)

In [None]:
# Predict on the validation set with the unweighted model
y_pred = classifier.predict(X_val)

In [None]:
# Evaluate the model: per-class F1 first, then the macro average
for averaging in (None, 'macro'):
    print(f1_score(y_val, y_pred, average=averaging, labels=[1, 2, 3]))

As we can see, the results are poor. Let's add class weights to compensate for the imbalanced data.

In [None]:
# Same model, but each class is weighted by (1 - its share of the training
# data), so the rarer classes get the larger weights.
classifier_weighted = LogisticRegression(
    max_iter=1000,
    # btw, there is a leakage here, do you know why?
    # (tip: we'll evaluate our train results on the validation set)
    class_weight={1: 1-0.6525, 2: 1-0.2368, 3: 1-0.1107}
)
classifier_weighted.fit(X_train, y_train)

In [None]:
# Predict on the validation set with the class-weighted model
y_pred = classifier_weighted.predict(X_val)

In [None]:
# Evaluate the weighted model: per-class F1 first, then the macro average
for averaging in (None, 'macro'):
    print(f1_score(y_val, y_pred, average=averaging, labels=[1, 2, 3]))

Now it looks better than the model without weighting. I'll leave playing with the hyperparameter search to you.

### Create submission

In [None]:
# Reload a fresh copy of the sample submission file for the TF-IDF model.
sample_submission_path = os.path.join(ROOT_DIR, 'sample_submission.csv')
submission_df = pd.read_csv(sample_submission_path)

In [None]:
# Refit the class-weighted model on the full training matrix X (no hold-out)
# before predicting the test set.
classifier_all_train_data = LogisticRegression(
    max_iter=1000,
    class_weight={1: 1-0.6525, 2: 1-0.2368, 3: 1-0.1107}
)
classifier_all_train_data.fit(X, train_df['Suspicious_Level'].tolist())

In [None]:
# Clean the test messages, then vectorize them with the already fitted
# vectorizer (transform only, no refit).
test_texts = [preprocess_text(text) for text in test_df['Content'].tolist()]
X_test = tfidf_vectorizer.transform(test_texts)

In [None]:
# Predict a class for every test message with the model fitted on all training data.
y_pred = classifier_all_train_data.predict(X_test)

In [None]:
# NOTE(review): predictions are assigned by position, which assumes
# sample_submission.csv lists messages in the same order as test.csv — confirm.
submission_df['Suspicious_Level'] = y_pred
# Guarded, non-inplace re-index: re-running this cell after MessageId has
# already become the index must not raise a KeyError.
if 'MessageId' in submission_df.columns:
    submission_df = submission_df.set_index('MessageId')

In [None]:
# MessageId is the index at this point, so it is written out as the first column.
submission_df.to_csv('tfidf-reg-solution.csv')