## Importing libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.metrics import classification_report
import bs4
from collections import defaultdict

# Directory prefix used to build the paths of the dataset JSON files loaded by this notebook.
WORKING_DIR = '.'

# Dataset_educ (EDA)

We will determine the dtype of each column and check whether there are any NaN values (if there are, the dataset was not created correctly).

In [2]:
# Load both datasets; orient='index' turns every top-level JSON key into a row label.
dataset_educ = pd.read_json(f'{WORKING_DIR}/Dataset_educ_1.1.json', orient='index')
dataset_div = pd.read_json(f'{WORKING_DIR}/Dataset_div2_final.json', orient='index')

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
dataset_educ.info()

<class 'pandas.core.frame.DataFrame'>
Index: 937 entries, https://codeforces.com/blog/entry/101161?#comment-898006 to https://codeforces.com/blog/entry/105164?#comment-953522
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              937 non-null    int64         
 1   father_id       937 non-null    int64         
 2   username        937 non-null    object        
 3   text            937 non-null    object        
 4   comment_rating  937 non-null    int64         
 5   timestamp       937 non-null    datetime64[ns]
 6   problem         937 non-null    object        
 7   label           937 non-null    object        
 8   round_id        937 non-null    object        
 9   round_name      937 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 80.5+ KB
None


In [3]:
# Matches one anchor element whose opening tag contains a ".../contest/.../submission/..." URL.
# The [^>]* runs keep the URL test inside the opening tag and the lazy .*? stops at the first
# </a>, so text and unrelated links that share a line with a submission link are no longer
# swallowed (the previous greedy ".*" ran from the first "<a href" to the last "</a>").
regex_link_ful = re.compile(r'<a href[^>]*/contest/[^>]*/submission/[^>]*>.*?</a>', re.DOTALL)

# Matches one <code>...</code> element, possibly spanning several lines.
# DOTALL + lazy ".*?" accepts the same text as the former "(\s|.)*?" but without the
# overlapping alternatives that backtrack exponentially when </code> is missing.
code_regex = re.compile(r'<code>.*?</code>', re.DOTALL)
def preprocess_for_transfomers(texts, problems):
  """Convert HTML comment bodies to plain text with placeholders for code and submission links.

  For every (text, problem) pair: each match of `code_regex` becomes ' (code) ', each match
  of `regex_link_ful` becomes ' (link to problem <problem>) ', and the remaining markup is
  stripped by BeautifulSoup.

  Args:
    texts: iterable of HTML strings.
    problems: iterable aligned with `texts`; each value is formatted into the link
      placeholder of the text at the same position.

  Returns:
    A list of plain-text strings, one per pair, in input order. `zip` stops at the
    shorter of the two inputs.
  """
  preprocessed_texts = []
  for t, p in zip(texts, problems):
    t_codes = code_regex.sub(' (code) ', t)
    # Use a callable replacement so the placeholder is inserted verbatim; a replacement
    # *string* would have any backslash in `p` interpreted as a regex escape / group reference.
    link_placeholder = f' (link to problem {p}) '
    t_link = regex_link_ful.sub(lambda _match: link_placeholder, t_codes)
    # Name the parser explicitly: without it bs4 picks whichever parser happens to be
    # installed (and warns), so the extracted text could differ between environments.
    bs = bs4.BeautifulSoup(t_link, 'html.parser')
    preprocessed_texts.append(bs.text)

  return preprocessed_texts

# For each dataset in turn: collapse every label other than 'Irrelevant' into 'Relevant'
# (leaving a two-class target) and store the cleaned comment text in a new column.
for frame in (dataset_educ, dataset_div):
  frame.loc[frame['label'] != 'Irrelevant', 'label'] = 'Relevant'
  # `preprocessed_text` remains a notebook-level name; after the loop it holds the
  # dataset_div result.
  preprocessed_text = preprocess_for_transfomers(frame['text'], frame['problem'])
  frame["preprocessed_text"] = preprocessed_text

In [4]:
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
import random
RANDOM_SEED = 443
# Prefer the GPU, but fall back to the CPU so the notebook also runs on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the torch, NumPy and stdlib generators so shuffles and sampling repeat across runs.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

def _entry_url(comment_url):
  # Index labels look like "<blog entry url>?#comment-<id>"; keep the part before the "?".
  return comment_url.split("?")[0]


def _rows_for(group_subset):
  # Collect the index labels of every group, in order, then select those rows from dataset_educ.
  labels = []
  for _, frame in group_subset:
    labels.extend(frame.index.tolist())
  return dataset_educ.loc[labels]


# Split by blog entry rather than by comment, so all comments of one entry share a split.
groups = list(dataset_educ.groupby(_entry_url))
random.shuffle(groups)

# First 10 shuffled groups -> train, next 3 -> validation, the remainder -> test.
train_groups, validation_groups, test_groups = groups[:10], groups[10:13], groups[13:]

train_educ_dataset = _rows_for(train_groups)
val_educ_dataset = _rows_for(validation_groups)
test_educ_dataset = _rows_for(test_groups)

from sklearn.preprocessing import LabelEncoder

# The label -> integer mapping is learned from the training split only and then reused
# unchanged for the validation split, the test split and the separate div dataset.
labelEncoder = LabelEncoder()
train_encoded_labels = labelEncoder.fit_transform(train_educ_dataset["label"])
val_encoded_labels, test_encoded_labels, div_encoded_labels = (
  labelEncoder.transform(frame["label"])
  for frame in (val_educ_dataset, test_educ_dataset, dataset_div)
)


In [5]:
# One line per split, in the order train, validation, test: the number of rows whose
# 'problem' value is anything other than 'Irrelevant'.
for split in (train_educ_dataset, val_educ_dataset, test_educ_dataset):
  with_problem = split[split['problem'] != 'Irrelevant']
  print(f"Number comments with problem:{len(with_problem)}")


Number comments with problem:356
Number comments with problem:122
Number comments with problem:96


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Fit the TF-IDF vectorizer on the training texts only; the other splits are transformed
# with this fitted vectorizer later on.
train_corpus = train_educ_dataset['preprocessed_text']
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(train_corpus)

In [7]:
import numpy as np
def _dense_tfidf(texts):
  # transform() returns a sparse matrix; toarray() materialises it as a plain 2-D ndarray.
  return vectorizer.transform(texts).toarray()


train_X = _dense_tfidf(train_educ_dataset['preprocessed_text'])
validate_X = _dense_tfidf(val_educ_dataset['preprocessed_text'])
test_X = _dense_tfidf(test_educ_dataset['preprocessed_text'])

div_X = _dense_tfidf(dataset_div['preprocessed_text'])

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the dense TF-IDF features of the training split.
rf = RandomForestClassifier(
  max_features=60,
  min_samples_leaf=4,
  class_weight='balanced_subsample',
  random_state=24,
)
rf.fit(train_X, train_encoded_labels)

# Report precision / recall / F1 on the validation split.
predicted_Y = rf.predict(validate_X)
val_report = classification_report(val_encoded_labels, predicted_Y)
print(val_report)

# Importance score of each vocabulary term, indexed by the term itself.
forest_importances = pd.Series(data=rf.feature_importances_, index=vectorizer.get_feature_names_out())

              precision    recall  f1-score   support

           0       0.67      0.81      0.73        74
           1       0.85      0.73      0.79       112

    accuracy                           0.76       186
   macro avg       0.76      0.77      0.76       186
weighted avg       0.78      0.76      0.77       186



In [9]:
# Score the fitted forest on the test split.
predicted_Y = rf.predict(test_X)
test_report = classification_report(y_true=test_encoded_labels, y_pred=predicted_Y)
print(test_report)

              precision    recall  f1-score   support

           0       0.70      0.82      0.76        74
           1       0.82      0.70      0.75        86

    accuracy                           0.76       160
   macro avg       0.76      0.76      0.76       160
weighted avg       0.77      0.76      0.76       160



In [10]:
# Score the same forest, unchanged, on the separate div dataset.
predicted_Y = rf.predict(div_X)
div_report = classification_report(y_true=div_encoded_labels, y_pred=predicted_Y)
print(div_report)

              precision    recall  f1-score   support

           0       0.88      0.73      0.80       393
           1       0.41      0.66      0.50       111

    accuracy                           0.71       504
   macro avg       0.65      0.69      0.65       504
weighted avg       0.78      0.71      0.73       504

