# Import packages

In [32]:
!pip install pytorch-lightning
!pip install transformers
# !pip install textattack

[0m

In [33]:
import random
import os
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from urllib import request
from dont_patronize_me import DontPatronizeMe
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import Dataset, WeightedRandomSampler, DataLoader
from transformers import AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup
# from textattack.augmentation import CheckListAugmenter, BackTranslationAugmenter, WordNetAugmenter, EasyDataAugmenter, CLAREAugmenter


In [34]:
# dont_patronize_me.py
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [35]:
# official scorer evalutaion.py
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py


In [36]:
random.seed(42)

## Define useful classes

In [37]:
def get_sample_weighs(data):
    # ratio_n = data['label'].value_counts().loc[positive_lavel]/len(data)
    # ratio_p = data['label'].value_counts().loc[negative_label]/len(data)

    ratio_p = data['label'].sum()/len(data)
    ratio_n = (1-data['label']).sum()/len(data)

    # calculate weight for weighted random sampling
    w_p = 1/np.sqrt(ratio_p)
    w_n = 1/np.sqrt(ratio_n)

    sample_weights = np.where(data['label'] == 1, w_p, w_n)
    return sample_weights

In [38]:
class dpm_Dataset(Dataset):

    def __init__(self, data, tokenizer, attributes, max_token_len: int = 128, sample=5000):
        self.data = data
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        sample_weights = get_sample_weighs(self.data)
        self.sampler = WeightedRandomSampler(
            weights=sample_weights, num_samples=len(self.data))
        # self.sampler = None

        self._prepare_data()

    def _prepare_data(self):

        communities = pd.get_dummies(self.data.community)
        self.community_labels = communities.columns
        self.data = self.data.join(communities)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        comment = str(item.text)
        attributes = torch.FloatTensor(item[self.attributes])
        communities = torch.FloatTensor(item[self.community_labels])
        tokens = self.tokenizer.encode_plus(comment,
                                            add_special_tokens=True,
                                            return_tensors='pt',
                                            truncation=True,
                                            padding='max_length',
                                            max_length=self.max_token_len,
                                            return_attention_mask=True)
        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': attributes, 'communities': communities}


In [39]:
class dpm_Data_Module(pl.LightningDataModule):

    def __init__(self, train_raw, val_raw, test_raw, attributes, batch_size: int = 16, max_token_length: int = 256,  model_name='roberta-large'):
        super().__init__()
        self.train_raw = train_raw
        self.val_raw = val_raw
        self.test_raw = test_raw
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_length = max_token_length
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage=None):
        if stage in (None, "fit"):
            self.train_dataset = dpm_Dataset(
                self.train_raw, attributes=self.attributes, tokenizer=self.tokenizer)
            self.val_dataset = dpm_Dataset(
                self.val_raw, attributes=self.attributes, tokenizer=self.tokenizer, sample=None)

        if stage == 'predict':
            self.test_dataset = dpm_Dataset(
                self.test_raw, attributes=self.attributes, tokenizer=self.tokenizer, sample=None)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=4, sampler=self.train_dataset.sampler)
        # return DataLoader(self.train_dataset, batch_size = self.batch_size, num_workers=4, sampler=None)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)


In [40]:
class dpm_Classifier(pl.LightningModule):

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(
            config['model_name'], return_dict=True)
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size +
                                config['n_communities'], self.pretrained_model.config.hidden_size+config['n_communities'])
        self.classifier = nn.Linear(
            self.pretrained_model.config.hidden_size+config['n_communities'], self.config['n_labels'])
        # self.softmax = nn.Softmax(dim=1)

        torch.nn.init.xavier_uniform_(self.classifier.weight)
        torch.nn.init.xavier_uniform_(self.hidden.weight)

        self.loss_func = nn.BCEWithLogitsLoss(reduction='mean')
        # self.loss_func = nn.CrossEntropyLoss()
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, communities, labels=None):
        # roberta layer
        output = self.pretrained_model(
            input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # final logits
        # print(pooled_output.shape)
        pooled_output = self.dropout(pooled_output)
        pooled_output = torch.cat((pooled_output, communities), 1)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = 0
        if labels is not None:
            # print(logits.view(-1, self.config['n_labels']).shape, labels.view(-1, self.config['n_labels']).shape)
            loss = self.loss_func(
                logits.view(-1, self.config['n_labels']), labels.view(-1, self.config['n_labels']))
            # loss = self.loss_func(logits,  labels)
            # print(loss)

        return loss, logits

    def training_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        # self.log("train_loss ", loss, prog_bar = True, logger=True)
        self.log("train_loss", loss, prog_bar=True,
                 logger=True,  on_step=True, on_epoch=False)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("validation_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        loss, outputs = self(**batch)

        return outputs

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'])
        
        total_steps = self.config['train_size']/self.config['batch_size']
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]


In [41]:
from torch.optim.lr_scheduler import LambdaLR

In [42]:
class dpm_llrd_Classifier(pl.LightningModule):

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(
            config['model_name'], return_dict=True)
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size +
                                config['n_communities'], self.pretrained_model.config.hidden_size+config['n_communities'])
        self.classifier = nn.Linear(
            self.pretrained_model.config.hidden_size+config['n_communities'], self.config['n_labels'])
        # self.softmax = nn.Softmax(dim=1)

        torch.nn.init.xavier_uniform_(self.classifier.weight)
        torch.nn.init.xavier_uniform_(self.hidden.weight)

        self.loss_func = nn.BCEWithLogitsLoss(reduction='mean')
        # self.loss_func = nn.CrossEntropyLoss()
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, communities, labels=None):
        # roberta layer
        output = self.pretrained_model(
            input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # final logits
        # print(pooled_output.shape)
        pooled_output = self.dropout(pooled_output)
        pooled_output = torch.cat((pooled_output, communities), 1)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = 0
        if labels is not None:
            # print(logits.view(-1, self.config['n_labels']).shape, labels.view(-1, self.config['n_labels']).shape)
            loss = self.loss_func(
                logits.view(-1, self.config['n_labels']), labels.view(-1, self.config['n_labels']))
            # loss = self.loss_func(logits,  labels)
            # print(loss)

        return loss, logits

    def training_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        # self.log("train_loss ", loss, prog_bar = True, logger=True)
        self.log("train_loss", loss, prog_bar=True,
                 logger=True,  on_step=True, on_epoch=False)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("validation_loss", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        loss, outputs = self(**batch)

        return outputs

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'])
        
        total_steps = self.config['train_size']/self.config['batch_size']
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        
        def llrd_lambda(current_step):
            num_warmup_steps = warmup_steps
            layerwise_lr_factor = 0.96 # Your layerwise learning rate decay factor
            num_layers = len(self.pretrained_model.encoder.layer) + 2 # Number of layers in your model
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            else:
                layer_lr_decay = layerwise_lr_factor ** (current_step - num_warmup_steps)
                layer_lr_decay = min(max(layer_lr_decay, 5e-6), 1e-4) # Your minimum learning rate
                return layer_lr_decay
        
        llrd_scheduler = LambdaLR(optimizer, lr_lambda =llrd_lambda)
        
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, total_steps)
        
        return [optimizer], [llrd_scheduler, scheduler]
    
    

## Helper functions

In [43]:
def textattack_aug(data, augmenter, positive_only=True, ignore_original=True):
    # augmenter = CheckListAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
    if positive_only:
        samples = data[data['label'] == 1]
    else:
        samples = data
    # pos_sample = data
    aug_df = pd.DataFrame([]) if ignore_original else data.copy()

    for row in samples.itertuples():
        text = row.text
        # EDA pipeline
        augmented_texts = augmenter.augment(text)
        for new_sentence in augmented_texts:
            aug_df = pd.concat([aug_df, pd.DataFrame(
                {'par_id': row.par_id, 'community': row.community, 'text': new_sentence, 'label': row.label}, index=[len(aug_df)])])

    return aug_df


# Load data of task 1

In [44]:
dpm = DontPatronizeMe('.', '.')
dpm.load_task1()


In [45]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
    with open(outf_path, 'w') as outf:
        for pi in p:
            outf.write(','.join([str(k) for k in pi])+'\n')


trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)


In [46]:
def rebuild_data(id, data, shuffle=False):
    rows = []  # will contain par_id, label and text
    for idx in range(len(id)):
        parid = id.par_id[idx]
        # print(parid)
        # select row from original dataset to retrieve `text` and binary label
        keyword = data.loc[data.par_id == parid].keyword.values[0]
        text = data.loc[data.par_id == parid].text.values[0]
        label = data.loc[data.par_id == parid].label.values[0]
        rows.append({
            'par_id': parid,
            'community': keyword,
            'text': text,
            'label': label
        })

    rebuilt_data = pd.DataFrame(rows)
    # shuffle
    if shuffle:
        rebuilt_data.sample(frac=1)

    return rebuilt_data


In [47]:
dev_data = rebuild_data(teids, dpm.train_task1_df)
total_train_data = rebuild_data(trids, dpm.train_task1_df)

In [48]:
# augmenter = CheckListAugmenter(
#     pct_words_to_swap=0.2, transformations_per_example=4)
# augmented_train_data = textattack_aug(total_train_data, augmenter)
# augmented_train_data.drop_duplicates(subset=['text'], inplace=True)
# augmented_train_data.reset_index(drop=True, inplace=True)


In [49]:
# augmented_train_data.to_csv('checklist_4.csv', index=False)

In [50]:
augmented_train_data = pd.read_csv('checklist_4.csv')

In [51]:
final_train_data = pd.concat([total_train_data, augmented_train_data])

In [None]:
final_test_data = pd.read_table('task4_test.tsv',names=['par_id','community','country','text'],index_col=0)
final_test_data['label'] = 0

In [52]:
# try:
#     augmented_train_data = pd.read_csv('augmented_train_data.csv')
# except Exception as e:
#     augmenter = CheckListAugmenter(
#         pct_words_to_swap=0.2, transformations_per_example=2)
#     augmented_train_data = textattack_aug(total_train_data, augmenter)
#     augmented_train_data.drop_duplicates(subset=['text'], inplace=True)
#     augmented_train_data.reset_index(drop=True, inplace=True)

# # split train data into train and validation
# train_data, val_data = train_test_split(augmented_train_data, test_size=0.2, random_state=42)  # Shuffle is True by default

# train_data, val_data = train_test_split(final_train_data, test_size=0.2, random_state=42)  # Shuffle is True by default
# train_data.reset_index(drop=True, inplace=True)
# val_data.reset_index(drop=True, inplace=True)


In [53]:
# augmented_train_data = pd.read_csv('augmented_train_data.csv')
# # BackTranslationAugmenter
# augmenter = BackTranslationAugmenter(
#     pct_words_to_swap=0.2, transformations_per_example=2)
# aug_pos_df = textattack_aug(augmented_train_data, augmenter)
# aug_pos_df.to_csv('bt_augmented.csv', index=False)

In [54]:
# augmented_train_data.to_csv('augmented_train_data.csv', index=False)

In [55]:
# # CLAREAugmenter
# augmenter = CLAREAugmenter(pct_words_to_swap=0.2,
#                            transformations_per_example=5)
# aug_pos_df = textattack_aug(total_train_data, augmenter)
# aug_pos_df.to_csv('clare_augmented.csv', index=False)

# # EasyDataAugmenter
# augmenter = EasyDataAugmenter(
#     pct_words_to_swap=0.2, transformations_per_example=5)
# aug_pos_df = textattack_aug(total_train_data, augmenter)
# aug_pos_df.to_csv('eda_augmented.csv', index=False)

# # BackTranslationAugmenter
# augmenter = BackTranslationAugmenter(
#     pct_words_to_swap=0.2, transformations_per_example=5)
# aug_pos_df = textattack_aug(total_train_data, augmenter)
# aug_pos_df.to_csv('bt_augmented.csv', index=False)

# Load Transformer model

In [56]:
# Hyperparameters
attributes = ['label']
batch_size = 16

dpm_data_module = dpm_Data_Module(
    final_train_data, dev_data, final_test_data, attributes=attributes, batch_size=batch_size)
dpm_data_module.setup()
config = {
    # 'model_name': 'microsoft/deberta-v3-large',
    'model_name': 'roberta-large',
    'n_labels': len(attributes),
    'batch_size': batch_size,
    'lr': 1e-5,
    'warmup': 0.2,
    'train_size': len(dpm_data_module.train_dataloader()),
    'weight_decay': 0.001,
    'n_epochs': 10,
    'n_communities': len(final_train_data.community.unique())
}


# Train model

In [57]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [58]:
%reload_ext tensorboard
%tensorboard --logdir ./tb_logs --bind_all

Reusing TensorBoard on port 6006 (pid 1692), started 0:52:14 ago. (Use '!kill 1692' to kill it.)

In [59]:
# model = dpm_Classifier(config)
model = dpm_llrd_Classifier(config)
logger = pl.loggers.TensorBoardLogger("tb_logs", name="my_model")
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=1,
    verbose=False,
    mode='min'
)

checkpoint_callback = ModelCheckpoint(
    monitor='validation_loss',
    dirpath='./checkpoints/',
    filename='sample-{epoch:02d}',
    save_last=True
)

trainer = pl.Trainer(max_epochs=config['n_epochs'],
                     logger=logger,
                     accelerator='gpu',
                     devices=1,
                     num_sanity_val_steps=50,
                     callbacks=[early_stop_callback])
trainer.fit(model, dpm_data_module)


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A4000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_preci

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [60]:
print(len(model.pretrained_model.encoder.layer))

24


# Test

In [68]:
def classify_raw_comments(model, dm):
    predictions = trainer.predict(model, datamodule=dm)
    flattened_predictions = np.stack(
        [torch.sigmoid(torch.Tensor(p)) for batch in predictions for p in batch])
    return flattened_predictions


In [None]:
# predictions on dev data
dev_data_module = dpm_Data_Module(
    [], [], dev_data, attributes=['label'], batch_size=16)
dev_predictions = classify_raw_comments(model, dev_data_module)
true_labels = np.array(dev_data[attributes])

print(true_labels.shape)
for i, attribute in enumerate(attributes):
    print(classification_report(true_labels[:,i].astype(int), dev_predictions[:,i]>0.5))

dev_results = np.where(dev_predictions > 0.5, 1, 0)
    

In [69]:
# true_labels = np.array(dev_data[attributes])
# predictions = classify_raw_comments(model, dpm_data_module)
# print(true_labels.shape)

You are using a CUDA device ('NVIDIA RTX A4000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 537it [00:00, ?it/s]

(2094, 1)


In [70]:
# for i, attribute in enumerate(attributes):
#     print(attribute)
#     print(classification_report(true_labels[:,i].astype(int), predictions[:,i]>0.5))

label
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1895
           1       0.52      0.67      0.59       199

    accuracy                           0.91      2094
   macro avg       0.74      0.80      0.77      2094
weighted avg       0.92      0.91      0.92      2094



In [71]:
# final_test_data = pd.read_table('task4_test.tsv',names=['par_id','community','country','text'],index_col=0)
# final_test_data


Unnamed: 0,par_id,community,country,text
t_0,@@7258997,vulnerable,us,"In the meantime , conservatives are working to..."
t_1,@@16397324,women,pk,In most poor households with no education chil...
t_2,@@16257812,migrant,ca,The real question is not whether immigration i...
t_3,@@3509652,migrant,gb,"In total , the country 's immigrant population..."
t_4,@@477506,vulnerable,ca,"Members of the church , which is part of Ken C..."
...,...,...,...,...
t_3893,@@20319448,migrant,jm,In a letter dated Thursday to European Commiss...
t_3894,@@9990672,poor-families,au,They discovered that poor families with health...
t_3895,@@37984,migrant,ca,"She married at 19 , to Milan ( Emil ) Badovina..."
t_3896,@@9691377,immigrant,us,The United Kingdom is n't going to devolve int...


In [65]:
# predictions on test data
final_test_data = pd.read_table('task4_test.tsv',names=['par_id','community','country','text'],index_col=0)

# final_test_data = pd.read_csv('task4_test.tsv')
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
final_test_data['label'] = 0
final_test_dataset = dpm_Dataset(
    final_test_data, attributes=None, tokenizer=tokenizer)
final_test_data_module = dpm_Data_Module([], [], final_test_data, attributes=['label'], batch_size=16)
test_predictions = classify_raw_comments(model, final_test_data_module)
test_results = np.where(test_predictions > 0.5, 1, 0)

  w_p = 1/np.sqrt(ratio_p)
You are using a CUDA device ('NVIDIA RTX A4000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  w_p = 1/np.sqrt(ratio_p)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 537it [00:00, ?it/s]

In [None]:
np.savetxt('dev.txt', dev_results, fmt='%d')
np.savetxt('test.txt', test_results, fmt='%d')

# Simple Baseline
### bow + naive bayes

In [37]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
df = final_train_data
corpus = df['text'].tolist()
binary_labels = df['label'].tolist()

In [39]:
# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the corpus and transform the corpus into a BOW representation
bow = vectorizer.fit_transform(corpus)

In [40]:


# Create a Naive Bayes classifier and fit it to the training data
classifier_nb = MultinomialNB()
classifier_nb.fit(bow, binary_labels)


In [41]:
X_test = list(test_data.text)
X_test_bow = vectorizer.transform(X_test)
y_test = list(test_data.label)

# Predict labels for the test data and calculate accuracy
y_pred_nb = classifier_nb.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred_nb)

In [42]:
from sklearn.metrics import classification_report
attibutes = ['labels']
report = classification_report(y_test, y_pred_nb)

In [43]:
print(report)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1895
           1       0.36      0.36      0.36       199

    accuracy                           0.88      2094
   macro avg       0.65      0.65      0.65      2094
weighted avg       0.88      0.88      0.88      2094



### bow+logistic 

In [44]:
from sklearn.linear_model import LogisticRegression
classifier_lr = LogisticRegression(max_iter =1000)
classifier_lr.fit(bow, binary_labels)

In [45]:
y_pred_lr = classifier_lr.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred_lr)

In [46]:
report = classification_report(y_test, y_pred_lr)

In [47]:
print(report)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1895
           1       0.35      0.28      0.31       199

    accuracy                           0.88      2094
   macro avg       0.64      0.61      0.62      2094
weighted avg       0.87      0.88      0.88      2094



### extract incorrect prediction

In [85]:
test_results = np.where(predictions > 0.5, 1, 0)
y_pred_plm = list(test_results.squeeze())

In [86]:

incorrect_indices_plm = [i for i in range(len(y_test)) if (y_test[i] != y_pred_plm[i] and y_test[i] == 1)]

In [87]:
incorrect_indices_nb = [i for i in range(len(y_test)) if (y_test[i] != y_pred_nb[i] and y_test[i] == 1)]

In [88]:
incorrect_indices_lr = [i for i in range(len(y_test)) if (y_test[i] != y_pred_lr[i] and y_test[i] == 1)]

In [89]:
len(incorrect_indices_plm)

58

In [90]:
len(incorrect_indices_nb)

128

In [91]:
len(incorrect_indices_lr)

144

In [1]:
# # for length question 
# incorrect_data = df.iloc[incorrect_indices_plm]
# word_counts = incorrect_data['text'].str.split().apply(len)
# print(word_counts)

In [2]:

# result = []

# for index in incorrect_indices_nb:
#     if index in incorrect_indices_lr and index not in incorrect_indices_plm and index not in result:
#         result.append(index)
    


In [3]:

# import pandas as pd
# import seaborn as sns
# sns.histplot(word_counts, kde=False, color='blue', bins=10)

In [4]:
# print(result)

In [5]:
# test_data.loc[25].text

In [6]:
# bins = pd.cut(word_counts, bins=range(0, max(word_counts)+30, 30))

# # Get the table of word counts for each bin
# word_count_table = word_counts.groupby(bins).count()

# print(word_count_table)

In [7]:
# test_data
# sub_df = test_data[test_data['label'] == 1]
# print(len(sub_df))
# word_counts = sub_df['text'].str.split().apply(len)
# bins = pd.cut(word_counts, bins=range(0, max(word_counts)+50, 30))

# # # Get the table of word counts for each bin
# word_count_table = word_counts.groupby(bins).count()

# print(word_count_table)