# Preprcoessing

Here we used these steps for text cleaning:

- fixing unicodes
- removing specials like a phone number, email, url, new lines, ...
- cleaning HTMLs
- normalizing
- removing emojis

In [7]:
import re
import hazm
from cleantext import clean

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def cleaning(text):
    text = text.strip()
    
    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)
    
    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    
    # removing wierd patterns
    wierd_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)
    text = wierd_pattern.sub(r'', text)
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)
    return text

# Multi-label classification

In [48]:
import pandas as pd

df = pd.read_csv('/content/thresh5.csv')
df.head()

Unnamed: 0,text,Anger,Fear,Happiness,Hatred,Sadness,Wonder
0,کرونا رو شکست میدهیم؟\nمرحله بعد چه گوهی میخوا...,0,0,0,0,0,0
1,اگر در چند ماه اخیر تصمیم داشته اید وارد بورس ...,0,0,0,0,0,0
2,یکی از پدرسوختگی های #برانداز اینه که ظاهرا ژس...,1,0,0,0,1,0
3,یکی از دوستای دبستانم,0,0,0,0,0,0
4,@username اینقدر گرفتار مسایل میشی که تخصص از ...,0,0,0,0,0,0


In [49]:
df['text'] = df['text'].apply(cleaning)
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,text,Anger,Fear,Happiness,Hatred,Sadness,Wonder
0,چرا رو هرکی کراش دارم وقتی بش میرسم حالم ازش ب...,0,0,0,0,0,0
1,"تو آینه همه قشنگن, اگه راست میگی با دوربین سلف...",0,0,0,0,0,0
2,توییتر شده از لاک جیغ تا خدا من_همونم_که,0,0,0,0,0,0
3,بلد بود یه بار گفت تنکیو,0,0,0,0,0,0
4,@username دیشب یکی می‌گفت بارون رو کرونا اثری ...,0,0,0,0,0,0


In [50]:
from tqdm import tqdm

rowsLabels = []
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    rowLabels = [row["Anger"], row["Fear"], row["Happiness"], row["Hatred"], row["Sadness"], row["Wonder"]]
    rowsLabels.append(rowLabels)
df['labels'] = rowsLabels
df = df.drop(columns=["Anger", "Fear", "Happiness", "Hatred", "Sadness", "Wonder"])
df.head()

100%|██████████| 30000/30000 [00:01<00:00, 16919.01it/s]


Unnamed: 0,text,labels
0,چرا رو هرکی کراش دارم وقتی بش میرسم حالم ازش ب...,"[0, 0, 0, 0, 0, 0]"
1,"تو آینه همه قشنگن, اگه راست میگی با دوربین سلف...","[0, 0, 0, 0, 0, 0]"
2,توییتر شده از لاک جیغ تا خدا من_همونم_که,"[0, 0, 0, 0, 0, 0]"
3,بلد بود یه بار گفت تنکیو,"[0, 0, 0, 0, 0, 0]"
4,@username دیشب یکی می‌گفت بارون رو کرونا اثری ...,"[0, 0, 0, 0, 0, 0]"


In [51]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(f"Device is {device}")

Device is cuda


In [52]:
import numpy as np

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [53]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 1e-03
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'

In [54]:
from transformers import BertConfig, BertTokenizer

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

In [55]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)


class MultiLabelDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [56]:
train_size = 0.8
train_data=df.sample(frac=train_size,random_state=200)
test_data=df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (30000, 2)
TRAIN Dataset: (24000, 2)
TEST Dataset: (6000, 2)


In [57]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [58]:
from transformers import BertModel

class ParsBertModel(torch.nn.Module):
    def __init__(self):
        super(ParsBertModel, self).__init__()
        self.l1 = DistilBertModel.from_pretrained(MODEL_NAME_OR_PATH)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = ParsBertModel()
model.to(device)

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing DistilBertModel: ['bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.3.attention.output.dense.bias', 'bert.encoder.layer.5.output.LayerNorm.bias', 'bert.encoder.layer.9.attention.output.dense.bias', 'cls.seq_relationship.weight', 'bert.encoder.layer.6.attention.output.LayerNorm.weight', 'bert.encoder.layer.8.attention.self.query.weight', 'bert.encoder.layer.3.attention.self.key.bias', 'bert.encoder.layer.6.attention.self.value.bias', 'bert.encoder.layer.2.attention.output.dense.bias', 'bert.encoder.layer.11.intermediate.dense.bias', 'bert.encoder.layer.5.intermediate.dense.weight', 'bert.encoder.layer.10.output.dense.bia

ParsBertModel(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(100000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featur

In [59]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [60]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [61]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        last_loss = loss.item()
        
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss:  {last_loss}')

In [62]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
1500it [09:23,  2.66it/s]


Epoch: 0, Loss:  2.975951792905107e-05


1500it [09:22,  2.67it/s]


Epoch: 1, Loss:  0.00042286317329853773


1500it [09:22,  2.67it/s]


Epoch: 2, Loss:  2.0646481061703525e-05


1500it [09:21,  2.67it/s]


Epoch: 3, Loss:  7.27107108104974e-05


1500it [09:22,  2.67it/s]

Epoch: 4, Loss:  3.692904647323303e-05





In [63]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [64]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

375it [00:46,  8.06it/s]


In [65]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.9973333333333333
Hamming Loss = 0.00047222222222222224


In [66]:
anger_final_outputs = [i[0] for i in final_outputs]
fear_final_outputs = [i[1] for i in final_outputs]
happiness_final_outputs = [i[2] for i in final_outputs]
hatred_final_outputs = [i[3] for i in final_outputs]
sadness_final_outputs = [i[4] for i in final_outputs]
wonder_final_outputs = [i[5] for i in final_outputs]

anger_targets = [i[0] for i in targets]
fear_targets = [i[1] for i in targets]
happiness_targets = [i[2] for i in targets]
hatred_targets = [i[3] for i in targets]
sadness_targets = [i[4] for i in targets]
wonder_targets = [i[5] for i in targets]

In [67]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

label_finals = [anger_final_outputs, fear_final_outputs, happiness_final_outputs, hatred_final_outputs, sadness_final_outputs, wonder_final_outputs]
label_targets = [anger_targets, fear_targets, happiness_targets, hatred_targets, sadness_targets, wonder_targets]
label = ["Anger", "Fear", "Happiness", "Hatred", "Sadness", "Wonder"]

for i in range(len(label)):
  print(f"Accuracy for {label[i]}: {accuracy_score(label_targets[i], label_finals[i])}")
  print(f"Precision for {label[i]}: {precision_score(label_targets[i], label_finals[i])}")
  print(f"Recall for {label[i]}: {recall_score(label_targets[i], label_finals[i])}")
  print(f"F1 for {label[i]}: {f1_score(label_targets[i], label_finals[i])}")
  print("\n ------------------------------------------------------------- \n")

Accuracy for Anger: 0.9991666666666666
Precision for Anger: 0.0
Recall for Anger: 0.0
F1 for Anger: 0.0

 ------------------------------------------------------------- 

Accuracy for Fear: 1.0
Precision for Fear: 0.0
Recall for Fear: 0.0
F1 for Fear: 0.0

 ------------------------------------------------------------- 

Accuracy for Happiness: 0.9996666666666667
Precision for Happiness: 0.0
Recall for Happiness: 0.0
F1 for Happiness: 0.0

 ------------------------------------------------------------- 

Accuracy for Hatred: 0.9995
Precision for Hatred: 0.0
Recall for Hatred: 0.0
F1 for Hatred: 0.0

 ------------------------------------------------------------- 

Accuracy for Sadness: 0.9988333333333334
Precision for Sadness: 0.0
Recall for Sadness: 0.0
F1 for Sadness: 0.0

 ------------------------------------------------------------- 

Accuracy for Wonder: 1.0
Precision for Wonder: 0.0
Recall for Wonder: 0.0
F1 for Wonder: 0.0

 ----------------------------------------------------------