# Import Pre-processed Data

In [None]:
import pandas as pd
from tqdm.auto import tqdm
import random

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True re-mounts even if
# a mount already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Project directory on the mounted drive. The trailing slash matters: later
# cells build paths by direct string concatenation (e.g. f'{ROOT}vectors/...').
ROOT = '/content/drive/MyDrive/toxicLanguageDetection/'

In [None]:
# ROOT already ends with '/', so append the relative path directly instead of
# inserting a second separator (same convention as the GloVe vector path).
df = pd.read_csv(f'{ROOT}data/processed_binary_dataset.csv')
# Caps passed to balance_dataset: maximum number of toxic rows and maximum
# number of non-toxic rows kept for training.
MAX_TOXIC = 16225
MAX_DF = 50000

## Balance Dataset

In [None]:
def balance_dataset(df, MAX_TOXIC=None, MAX_DF=None):
    """Sample a capped, shuffled set of comments and binary labels from ``df``.

    Rows are scanned in their stored order. The first ``MAX_TOXIC`` rows whose
    ``toxic`` column equals 1 and the first ``MAX_DF`` rows whose ``toxic``
    column equals 0 are kept; rows with any other label value are ignored.

    Args:
        df: DataFrame with a ``comment`` column and a ``toxic`` column.
        MAX_TOXIC: Maximum number of toxic rows to keep. ``None`` means no cap
            (an explicit 0 now keeps no toxic rows).
        MAX_DF: Maximum number of non-toxic rows to keep. ``None`` means no
            cap (an explicit 0 now keeps no non-toxic rows).

    Returns:
        ``(texts, labels)``: two parallel lists, shuffled together with
        ``random.shuffle``. Labels are the ints 0 and 1.
    """
    if MAX_TOXIC is None:
        MAX_TOXIC = len(df)
    if MAX_DF is None:
        MAX_DF = len(df)

    toxic = []
    non_toxic = []

    # Iterate the two columns positionally: this does not depend on the
    # DataFrame's index labels (``df.loc[idx]`` fails on filtered or re-indexed
    # frames) and avoids building a Series per row.
    rows = zip(df['comment'], df['toxic'])

    for comment, label in tqdm(rows, total=len(df)):

        if label == 0:
            if len(non_toxic) < MAX_DF:
                non_toxic.append((comment, 0))

        elif label == 1:
            if len(toxic) < MAX_TOXIC:
                toxic.append((comment, 1))

        # Nothing more can be added once both caps are reached.
        if len(toxic) >= MAX_TOXIC and len(non_toxic) >= MAX_DF:
            break

    dataset = toxic + non_toxic
    random.shuffle(dataset)

    return [text for text, _ in dataset], [label for _, label in dataset]

In [None]:
# Build the capped, shuffled training texts and their 0/1 labels.
train_x, train_y = balance_dataset(df, MAX_TOXIC=MAX_TOXIC, MAX_DF=MAX_DF)

  0%|          | 0/159571 [00:00<?, ?it/s]

In [None]:
# Report how many sampled labels fall in each class.
print(f"Num Toxic: {train_y.count(1)}\nNum Non-toxic: {train_y.count(0)}")

Num Toxic: 16225
Num Non-toxic: 50000


## Build Vocab

In [None]:
import numpy as np

## Pretrained GloVe Vectors

In [None]:
def open_vectors(glove_path):
    """Read the vector file at ``glove_path`` and return its lines as bytes.

    The file is opened in binary mode, so every entry of the returned list is
    an undecoded ``bytes`` object with its line terminator removed.
    """
    with open(glove_path, 'rb') as handle:
        raw_bytes = handle.read()

    return raw_bytes.splitlines()

def process_vectors(raw_vectors):
    """Parse raw vector lines into a ``{word: numpy vector}`` dictionary.

    Each element of ``raw_vectors`` is a bytes line; after decoding it is
    split on single spaces, the first field is used as the word and the
    remaining fields are converted to a float numpy array.
    """
    embeddings = {}

    for raw_line in tqdm(raw_vectors, total=len(raw_vectors)):
        word, *components = raw_line.decode().split(' ')
        embeddings[word] = np.array(components, dtype=float)

    return embeddings

In [None]:
# Path of the pretrained vector file under ROOT.
vector_path = f'{ROOT}vectors/glove.6B.300d.txt'
# Undecoded lines of that file (one bytes object per line).
raw_vectors = open_vectors(vector_path)
# Vector width handed to DataEncoder and reused as the model's embedding_dim;
# presumably matches the "300d" file above - confirm if the file changes.
pretrained_dim = 300


In [None]:
# Parse the raw lines into a {word: numpy vector} dictionary.
vector_vocab = process_vectors(raw_vectors)

  0%|          | 0/400000 [00:00<?, ?it/s]

In [None]:
 def buildVocab(training_data, tokenize=None, rnn=None, pretrained=None):
    """Build a token -> index vocabulary and token frequencies.

    Index 0 is reserved for ``<pad>`` and indices 1-3 for ``<sos>``,
    ``<eos>`` and ``<oov>``; ordinary tokens are numbered from 4 upwards in
    order of first appearance, so no token shares an index with a special
    token.

    Args:
        training_data: Mutable sequence of lines. Each line is a string
            (split on single spaces when ``tokenize`` is truthy) or an
            already-tokenised list.
        tokenize: When truthy, split every line on ``' '`` before counting.
        rnn: When truthy, every entry of ``training_data`` is replaced in
            place by ``'<sos> ... <eos>'`` and the sequence is returned as a
            third value.
        pretrained: Unused; kept for interface compatibility.

    Returns:
        ``(vocab, word_counts)``, or ``(vocab, word_counts, training_data)``
        when ``rnn`` is truthy.
    """
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<oov>': 3}
    word_counts = {}

    for idx, line in enumerate(training_data):

        tokens = line.split(' ') if tokenize else line

        for token in tokens:

            if token not in vocab:
                vocab[token] = len(vocab)

            # .get() keeps counting safe for special tokens, which are in
            # vocab from the start but have no count yet.
            word_counts[token] = word_counts.get(token, 0) + 1

        if rnn:
            # Build a new list rather than inserting into the caller's tokens.
            training_data[idx] = ' '.join(['<sos>', *tokens, '<eos>'])

    if rnn:
        return vocab, word_counts, training_data

    return vocab, word_counts

In [None]:
#vocab, word_counts, training_x = buildVocab(train_x, tokenize=True, rnn=True)

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

In [None]:
class DataEncoder:
    """Encode tokenised text as index tensors (``'rnn'``) or count vectors (``'ffnn'``).

    The vocabulary comes either from ``pretrained`` (a ``{word: vector}``
    mapping, to which the four special tokens are appended with random
    vectors) or from an explicit ``vocab`` mapping of token -> index.

    Attributes set by ``__init__``:
        vocab: token -> index mapping used for encoding.
        idx2wrd: index -> token mapping used by ``decode``.
        pretrained: private copy of the pretrained mapping, or ``None``.
        vectors: float32 tensor of the pretrained vectors in vocab order, or
            ``None`` when no pretrained vectors were given.
    """

    # Order matters: '<pad>' is added last, so it receives the highest index.
    SPECIAL_TOKENS = ('<sos>', '<eos>', '<oov>', '<pad>')

    def __init__(self, data, modelFormat, vocab=None, threshold=None,
                 max_num=None, min_num=None, word_counts=None,
                 pretrained=None, pretrained_dim=None):
        """Store the data and build the vocabulary.

        Args:
            data: Sequence of lines (space-separated strings or token lists).
            modelFormat: ``'rnn'`` or ``'ffnn'``; selects the encoder used by
                ``encode``.
            vocab: token -> index mapping; used when ``pretrained`` is falsy.
            threshold: When truthy, in-vocabulary tokens are kept only if
                ``min_num < word_counts[token] < max_num``.
            max_num: Exclusive upper count bound; defaults to ``len(data)``.
            min_num: Exclusive lower count bound; defaults to 0.
            word_counts: token -> frequency mapping used with ``threshold``.
            pretrained: ``{word: vector}`` mapping of pretrained embeddings.
            pretrained_dim: Width of the pretrained vectors; inferred from
                the first vector when omitted.

        Raises:
            ValueError: If neither ``pretrained`` nor ``vocab`` is provided.
        """
        self.data = data
        self.modelFormat = modelFormat
        self.word_counts = word_counts
        self.threshold = threshold
        self.pretrained_dim = pretrained_dim

        # Always defined, so later checks never hit AttributeError when the
        # encoder is built from a plain vocab.
        self.pretrained = None
        self.vectors = None

        if pretrained:
            # Work on a copy so the caller's dictionary is not modified.
            self.pretrained = dict(pretrained)

            if self.pretrained_dim is None:
                self.pretrained_dim = len(next(iter(self.pretrained.values())))

            for token in self.SPECIAL_TOKENS:
                self.pretrained[token] = np.random.rand(self.pretrained_dim)

            self.vocab = {word: idx for idx, word in enumerate(self.pretrained)}
            # Build one float32 array first; constructing a tensor directly
            # from a list of numpy arrays is very slow.
            self.vectors = torch.from_numpy(
                np.array(list(self.pretrained.values()), dtype=np.float32))

        elif vocab is not None:
            self.vocab = vocab

        else:
            raise ValueError("DataEncoder needs either 'pretrained' vectors or a 'vocab'")

        self.idx2wrd = {idx: wrd for wrd, idx in self.vocab.items()}

        self.max_num = max_num if max_num else len(self.data)
        self.min_num = min_num if min_num else 0

    @staticmethod
    def _tokens(line):
        """Return the tokens of ``line`` as a new list (strings are split on ' ')."""
        if isinstance(line, str):
            return line.split(' ')
        return list(line)

    def _keep(self, token):
        """Return True when an in-vocabulary ``token`` passes the frequency filter."""
        if not self.threshold or token in self.SPECIAL_TOKENS:
            return True
        counts = self.word_counts or {}
        return self.min_num < counts.get(token, 0) < self.max_num

    def encode(self, test=None, max_len=None, add_start_end=None):
        """Encode with the encoder selected by ``modelFormat``.

        Args:
            test: Optional data to encode instead of ``self.data`` (RNN only).
            max_len: Optional truncation length per line (RNN only).
            add_start_end: When truthy, wrap each line in ``<sos>``/``<eos>``
                (RNN only).

        Raises:
            ValueError: If ``modelFormat`` is neither ``'ffnn'`` nor ``'rnn'``.
        """
        if self.modelFormat == 'ffnn':
            return self.encode_data_fnn()
        if self.modelFormat == 'rnn':
            return self.encode_data_rnn(test, max_len, add_start_end)
        raise ValueError(
            f"Unsupported modelFormat {self.modelFormat!r}; expected 'ffnn' or 'rnn'")

    def encode_data_fnn(self):
        """Return a ``(len(data), vocab_width)`` float array of token counts.

        Unknown tokens are counted under ``<oov>`` when the vocab has that
        entry and skipped otherwise. Each kept token is counted exactly once.
        """
        # Wide enough for every index even if the vocab does not start at 0.
        width = max(len(self.vocab), max(self.vocab.values(), default=-1) + 1)
        oov_idx = self.vocab.get('<oov>')

        encoded_data = np.zeros((len(self.data), width))

        for row, line in enumerate(self.data):
            for token in self._tokens(line):
                idx = self.vocab.get(token)

                if idx is None:
                    if oov_idx is None:
                        continue
                    idx = oov_idx
                elif not self._keep(token):
                    continue

                encoded_data[row, idx] += 1

        return encoded_data

    def encode_data_rnn(self, test=None, max_len=None, add_start_end=None):
        """Return a padded ``LongTensor`` of token indices, one row per line.

        Args:
            test: Data to encode; ``self.data`` is used when this is ``None``.
            max_len: When truthy, each line is truncated to this many indices
                before padding.
            add_start_end: When truthy, ``<sos>``/``<eos>`` are added around
                each line (the input lines themselves are not modified).

        Returns:
            Tensor of shape ``(num_lines, longest_line)``, padded with the
            ``<pad>`` index when the vocab has one and with 0 otherwise. An
            empty input yields an empty ``(0, 0)`` tensor.
        """
        data = self.data if test is None else test
        oov_idx = self.vocab.get('<oov>')

        encoded_data = []

        for line in data:
            tokens = self._tokens(line)

            if add_start_end:
                tokens = ['<sos>', *tokens, '<eos>']

            encoded_line = []

            for token in tokens:
                idx = self.vocab.get(token)

                if idx is None:
                    # Unknown word: map to <oov> (exactly once).
                    if oov_idx is not None:
                        encoded_line.append(oov_idx)
                elif self._keep(token):
                    encoded_line.append(idx)

            if max_len:
                encoded_line = encoded_line[:max_len]

            encoded_data.append(torch.LongTensor(encoded_line))

        if not encoded_data:
            # pad_sequence raises on an empty list.
            return torch.empty((0, 0), dtype=torch.long)

        return pad_sequence(encoded_data, batch_first=True,
                            padding_value=self.vocab.get('<pad>', 0))

    def decode(self, encoded_line):
        """Map indices back to tokens, dropping special tokens and unknown indices."""
        not_decodes = {self.vocab[token] for token in self.SPECIAL_TOKENS
                       if token in self.vocab}

        decoded = []
        for value in encoded_line:
            idx = int(value)
            if idx in not_decodes or idx not in self.idx2wrd:
                continue
            decoded.append(self.idx2wrd[idx])

        return decoded

    @staticmethod
    def encodeMultiHot(line, vocab, test=None):
        """Return a count vector of length ``len(vocab)`` for the tokens of ``line``.

        Tokens missing from ``vocab`` are ignored. ``test`` is unused and kept
        for interface compatibility.
        """
        counts = np.zeros(len(vocab))

        for token in line:
            if token in vocab:
                counts[vocab[token]] += 1

        return counts

In [None]:
# Encoder over the training texts whose vocabulary comes from the pretrained
# vectors (plus the special tokens DataEncoder adds).
data_encoder = DataEncoder(data=train_x, pretrained=vector_vocab, 
                           modelFormat='rnn', pretrained_dim=pretrained_dim)

In [None]:
# Index-encode the training texts, truncating every comment to 90 tokens.
encoded_train = data_encoder.encode(max_len=90)

## Pytorch Dataset and Model


In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class rnnDataset(Dataset):
    """Dataset pairing encoded sequences with one-hot binary labels.

    Each item is a dict with keys ``'x'`` (the encoded sequence), ``'y'``
    (a length-2 one-hot float tensor) and ``'decode'`` (the sequence decoded
    back to a space-joined string by ``encoder.decode``).
    """

    def __init__(self, encoded_x, encoded_y, encoder):
        """Store the inputs.

        Args:
            encoded_x: Indexable collection of encoded sequences.
            encoded_y: Labels (0 or 1), one per entry of ``encoded_x``.
            encoder: Object exposing ``decode(sequence) -> list[str]``.

        Raises:
            ValueError: If ``encoded_x`` and ``encoded_y`` differ in length.
        """
        if len(encoded_x) != len(encoded_y):
            raise ValueError(
                f'encoded_x has {len(encoded_x)} rows but encoded_y has '
                f'{len(encoded_y)} labels')

        self.encoder = encoder
        self.encoded_x = encoded_x
        self.encoded_y = encoded_y

    def hottyY(self, int):
        """Return the one-hot tensor for a binary label.

        The parameter name shadows the builtin ``int``; it is kept unchanged
        so existing keyword callers continue to work.

        Raises:
            ValueError: If the label is neither 0 nor 1 (previously this
                silently returned ``None``).
        """
        if int == 0:
            return torch.Tensor([1, 0])
        if int == 1:
            return torch.Tensor([0, 1])
        raise ValueError(f'label must be 0 or 1, got {int!r}')

    def __len__(self):
        """Return the number of encoded sequences."""
        return len(self.encoded_x)

    def __getitem__(self, idx):
        """Return the ``{'x', 'y', 'decode'}`` dict for position ``idx``."""
        x = self.encoded_x[idx]
        y = self.hottyY(self.encoded_y[idx])
        line = self.encoder.decode(x)

        return {'x': x,
                'y': y,
                'decode': ' '.join(line)}

In [None]:
# Wrap the encoded training data and serve it in shuffled batches of 64.
training_dataset = rnnDataset(encoded_train, train_y, data_encoder)
training_dataloader = DataLoader(training_dataset, batch_size=64, shuffle=True)

In [None]:
import torch.nn as nn


In [None]:
class rnnModel(nn.Module):
    """Bidirectional GRU classifier over token-index sequences.

    Pipeline: embedding -> multi-layer bidirectional GRU -> linear layer on
    the concatenated final forward/backward hidden states of the top GRU
    layer -> element-wise sigmoid.

    The recurrent layer is stored as ``self.lstm`` although it is an
    ``nn.GRU``; the attribute name is kept so existing ``state_dict``
    checkpoints still load.
    """

    def __init__(self, vocab_size, hidden_dim, embedding_dim, output_size,
                 num_layers, pretrained=None, pretrained_vectors=None):
        """Build the layers.

        Args:
            vocab_size: Number of embedding rows (used without pretrained
                vectors).
            hidden_dim: GRU hidden size per direction.
            embedding_dim: Embedding width. With pretrained vectors the width
                of ``pretrained_vectors`` is used instead.
            output_size: Number of output units.
            num_layers: Number of stacked GRU layers.
            pretrained: When truthy, initialise the embedding from
                ``pretrained_vectors``.
            pretrained_vectors: 2-D float tensor of embeddings; its last row
                is treated as the padding index.

        Raises:
            ValueError: If ``pretrained`` is truthy but no vectors are given.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_dim
        self.output_size = output_size

        if pretrained:
            if pretrained_vectors is None:
                raise ValueError('pretrained=True requires pretrained_vectors')

            self.embed = nn.Embedding.from_pretrained(
                pretrained_vectors,
                padding_idx=len(pretrained_vectors) - 1)
            # The GRU input size must match the loaded vectors, not the
            # caller-supplied embedding_dim.
            embedding_dim = self.embed.embedding_dim

        else:
            self.embed = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.GRU(embedding_dim, hidden_dim, num_layers,
                           batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_dim * 2, output_size)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid scores of shape ``(batch, output_size)`` for index batch ``x``."""
        embeddings = self.embed(x)
        _, h_n = self.lstm(embeddings)

        # h_n is ordered layer by layer, with forward then backward state per
        # layer, so the top layer's two directions are the LAST two entries.
        # Indexing [0] and [1] would read the first layer when num_layers > 1.
        # NOTE: checkpoints trained with the old indexing and num_layers > 1
        # will score differently and should be retrained.
        concat_output = torch.cat([h_n[-2], h_n[-1]], dim=1)

        return self.sigmoid(self.fc1(concat_output))

### Model Parameters 


In [None]:
embedding_dim = pretrained_dim
hidden_dim = 512
vocab_size = len(data_encoder.vectors)
output_size = 2
NUM_EPOCHS = 10
num_layers = 2
# Fall back to CPU when no GPU is attached instead of failing at model.to().
device = 'cuda' if torch.cuda.is_available() else 'cpu'
pretrained = True
pretrained_vectors = data_encoder.vectors

In [None]:
# Instantiate the classifier with the hyper-parameters defined above,
# initialising its embedding from the encoder's pretrained vectors.
model = rnnModel(vocab_size, hidden_dim, embedding_dim, output_size,
                 num_layers, pretrained=pretrained,
                 pretrained_vectors=pretrained_vectors)

In [None]:
# Move the model's parameters to the selected device.
model.to(device)

rnnModel(
  (embed): Embedding(400004, 300, padding_idx=400003)
  (lstm): GRU(300, 512, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=2, bias=True)
  (sigmoid): Sigmoid()
)

## Training

In [None]:
from torch.optim.lr_scheduler import StepLR

In [None]:
def save_model(model, model_path, epoch_num):
    """Write ``model``'s state dict to ``<model_path>trained_model_epoch_<epoch_num>.pt``.

    ``model_path`` is used as a plain string prefix, so it should end with a
    path separator when it names a directory.
    """
    weights = model.state_dict()
    destination = f'{model_path}trained_model_epoch_{epoch_num}.pt'
    torch.save(weights, destination)

In [None]:
# Binary cross-entropy on the model's sigmoid outputs against the one-hot
# targets produced by rnnDataset.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
# NOTE(review): the training loop steps the scheduler once per epoch and
# NUM_EPOCHS is 10, so with step_size=30 the learning rate is never reduced.
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)


In [None]:
model.train()

# Log about ten times per epoch. Clamp to at least 1 so very small
# dataloaders cannot make the modulo below divide by zero.
num_batches = len(training_dataloader)
log_every = max(1, round(num_batches * 0.10))

for epoch_num in tqdm(range(1, NUM_EPOCHS + 1)):

    avg_loss = 0

    for idx, i in tqdm(enumerate(training_dataloader, start=1), total=num_batches):

        x = i['x'].to(device)
        y = i['y'].to(device)

        # The model already returns (batch, output_size). squeeze(0) would drop
        # the batch dimension for a final batch of one sample and make the
        # shapes of out and y disagree.
        out = model(x)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()
        if idx % log_every == 0:
            tqdm.write(f'Avg Loss: {avg_loss / idx}')

    scheduler.step()

    # Checkpoint every second epoch.
    if epoch_num % 2 == 0:
      save_model(model, ROOT, epoch_num)
      tqdm.write(f'Outputted model to {ROOT} at Epoch {epoch_num}')

    tqdm.write(f'\n\n{"---"*5}\nAvg Epoch Loss: {avg_loss / max(1, num_batches)}\n{"---"*5}\n')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.43536203168332577
Avg Loss: 0.38199734737953317
Avg Loss: 0.35305733792483807
Avg Loss: 0.3406024804123892
Avg Loss: 0.3295827442350296
Avg Loss: 0.3261186270616375
Avg Loss: 0.3199544059583446
Avg Loss: 0.3151610843085039
Avg Loss: 0.31135640360223943


---------------
Avg Epoch Loss: 0.30876405567362686
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.26059860908068144
Avg Loss: 0.26355224013185274
Avg Loss: 0.26352596500267583
Avg Loss: 0.26477050478570163
Avg Loss: 0.2641725938050793
Avg Loss: 0.26303143279913527
Avg Loss: 0.2635822079072778
Avg Loss: 0.26272393902763724
Avg Loss: 0.262441681093003
Outputted model to /content/drive/MyDrive/toxicLanguageDetection/ at Epoch 2


---------------
Avg Epoch Loss: 0.2627775843304712
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.2432431375178007
Avg Loss: 0.23693705894626105
Avg Loss: 0.23710114000221857
Avg Loss: 0.24007934575470594
Avg Loss: 0.23853258182509587
Avg Loss: 0.23793122111461484
Avg Loss: 0.23988584002254756
Avg Loss: 0.24201889854605094
Avg Loss: 0.24133060708578324


---------------
Avg Epoch Loss: 0.2421869596493417
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.21718492136838344
Avg Loss: 0.20487522416246626
Avg Loss: 0.205089424450237
Avg Loss: 0.20748243873150876
Avg Loss: 0.20912896682723212
Avg Loss: 0.21181674256252173
Avg Loss: 0.21015729034667488
Avg Loss: 0.21093483690996295
Avg Loss: 0.21050281777309302
Outputted model to /content/drive/MyDrive/toxicLanguageDetection/ at Epoch 4


---------------
Avg Epoch Loss: 0.21100461529792794
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.15112325699570087
Avg Loss: 0.1504103397556509
Avg Loss: 0.15171290129327622
Avg Loss: 0.15392203595883286
Avg Loss: 0.1550451588243819
Avg Loss: 0.15530064564126617
Avg Loss: 0.15798272588546133
Avg Loss: 0.15930395932921854
Avg Loss: 0.15886766898135343


---------------
Avg Epoch Loss: 0.15903895747834357
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.09219808452046262
Avg Loss: 0.09505791986325327
Avg Loss: 0.093336042876427
Avg Loss: 0.0931878464583021
Avg Loss: 0.09498603516974702
Avg Loss: 0.09566642622051474
Avg Loss: 0.09655484016396228
Avg Loss: 0.09757589080142609
Avg Loss: 0.09782923524701792
Outputted model to /content/drive/MyDrive/toxicLanguageDetection/ at Epoch 6


---------------
Avg Epoch Loss: 0.09869460468472922
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.05164281130195237
Avg Loss: 0.05377393931848928
Avg Loss: 0.052694108516264423
Avg Loss: 0.05351126953386343
Avg Loss: 0.055693595111370085
Avg Loss: 0.05746935850877959
Avg Loss: 0.057990970386187085
Avg Loss: 0.05910641331921887
Avg Loss: 0.06146742769369744


---------------
Avg Epoch Loss: 0.06276322438737045
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.044084864968541436
Avg Loss: 0.04107491399582404
Avg Loss: 0.0422005441893513
Avg Loss: 0.044297887996295825
Avg Loss: 0.04481139855924994
Avg Loss: 0.045243661999195
Avg Loss: 0.04605676741262509
Avg Loss: 0.04730737499798684
Avg Loss: 0.047712769944909326
Outputted model to /content/drive/MyDrive/toxicLanguageDetection/ at Epoch 8


---------------
Avg Epoch Loss: 0.048953958480182476
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.041740094418995656
Avg Loss: 0.04133466646845381
Avg Loss: 0.04116357172143836
Avg Loss: 0.04051147331287771
Avg Loss: 0.04073312289121812
Avg Loss: 0.04141778183764426
Avg Loss: 0.04202260616236075
Avg Loss: 0.04285414264366129
Avg Loss: 0.04393614882689853


---------------
Avg Epoch Loss: 0.045476052223295337
---------------



  0%|          | 0/1035 [00:00<?, ?it/s]

Avg Loss: 0.041709220954754316
Avg Loss: 0.04380280160918259
Avg Loss: 0.04130872355427784
Avg Loss: 0.04051942626952391
Avg Loss: 0.04176270498025517
Avg Loss: 0.04265560635375993
Avg Loss: 0.04349881814438389
Avg Loss: 0.044892371559119335
Avg Loss: 0.045850692896230116
Outputted model to /content/drive/MyDrive/toxicLanguageDetection/ at Epoch 10


---------------
Avg Epoch Loss: 0.04688779363242217
---------------



## Evaluation 

In [None]:
# Build the path from ROOT (which ends with '/') like the other data paths,
# rather than repeating the absolute Drive location.
eval_data = pd.read_csv(f'{ROOT}data/processed_test.csv')

In [None]:
# NOTE: from here on train_x / train_y hold the sampled EVALUATION texts and
# labels; the names are reused and no longer refer to the training split.
train_x, train_y = balance_dataset(eval_data, MAX_TOXIC=6243, MAX_DF=15000)

  0%|          | 0/153164 [00:00<?, ?it/s]

In [None]:
# Report how many sampled evaluation labels fall in each class.
print(f"Num Toxic: {train_y.count(1)}\nNum Non-toxic: {train_y.count(0)}")

Num Toxic: 6243
Num Non-toxic: 15000


In [None]:
# Truncate to 90 tokens exactly as the training data was encoded; without a
# limit every comment is padded to the longest comment in the whole set,
# a sequence length the model never saw during training.
encoded_test = data_encoder.encode(test=train_x, max_len=90)
test_dataset = rnnDataset(encoded_test, train_y, data_encoder)
# Do not shuffle at evaluation time, so predictions stay in dataset order.
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=32)

In [None]:
model.eval()

eval_loss = 0
eval_accuracy = 0

preds = []
stop = 20000
verbose = False 
ys = []
correct = 0

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():

    for idx, i in enumerate(tqdm(test_dataloader)):

        # Optional cap on the number of evaluated batches.
        if idx == stop:
            break

        x = i['x'].to(device)
        y = i['y'].to(device)

        # Keep the (batch, output_size) shape; squeeze(0) would drop the batch
        # dimension for a final batch of one and break argmax(dim=1) below.
        pred = model(x)

        #loss = criterion(pred, y)
        #eval_loss+= loss.item()

        # Collect true and predicted class indices in the same batch order.
        ys += y.argmax(dim=1).tolist()
        preds += pred.argmax(dim=1).tolist()
    
#     if pred.argmax().item() == y.argmax().item():
#         eval_accuracy += 1

#         if verbose:
#           print(i['decode'])
#           print('\n')
        
#     preds.append(pred.argmax().item())

# print(f'Avg Eval Loss: {eval_loss / len(test_dataset)}\nEval Accuracy: {eval_accuracy / len(test_dataset)}')

  0%|          | 0/664 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Per-class precision/recall/F1 using the labels and predictions collected
# batch-by-batch in the evaluation loop.
print(classification_report(ys, preds))

              precision    recall  f1-score   support

           0       0.94      0.72      0.82     15000
           1       0.57      0.88      0.69      6243

    accuracy                           0.77     21243
   macro avg       0.75      0.80      0.75     21243
weighted avg       0.83      0.77      0.78     21243



In [None]:
# Compare predictions with the labels collected in the SAME batch order (ys).
# train_y is in dataset order while the evaluation loader was shuffled, so
# pairing preds with train_y mismatches samples. Arguments are
# (y_true, y_pred): rows are true classes, columns are predicted classes.
cm = confusion_matrix(ys, preds)

In [None]:
cm

array([[8224, 3374],
       [6776, 2869]])

In [None]:
# Move the model's parameters back to the CPU.
model.cpu()

rnnModel(
  (embed): Embedding(400004, 300, padding_idx=400003)
  (lstm): GRU(300, 512, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=2, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
def test_model(model, sent):
    """Classify one sentence and print ``'Toxic'`` or ``'Not Toxic'``.

    The sentence is encoded with the module-level ``data_encoder`` and moved
    to whichever device the model's parameters live on, so this works for
    both CPU and GPU models. The model is put in eval mode for the prediction
    and its previous training/eval mode is restored afterwards.

    Args:
        model: Trained classifier returning one score per class.
        sent: Sentence as a space-separated string.
    """
    was_training = model.training
    model.eval()

    try:
        encoded = data_encoder.encode(test=[sent])
        target_device = next(model.parameters()).device

        # Inference only: no gradients needed.
        with torch.no_grad():
            out = model(encoded.to(target_device))
    finally:
        model.train(was_training)

    # Class index 1 is the toxic class (one-hot [0, 1] in rnnDataset).
    if out.argmax().item():
        print('Toxic')
    else:
        print('Not Toxic')


In [None]:
# Save the final weights under ROOT as 'trained_model_epoch_20.pt'.
# NOTE(review): NUM_EPOCHS above is 10, so the 20 in the file name does not
# correspond to a trained epoch count visible in this notebook - confirm.
save_model(model, ROOT, 20)