## Load Encoder

In [1]:
import pickle 
from utils.dataEncoder import DataEncoder

In [2]:
# Restore the encoder object that was pickled to ./dataEncoder.pkl.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load encoder pickles that come from a trusted source.
with open('./dataEncoder.pkl', 'rb') as file:
    data_encoder = pickle.load(file)
    


## Load Model

In [3]:
from utils.model import rnnModel
import torch

In [4]:
# Location of the trained checkpoint (epoch 20, going by the file name).
# The checkpoint is consumed further down through
# `model.load_state_dict(torch.load(model_path))`, i.e. it stores a state
# dict rather than a ready-to-use module. It is therefore deliberately NOT
# loaded into `model` here: doing so would bind `model` to a plain mapping of
# tensors and deserialize the file a second time for no benefit.
model_path = './models/trained_model_epoch_20.pt'

### Config


In [5]:
# --- Values read from the loaded encoder ---------------------------------
pretrained_vectors = data_encoder.vectors
vocab_size = len(data_encoder.vectors)
embedding_dim = data_encoder.pretrained_dim

# --- Network hyper-parameters ---------------------------------------------
# NOTE(review): these presumably have to match the values used when the
# checkpoint was trained, otherwise load_state_dict will reject it - confirm.
hidden_dim = 512
num_layers = 2
output_size = 2

# --- Runtime settings -------------------------------------------------------
pretrained = True
device = 'cpu'

In [6]:
# Rebuild the network from the config-cell hyper-parameters, then restore
# the trained weights from the checkpoint file.
model = rnnModel(
    vocab_size,
    hidden_dim,
    embedding_dim,
    output_size,
    num_layers,
    pretrained=pretrained,
    pretrained_vectors=pretrained_vectors,  # config alias of data_encoder.vectors
)

# map_location remaps tensors that were saved from another device (e.g. a
# GPU) onto `device`, so the checkpoint also loads on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

# Keep the weights on the same device the evaluation inputs are moved to.
model.to(device)

# eval() switches the module to inference mode and returns the module itself,
# so leaving it as the last expression also displays the architecture.
model.eval()

rnnModel(
  (embed): Embedding(400004, 300, padding_idx=400003)
  (lstm): GRU(300, 512, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=2, bias=True)
  (sigmoid): Sigmoid()
)

In [7]:
def test_model(model, sent):

    out = model(data_encoder.encode(test=[sent]))
    if out.argmax().item():
        print('Toxic')
    else:
        print('Not Toxic')


In [8]:
def balance_dataset(df, MAX_TOXIC=None, MAX_DF=None, seed=None):
    """Build a shuffled, size-capped list of comments and labels from `df`.

    Rows whose `toxic` column equals 1 are kept with label 1 and rows whose
    `toxic` column equals 0 with label 0; rows holding any other value are
    ignored. Within each class the first rows (in frame order) are kept up to
    that class's cap, the two classes are concatenated (toxic first) and the
    combined list is shuffled.

    Args:
        df: DataFrame with a `comment` column and a `toxic` column. Any index
            is accepted; it does not have to be the default 0..n-1 range.
        MAX_TOXIC: Maximum number of label-1 rows to keep. `None` means no
            cap; 0 (or a negative value) keeps none.
        MAX_DF: Maximum number of label-0 rows to keep. `None` means no cap;
            0 (or a negative value) keeps none.
        seed: Optional seed for the shuffle. When given, a private
            `random.Random(seed)` is used so the resulting order is
            reproducible; when `None` the global `random` state is used.

    Returns:
        A pair `(comments, labels)` of equal-length lists where `labels[i]`
        is the int 0 or 1 belonging to `comments[i]`.
    """
    if len(df) == 0:
        return [], []

    # Compare against None instead of relying on truthiness, so an explicit
    # cap of 0 means "keep none" rather than silently meaning "no cap".
    toxic_cap = len(df) if MAX_TOXIC is None else max(MAX_TOXIC, 0)
    clean_cap = len(df) if MAX_DF is None else max(MAX_DF, 0)

    # Boolean-mask selection works for any index (label lookups by position
    # number would raise on a filtered or re-indexed frame) and avoids a
    # Python-level loop over every row.
    toxic_comments = df.loc[df['toxic'] == 1, 'comment'].head(toxic_cap).tolist()
    clean_comments = df.loc[df['toxic'] == 0, 'comment'].head(clean_cap).tolist()

    dataset = [(text, 1) for text in toxic_comments]
    dataset += [(text, 0) for text in clean_comments]

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(dataset)

    comments = [text for text, _ in dataset]
    labels = [label for _, label in dataset]
    return comments, labels

## Load Testing Data

In [9]:
import pandas as pd
from tqdm.auto import tqdm
import random

In [10]:
# Read the preprocessed test split from disk.
eval_data = pd.read_csv('./data/processed_test.csv')

# Cut it down to a capped, shuffled sample of comments and labels.
# NOTE(review): despite the `train_` prefix these hold evaluation data; the
# names are kept because later cells refer to them.
train_x, train_y = balance_dataset(
    eval_data,
    MAX_TOXIC=6243,
    MAX_DF=15000,
)

  0%|          | 0/153164 [00:00<?, ?it/s]

In [11]:
from torch.utils.data import DataLoader
import torch
from utils.preprocessing import rnnDataset

In [12]:
# Encode the sampled comments with the loaded encoder.
# NOTE(review): max_len=None presumably disables truncation/padding, which
# would explain the batch size of 1 below - confirm against DataEncoder.
encoded_test = data_encoder.encode(test=train_x, max_len=None)

# Pair the encoded comments with their labels and serve them one at a time.
test_dataset = rnnDataset(encoded_test, train_y, data_encoder)
test_dataloader = DataLoader(
    test_dataset,
    shuffle=True,
    batch_size=1,
)

In [16]:
# Bookkeeping variables. NOTE(review): eval_loss, eval_accuracy, correct and
# verbose are not read by any cell visible here; they are kept in case other
# cells depend on them.
eval_loss = 0
eval_accuracy = 0
correct = 0
verbose = False

# Only the first `stop` batches are scored; with batch_size=1 that is `stop`
# samples.
stop = 20

# Reset on every run so re-executing the cell does not accumulate duplicates.
preds = []
ys = []

# Inference only: no_grad() keeps autograd from recording every forward
# pass, which saves memory and time when no backward pass will follow.
with torch.no_grad():
    for idx, batch in enumerate(tqdm(test_dataloader)):

        if idx == stop:
            break

        x = batch['x'].to(device)
        y = batch['y'].to(device)

        pred = model(x)

        # Both tensors are reduced to class indices along dim 1, so `ys` and
        # `preds` stay aligned position by position.
        ys += y.argmax(dim=1).tolist()
        preds += pred.argmax(dim=1).tolist()
    

  0%|          | 0/21243 [00:00<?, ?it/s]

In [17]:
# Predicted class index (argmax of the model output) for each evaluated
# sample, in evaluation order.
preds

[1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0]

In [18]:
# Reference class index (argmax of the label tensor) for each evaluated
# sample, aligned position by position with `preds`.
ys

[1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1]

In [26]:
from sklearn.metrics import confusion_matrix, classification_report

In [24]:
# Confusion matrix of the reference labels (`ys`) against the predictions
# (`preds`).
# NOTE(review): `cm` is only assigned here; no visible cell displays it.
cm = confusion_matrix(ys, preds)

In [27]:
# Per-class precision / recall / F1 for the evaluated samples. The report is
# a multi-line string, so it is printed to render its line breaks.
# NOTE(review): only `stop` (= 20) samples were scored above, so these
# figures rest on a very small sample.
print(classification_report(ys, preds))

              precision    recall  f1-score   support

           0       0.80      0.73      0.76        11
           1       0.70      0.78      0.74         9

    accuracy                           0.75        20
   macro avg       0.75      0.75      0.75        20
weighted avg       0.76      0.75      0.75        20

