In [1]:
import classification as clf
import regression as rf
import torch
import numpy as np
import pandas as pd
import lime
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from lime import lime_text
from lime.lime_text import LimeTextExplainer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

2024-04-15 16:12:52.750291: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
# Locations of the train / test CSV splits, relative to this notebook.
FILEPATH_TRAIN = '../res/train.csv'
FILEPATH_TEST = '../res/test.csv'

# Shared hyper-parameters; BATCH_SIZE is the batch size given to the DataLoaders.
MAX_LENGTH = 256
BATCH_SIZE = 64

# Read both splits as dataframes with the project loader.
train, test = (
    clf.load_dataframe(csv_path)
    for csv_path in (FILEPATH_TRAIN, FILEPATH_TEST)
)

In [3]:
 device = (
        'cuda'
        if torch.cuda.is_available()
        else 'cpu'
    )
print(f'Using {device} device')

Using cpu device


In [4]:
# Load the pretrained 'bert-base-uncased' tokenizer; it is passed to
# clf.ReviewDataset when the train / test datasets are built.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [5]:
# Build the tokenised datasets for both splits and report their sizes.
train_dataset = clf.ReviewDataset(FILEPATH_TRAIN, tokenizer)
print(f'{len(train_dataset)} training samples loaded')

test_dataset = clf.ReviewDataset(FILEPATH_TEST, tokenizer)
print(f'{len(test_dataset)} testing samples loaded')

# One loader per split, both with the shared batch size (default ordering).
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=BATCH_SIZE)
    for split in (train_dataset, test_dataset)
)

# Instantiate the rating classifier and move it to the selected device.
model = clf.RatingModel(hidden_size=1_000).to(device)

72608 training samples loaded
24315 testing samples loaded


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [6]:
# Print the module tree of the classifier to inspect its layers.
print(model)

RatingModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [7]:
# Restore the fine-tuned binary-classification weights into `model`.
# The checkpoint is deserialised onto the CPU first; load_state_dict then
# copies the tensors into the existing parameters.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
PATH_BINARY = "./binaryclass_model_20240414_103223_8"
model.load_state_dict(
    torch.load(
        PATH_BINARY,
        map_location=torch.device('cpu'),
    )
)

<All keys matched successfully>

In [8]:
def batch_predict(reviews):
    """Score a collection of review texts with the fine-tuned classifier.

    This is the prediction callback handed to LIME: it receives raw strings
    and returns one row of model outputs per string.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device``,
    ``MAX_LENGTH`` and ``BATCH_SIZE``.

    Args:
        reviews: Iterable of review strings (list or array of ``str``).

    Returns:
        numpy.ndarray: One row per review, holding whatever the model emits
        per review (the saved run of this notebook shows two columns).

    Raises:
        ValueError: If ``reviews`` is empty.

    NOTE(review): in the saved output a row does not sum to 1
    (0.213 + 0.727), so the values do not look like a normalised
    distribution, which LIME expects - confirm against the model head.
    """
    texts = list(reviews)
    if not texts:
        raise ValueError('batch_predict needs at least one review')

    model.eval()
    chunks = []
    # Inference only: skipping autograd bookkeeping keeps memory bounded.
    with torch.no_grad():
        # LIME can pass thousands of perturbed texts in one call, so feed them
        # through the model in loader-sized chunks instead of one huge batch.
        for start in range(0, len(texts), BATCH_SIZE):
            tokens = tokenizer(
                texts[start:start + BATCH_SIZE],
                return_tensors='pt',
                padding='max_length',
                max_length=MAX_LENGTH,
                truncation=True)
            # The model is called with a plain dict holding these two keys.
            reviews_inputs = dict(
                input_ids=tokens['input_ids'].to(device),
                attention_mask=tokens['attention_mask'].to(device))
            chunks.append(model(reviews_inputs).cpu())
    return torch.cat(chunks, dim=0).numpy()

In [9]:
# Distinct drug names in the test split, plus how many rows mention each one.
unique_drugs, freq_drugs = np.unique(
    test['drug_name'].values,
    return_counts=True,
)

In [10]:
# Number of distinct drug names found in the test split.
len(unique_drugs)

2637

In [11]:
def get_freq(x_number):
    """Print a randomly chosen drug that occurs at least ``x_number`` times.

    Reads the notebook-level ``unique_drugs`` / ``freq_drugs`` arrays and
    prints ``<count> <drug name>`` for the drug that was drawn.

    Args:
        x_number: Minimum number of occurrences a drug must have.

    Raises:
        ValueError: If no drug reaches ``x_number`` occurrences.
    """
    # Collect every qualifying index first. This considers all drugs (not just
    # a hard-coded index range), cannot loop forever, and always has a drug
    # name to report.
    candidates = np.flatnonzero(freq_drugs >= x_number)
    if candidates.size == 0:
        raise ValueError(f'no drug occurs at least {x_number} times')
    x = np.random.choice(candidates)
    print(freq_drugs[x], unique_drugs[x])

In [12]:
# Sample a drug that occurs at least 20 times in the test split.
get_freq(20)

53 moviprep


Get the reviews for a single drug, keeping only ratings 1 and 10 (relabelled as binary classes):

In [13]:
# Keep only the extreme ratings (1 and 10) and relabel them as binary classes:
# 0 for rating 1, 1 for rating 10. Filtering and relabelling happen in one
# chain that yields a fresh frame, so no column is assigned on a slice
# (which raised SettingWithCopyWarning).
# NOTE: this rebinds `test`; re-run the loading cell before re-running this
# one, otherwise the already relabelled 0/1 ratings are filtered again.
test = (
    test.loc[test.rating.isin([1, 10])]
    .assign(rating=lambda frame: np.where(frame.rating < 5, 0, 1))
)

# Reviews of the drug that is inspected in the rest of the notebook.
rev_drug = test[test.drug_name == 'restoril']
revws = rev_drug.review.values

In [14]:
# Display the reviews selected for the chosen drug.
rev_drug

Unnamed: 0,drug_name,condition,review,rating,useful_count
178,restoril,insomnia,Due to major anxiety and stress can&#039;t sle...,0,23
8480,restoril,insomnia,This medicine helps me relax won&#039;t feel d...,1,43
11794,restoril,insomnia,I took this medicine for the first time last n...,0,10
12713,restoril,insomnia,Temazapam is the only med that treats my insom...,1,79
24758,restoril,insomnia,I feel like I&#039;ve been on very sleeping ai...,1,75
25381,restoril,insomnia,I could easily fall asleep within 10-15 minute...,1,124
34726,restoril,insomnia,I was prescribed this medication yesterday aft...,0,77
35175,restoril,insomnia,Restoril helps me get to sleep and helps preve...,1,47
38309,restoril,insomnia,I have been taking Restoril for 2 weeks now wi...,0,35
45543,restoril,insomnia,This is the best sleeping pill I&#039;ve ever ...,1,208


In [15]:
# Run the classifier on every selected review (one output row per review).
probs = batch_predict(revws)

In [16]:
# Sanity check: rows correspond to reviews, columns to the model outputs.
probs.shape

(12, 2)

In [17]:
# Model outputs for the review at index 7.
probs[7]

array([0.21326236, 0.72694135], dtype=float32)

In [18]:
# Explain the prediction for one review with LIME.
idx = 7
class_names = ["negative", "positive"]

# LIME builds its explanation from randomly perturbed copies of the text;
# fixing the seed makes the reported weights reproducible across runs.
LIME_SEED = 42
explainer = LimeTextExplainer(class_names=class_names, random_state=LIME_SEED)
exp = explainer.explain_instance(
    revws[idx],
    batch_predict,
    num_samples=200,
    num_features=6,
)

print('Document id: %d' % idx)
print('Probability(positive) =', batch_predict([revws[idx]])[0, 1])
print('True class: %s' % class_names[rev_drug.rating.iloc[idx]])

Document id: 7
Probability(positive) = 0.7269413
True class: positive


In [19]:
# List the explanation as (token, weight) pairs.
exp.as_list()

[('helps', 0.19044115079969853),
 ('no', 0.12514235480081307),
 ('side', 0.057565135283629845),
 ('have', 0.05241235545698793),
 ('me', -0.05064154557475502),
 ('prevent', -0.036142379420220146)]