In [1]:
import praw
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import os
import re
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
reddit = praw.Reddit(
    client_id="S36we03-XFf2ItQ9DHLz5Q",
    client_secret="Rnvwro-H7RebWsdrL3AfYjEgyWCegQ",
    user_agent="clinical_trials"
)

In [3]:
PATH = "../data"

In [4]:
MODEL_PATH = "../models/bert.pth"

# Acquiring data

Here I'm searching through the entirety of reddit for posts and comments related to clinical trials. I'm limited by the number of API calls allowed by Reddit, however, it's enough to train a basic sentiment analyzer on.

In [5]:
keyword = "clinical trial"

In [6]:
def username_saver(data, filename):
    username = data.author.name if data.author else "[deleted]"
    with open(os.path.join(PATH + "/names" + filename), "w", encoding="utf-8") as file:
        file.write(username)

In [7]:
def searcher(keyword):
    for subreddit_name in ["AskDocs", "healthcare", "medicine", "clinicaltrials", "clinicalresearch"]:
        subreddit = reddit.subreddit(subreddit_name)
        print("Searching posts in subreddit:", subreddit_name)
        count = 0
        for post in subreddit.search(keyword, sort='relevance', syntax='lucene', limit = 100):
            temp = str(post.title) + " " + str(post.selftext)
            temp_post_name = "/post" + str(count) + ".txt"
            with open(PATH + temp_post_name, "w", encoding="utf-8") as dataset:
                dataset.write(temp)
            username_saver(post, temp_post_name)
            count += 1
            # if count == 99:
            #     break

            post.comments.replace_more(limit=None)
            counter = 0
            for comment in post.comments.list():
                temp = str(comment.body)
                temp_comment_name = "/comment" + str(counter) + ".txt"
                with open(PATH + temp_comment_name, "w", encoding="utf-8") as dataset:
                    dataset.write(temp)
                username_saver(comment, temp_comment_name)
                counter += 1
                if counter == 100:
                    break
            if count == 99:
                break


In [21]:
searcher(keyword)

Searching posts in subreddit: AskDocs


KeyboardInterrupt: 

I have limited API calls allowed through Reddit's free tier API, I tried to get all comments, usernames and posts from the most popular subreddits for such queries. 

##### Future condiderations:
1. Create a more detailed dataframe: In order to send messages to particular users, I would create a more detailed dataframe which would include the username and the analyzed sentiment (see details below).
2. Given a sentiment, send messages to the user about their interest in clinical trials.

# Building the model

## Preprocessing

The data_loader and data_cleaner functions load the data from the dataset and clean it, respectively

In [8]:
def data_loader(path):
    data = []
    for filename in os.listdir(path):
        if not os.path.isdir(os.path.join(path, filename)) and filename != "names":
            with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
                text = file.read().strip()
                data.append(text)
    return data

def data_cleaner(df_iter):
    df_processed = []
    for text in df_iter:
        text = re.sub(r'\d+', '', text)  
        text = re.sub(r'[^\w\s]', '', text)  
        df_processed.append(text)
    return df_processed


In [9]:
dataset = data_cleaner(data_loader(PATH))

### Using pre-trained BERT with our acquired data for fine tuning

I'm limited by the data that I have, so I am using an uncased - BERT (doesn't differentiate between uppercase and lowercase letters), which is pretrained on la large corpus of English words. This model is fine tuned on the scraped data.

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [tokenizer.encode(post, add_special_tokens=True, max_length=512, truncation=True) for post in dataset]



In [11]:
max_len = max([len(post) for post in tokenized_texts])
padded_tokenized_texts = [post + [0]*(512-len(post)) for post in tokenized_texts]

In [12]:
input_ids = torch.tensor(padded_tokenized_texts)

In [13]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
EPOCHS = 1
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

In [15]:
dataset = TensorDataset(input_ids)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [16]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

In [17]:
device = "cpu"

In [18]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0]}
        outputs = model(**inputs)
        logits = outputs[0]
        optimizer.zero_grad()
        loss = criterion(logits, torch.zeros(logits.shape[0], dtype=torch.long, device=device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Average loss for epoch {epoch+1}: {total_loss / len(dataloader)}")

Epoch 1:   0%|                                                                                  | 0/15 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1: 100%|██████████████████████████████████████████████████████████████████████| 15/15 [1:07:07<00:00, 268.53s/it]

Average loss for epoch 1: 0.19224151223897934





Not overly impressive results-but we won't know for sure unless we test it out.

In [19]:
torch.save(model.state_dict(), MODEL_PATH)

the model has been trained, let's check out the architecture of the network

In [20]:
model.load_state_dict(torch.load(MODEL_PATH))
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
predictions = []

In [None]:
for batch in tqdm(dataloader, desc="Testing"):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]
    predicted_classes = torch.argmax(logits, dim=1).tolist()  # Convert to list instead of item()
    predictions.extend(predicted_classes)

In [None]:
print(predictions)

In [None]:
pd.DataFrame(
    dict(
        zip(
            data_cleaner(data_loader(PATH)), predictions
        )
    ).items(),
    columns=['Data', 'Predictions']
)

It seems that the fine-tuned BERT model is not robust. Here's what I would do to make a robust model.
1. Pre-train BERT on large amounts of data
2. Use an annotator such as Scale AI's Data Engine to label a few samples (Huggingface recommends about 1000)
3. Fine tune BERT on the labelled data and get a robust predictor.