In [1]:
import re
import os
import difflib
import random, pickle
import numpy as np
import pandas as pd
from tika import parser
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

In [2]:
def remove_punc(pdf_content):
    """Delete bullet glyphs, stray symbols and template placeholders from text.

    Every marker is removed with a literal (non-regex) substring replacement,
    applied in the order listed. The order matters: multi-character markers
    such as '[…]' or '(Insert Scale)' are handled before the single characters
    they contain.

    Args:
        pdf_content: Text to clean.

    Returns:
        The text with all occurrences of the listed markers removed.
    """
    unwanted_markers = (
        '• ', '· ', '&', '~', ' o ', '\uf0a7', '\uf03c', '\uf0b7',
        '–', '()', '[…]', '| ', '© ', '(Insert Scale)', '_', '%', '[', ']', 'Ü ',
    )
    cleaned = pdf_content
    for marker in unwanted_markers:
        cleaned = cleaned.replace(marker, '')
    return cleaned

def remove_bulleted_points(pdf_content):
    """Strip enumeration markers and dot leaders, then collapse whitespace.

    The substitutions run in a fixed order:
      1. Runs of dots followed by a number (optionally after one space), as in
         a table of contents, become a single '.'; remaining dot runs are
         collapsed to one '.'.
      2. Numeric list markers are removed: "(12)", "12)" and "12.".
      3. Letter / roman-style markers are removed: "(a)", " a)", "(ii)", " ii)".
      4. Any run of two or more whitespace characters becomes a single space.

    Args:
        pdf_content: Text to clean.

    Returns:
        The cleaned text.
    """
    pdf_content = re.sub(r'\.+ [0-9]+', '.', pdf_content)
    pdf_content = re.sub(r'\.+[0-9]+', '.', pdf_content)
    pdf_content = re.sub(r'\.+', '.', pdf_content)

    pdf_content = re.sub(r'\([0-9]+\)', '', pdf_content)
    pdf_content = re.sub(r'[0-9]+\)', '', pdf_content)
    # The dot is escaped on purpose: an unescaped '.' matches ANY character and
    # would delete every number in the text together with whatever follows it
    # (e.g. "to 25 clinics" -> "to clinics"), not just "1." style enumerators.
    pdf_content = re.sub(r'[0-9]+\.', '', pdf_content)
    pdf_content = re.sub(r'\([a-zA-Z]\)', '', pdf_content)
    pdf_content = re.sub(r' [a-zA-Z]\)', '', pdf_content)
    pdf_content = re.sub(r'\(i+\)', '', pdf_content)
    pdf_content = re.sub(r' i+\)', '', pdf_content)

    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    pdf_content = re.sub(r'\s\s+', ' ', pdf_content)
    return pdf_content

def remove_url(pdf_content):
    """Remove URLs from text.

    Three passes are applied in order:
      1. "http://" / "https://" followed by any run of non-whitespace.
      2. "www." followed by any run of non-whitespace.
      3. Any leftover bare "http://" / "https://" scheme (one that was followed
         by whitespace and therefore not caught by the first pass).

    Matches are removed in place with ``re.sub``. Collecting the matches first
    and deleting them with ``str.replace`` is not equivalent: when one URL is a
    prefix of another ("http://a.com" and "http://a.com/b"), deleting the
    shorter one first leaves the tail of the longer one ("/b") in the text.

    Args:
        pdf_content: Text to clean.

    Returns:
        The text with URLs removed (surrounding whitespace is left untouched).
    """
    pdf_content = re.sub(r'https?://\S+', '', pdf_content)
    # Escaped dot so that only a literal "www." prefix is treated as a URL.
    pdf_content = re.sub(r'www\.\S+', '', pdf_content)
    pdf_content = re.sub(r'https?://', '', pdf_content)
    return pdf_content

def filter_sentences_by_length(pdf_sentence):
    """Keep sentences whose token count is strictly between 4 and 200.

    Args:
        pdf_sentence: Iterable of sentence strings.

    Returns:
        A list with the sentences for which ``word_tokenize`` yields more than
        4 and fewer than 200 tokens, in their original order.
    """
    # The chained comparison evaluates the token count once per sentence, so
    # each sentence is tokenized a single time instead of twice.
    return [s for s in pdf_sentence if 4 < len(word_tokenize(s)) < 200]

In [3]:
# Load the annotated recommendations, keep the document-name and text columns,
# and drop rows in which both of them are missing. `axis`/`how` are passed by
# keyword: positional arguments to DataFrame.dropna are deprecated and are
# keyword-only in pandas 2.
recs = pd.read_csv('cleaned_recs.csv')[['Document File Name ', 'Recommendation text']].dropna(axis=0, how='all')
file_mapping = pd.read_csv("file_mapping.csv")

# Attach the indexed file name to each recommendation; the inner join drops
# recommendations whose document has no entry in the mapping.
merged = pd.merge(recs, file_mapping, left_on="Document File Name ", right_on="original_name", how='inner')

# Recommendations from documents 12.pdf and 9.pdf are held out as the test set.
is_test_doc = merged.indexed_name.isin(['12.pdf', '9.pdf'])
test = merged.loc[is_test_doc]
train = merged.loc[~is_test_doc]

# sentences = []
# indexed_corpus = os.path.join("..", "indexed_corpus")
# for i in range(1, 16):
    
#     pdf_path = os.path.join(indexed_corpus, f"{i}.pdf")
#     parsed_pdf = parser.from_file(pdf_path)
#     pdf_content = parsed_pdf['content'].replace('\n', ' ').replace(';', '.').strip()
#     pdf_content = remove_punc(pdf_content)
#     pdf_content = remove_bulleted_points(pdf_content)
#     pdf_content = remove_url(pdf_content)
#     pdf_content = remove_punc(pdf_content)
#     pdf_content = re.sub(r'\.+', '.', pdf_content)
#     pdf_content = re.sub(r'\s\s+', ' ', pdf_content)
    
#     pdf_sentence = sent_tokenize(pdf_content)
#     filtered_sentence = filter_sentences_by_length(pdf_sentence)
#     sentences += filtered_sentence

# len(sentences)

  recs = pd.read_csv('cleaned_recs.csv')[['Document File Name ', 'Recommendation text']].dropna(0, 'all')


In [21]:
def save_list_to_pickle(data, filename):
    """Serialize `data` with pickle and write it to `filename`.

    Args:
        data: Object to pickle.
        filename: Destination path; the file is opened in binary write mode,
            so an existing file is overwritten.
    """
    with open(filename, mode='wb') as handle:
        pickle.Pickler(handle).dump(data)

# Persist the recommendation texts of the merged frame as a plain Python list.
save_list_to_pickle(
    merged.loc[:, 'Recommendation text'].tolist(),
    "recs.pkl",
)

In [4]:
def load_list_from_pickle(filename):
    """Read `filename` in binary mode and return the unpickled object.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)

# Load the sentence list that the following cells index into
# (presumably the cached output of the commented-out PDF extraction loop — TODO confirm).
filename = "sentences.pkl"
sentences = load_list_from_pickle(filename)

In [5]:
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 2)

In [6]:
def retrieve_sentence_index(sentence, sentence_list):
    """Return the position of the entry in `sentence_list` most similar to `sentence`.

    Both sides are split on whitespace and compared token-by-token with
    ``difflib.SequenceMatcher``; the entry with the highest ``ratio()`` wins.
    When several entries share the highest ratio, the first one is returned.

    Args:
        sentence: Query string.
        sentence_list: Sequence of candidate strings.

    Returns:
        Integer index into `sentence_list`.

    Raises:
        ValueError: If `sentence_list` is empty (there is no maximum to take).
    """
    query_tokens = sentence.split()
    tokenized_candidates = [candidate.split() for candidate in sentence_list]

    scores = []
    for candidate_tokens in tokenized_candidates:
        matcher = difflib.SequenceMatcher(None, query_tokens, candidate_tokens)
        scores.append(matcher.ratio())

    # list.index returns the first position holding the best score.
    best_score = max(scores)
    return scores.index(best_score)

In [7]:
# For every annotated recommendation (second column of each split), look up the
# position of the most similar entry in `sentences`.
train_indices = [
    retrieve_sentence_index(rec_text, sentences)
    for rec_text in train.iloc[:, 1]
]
test_indices = [
    retrieve_sentence_index(rec_text, sentences)
    for rec_text in test.iloc[:, 1]
]

In [8]:
# Sanity check: one matched sentence index per recommendation row in each split.
len(train_indices), len(test_indices)

(100, 9)

In [9]:
# Positive class: the sentences matched to at least one annotated
# recommendation. set() removes indices matched by several recommendations.
train_recs = [sentences[idx] for idx in set(train_indices)]
test_recs = [sentences[idx] for idx in set(test_indices)]

# Negative class: sentences that were not matched to any recommendation and
# have more than 10 whitespace-separated tokens.
N_NON_RECS = 125
NON_REC_SEED = 42

rec_indices = set(train_indices) | set(test_indices)
eligible_indices = [
    idx for idx, sentence in enumerate(sentences)
    if idx not in rec_indices and len(sentence.split()) > 10
]
if len(eligible_indices) < N_NON_RECS:
    # Rejection sampling would loop forever in this situation; fail loudly.
    raise ValueError(
        f"Need {N_NON_RECS} non-recommendation sentences but only "
        f"{len(eligible_indices)} are eligible."
    )

# Sample WITHOUT replacement so that the same sentence cannot appear twice,
# which would otherwise allow an identical negative example to end up in both
# the training and the test slice. The fixed seed makes the draw repeatable.
rng = np.random.default_rng(NON_REC_SEED)
sampled_indices = rng.choice(eligible_indices, size=N_NON_RECS, replace=False)
non_recs = [sentences[int(idx)] for idx in sampled_indices]

In [10]:
# The first 110 sampled negatives are used for training, the remainder for testing.
train_non_recs, test_non_recs = non_recs[:110], non_recs[110:]

# Texts: negatives first, then positives.
train_texts = [*train_non_recs, *train_recs]
test_texts = [*test_non_recs, *test_recs]

# Labels aligned with the texts: 0 = non-recommendation, 1 = recommendation.
train_labels = [0 for _ in train_non_recs] + [1 for _ in train_recs]
test_labels = [0 for _ in test_non_recs] + [1 for _ in test_recs]

# Pair every text with its label.
train_combined = [(text, label) for text, label in zip(train_texts, train_labels)]
test_combined = [(text, label) for text, label in zip(test_texts, test_labels)]

# # Shuffle the combined lists
# random.shuffle(train_combined)
# random.shuffle(test_combined)

In [11]:
class RecsDataset(Dataset):
    """Torch dataset that tokenizes (text, label) pairs on access.

    Args:
        texts: Indexable collection of texts; each item is passed through str().
        labels: Indexable collection of labels aligned with `texts`.
        tokenizer: Object exposing ``encode_plus`` and returning a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``
            (used together with truncation and 'max_length' padding).
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of examples (the length of `texts`)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example `idx` and return it as a dict of tensors.

        Returns:
            Dict with 'input_ids' and 'attention_mask' (the tokenizer output
            flattened to 1-D) and 'labels' (a scalar long tensor).
        """
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer output is flattened to 1-D before it is returned.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Unzip the combined (text, label) pairs. They are NOT shuffled here (the
# shuffle is commented out); shuffling happens in train_test_split and in the
# training DataLoader.
texts, labels = zip(*train_combined)
test_texts, test_labels = zip(*test_combined)

# Split the data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Define the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Define the dataset and data loaders
train_dataset = RecsDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = RecsDataset(val_texts, val_labels, tokenizer, max_length=128)
test_dataset = RecsDataset(test_texts, test_labels, tokenizer, max_length=128)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-6)
loss_fn = torch.nn.CrossEntropyLoss()

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
epochs = 25

# Make sure the output directories exist before anything is written to them.
os.makedirs("logs", exist_ok=True)
os.makedirs("weights", exist_ok=True)
best_weights_path = os.path.join("weights", 'baseline1.pt')

# Start from +inf so the first epoch always produces a checkpoint.
best_val_loss = float('inf')

# The context manager closes the log file even if training raises.
with open(os.path.join("logs", "baseline_logger1.txt"), 'w') as f:
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        train_correct = 0

        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            # Labels are not passed to the model: the loss is computed
            # explicitly with loss_fn below, so the model's internal loss
            # would be redundant work.
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)

            loss = loss_fn(logits, labels)
            train_loss += loss.item()
            train_correct += (predicted == labels).sum().item()

            loss.backward()
            optimizer.step()

        # Accuracy is a percentage over examples; loss is averaged over batches.
        train_accuracy = 100.0 * train_correct / len(train_dataset)
        train_loss /= len(train_dataloader)

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        val_correct = 0

        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                _, predicted = torch.max(logits, 1)

                loss = loss_fn(logits, labels)
                val_loss += loss.item()
                val_correct += (predicted == labels).sum().item()

        val_accuracy = 100.0 * val_correct / len(val_dataset)
        val_loss /= len(val_dataloader)

        # Keep only the weights of the epoch with the lowest validation loss.
        if val_loss < best_val_loss:
            torch.save(model.state_dict(), best_weights_path)
            best_val_loss = val_loss

        f.write(f'Epoch {epoch + 1}/{epochs}\n')
        f.write(f'Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.2f}%\n')
        f.write(f'Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%\n')
        f.write('-------------------------------------------\n')

    # ---- test pass with the best checkpoint ----
    model.load_state_dict(torch.load(best_weights_path, map_location=device))
    model.eval()

    test_loss = 0.0

    all_predicted_labels = []
    all_true_labels = []

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)

            loss = loss_fn(logits, labels)
            test_loss += loss.item()

            all_predicted_labels.extend(predicted.cpu().numpy())
            all_true_labels.extend(labels.cpu().numpy())

    test_accuracy = accuracy_score(all_true_labels, all_predicted_labels) * 100
    test_precision = precision_score(all_true_labels, all_predicted_labels)
    test_recall = recall_score(all_true_labels, all_predicted_labels)

    test_loss /= len(test_dataloader)

    f.write('TESTING\n')
    # test_accuracy is already a percentage, so it is formatted with '.2f' plus
    # a literal '%'. The '%' format type would multiply it by 100 a second time.
    f.write(f'Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.2f}%\n')
    f.write(f'Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.2f}\n')
    f.write('-------------------------------------------\n')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

## Data Augmentation (Back-translation // TF-IDF Replacement)