In [None]:
# imports
from torch.utils.data import Dataset
import os.path
from pathlib import Path
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import json
import abc
from sklearn import svm
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import gensim
from gensim.models import Word2Vec
from sklearn.metrics import mean_absolute_error
from torch.utils.data import DataLoader, WeightedRandomSampler
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.svm import SVR
import xgboost as xgb
from torch.optim.lr_scheduler import StepLR
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import label_binarize
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics import f1_score
import seaborn as sns
from imblearn.over_sampling import SMOTE

# Whole Text Embeddings

In [None]:
class BaseClass(metaclass=abc.ABCMeta):
    """Locate the transcript text files stored under a data directory.

    Attributes:
        transcript_dir: ``<data_dir>/CCM_Transcript_Text_Data``.
        file_list: Sorted string paths of every regular file found
            recursively below ``transcript_dir``.
    """

    def __init__(self, data_dir):
        self.transcript_dir = os.path.join(data_dir, "CCM_Transcript_Text_Data")

        # Walk the directory tree and keep regular files only (directories are skipped).
        transcript_paths = []
        for entry in Path(self.transcript_dir).glob("**/*"):
            if entry.is_file():
                transcript_paths.append(str(entry))

        # Sorting makes the file order deterministic across runs.
        transcript_paths.sort()
        self.file_list = transcript_paths

# Name of the pretrained checkpoint shared by the tokenizer and the encoder below.
model_name = 'roberta-base'
# Tokenizer and encoder are loaded from the same checkpoint name so they match.
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)
# Index the transcript files found under "data/CCM_Transcript_Text_Data".
baseInfo = BaseClass("data")

In [None]:
class CustomTrainDataset(Dataset):
    """Dataset that reads one text file from disk per requested item.

    Args:
        data: Indexable collection of file paths. Each path is opened as
            UTF-8 text when its item is requested.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of file paths held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the full text content of the file at position ``idx``."""
        # The context manager closes the handle immediately; a bare
        # open(...).read() leaves one unclosed file per item.
        with open(self.data[idx], "r", encoding="utf-8") as transcript:
            return transcript.read()
    
class CustomTestDataset(Dataset):
    """Dataset that reads one text file from disk per requested item.

    Args:
        data: Indexable collection of file paths. Each path is opened as
            UTF-8 text when its item is requested.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of file paths held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the full text content of the file at position ``idx``."""
        # The context manager closes the handle immediately; a bare
        # open(...).read() leaves one unclosed file per item.
        with open(self.data[idx], "r", encoding="utf-8") as transcript:
            return transcript.read()

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings (assumed ``(batch, tokens, hidden)`` — confirm
            against the model in use).
        attention_mask: Mask with one entry per token; positions with 0 do
            not contribute to the average.

    Returns:
        Tensor of masked sums along dimension 1 divided by the per-row mask
        totals. The divisor is clamped to at least 1e-9, so a row whose mask is
        entirely zero yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

# Load the label dictionary; its keys are looked up below by transcript file name.
json_file_path = "text_labels_dict.json"
with open(json_file_path, 'r') as json_file:
    labels = json.load(json_file)

# Keep only transcripts that have a label, so x_files and y stay index-aligned.
x_files = []
y = []
for file in baseInfo.file_list:
    # Build the lookup key from the path relative to the transcript directory.
    # Stripping a hard-coded "data\\CCM_Transcript_Text_Data\\" prefix only
    # works with Windows separators; elsewhere nothing is stripped, no key
    # matches, and the dataset silently ends up empty.
    x_file_name = os.path.relpath(file, baseInfo.transcript_dir)
    if x_file_name in labels:
        x_files.append(file)
        y.append(labels[x_file_name])

# shuffle=False keeps batches in x_files order, so embeddings line up with y.
data_loader = DataLoader(CustomTrainDataset(x_files), batch_size=128, shuffle=False)

In [None]:
# Encode each labelled transcript as a single mean-pooled vector, batch by batch.
x_embeddings = []
model.eval()  # Put model in evaluation mode
with torch.no_grad():  # inference only, no gradients needed
    for data in data_loader:
        # Tokenize the batch of texts; inputs longer than 512 tokens are truncated.
        inputs = tokenizer(data, padding=True, truncation=True, return_tensors='pt', max_length=512)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Get raw token embeddings from the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Apply mean pooling to get one embedding per text
        embeddings = mean_pooling(outputs, attention_mask)
        x_embeddings.append(embeddings)

# Stack the per-batch results into one tensor along the batch dimension.
x_embeddings = torch.cat(x_embeddings, dim=0)

X_np = x_embeddings.numpy()
# y is a plain Python list (built next to x_files), so it has no .numpy();
# convert it through NumPy instead.
y_np = np.asarray(y)
np.savez('X-y-whole-text-embeddings.npz', X=X_np, y=y_np)

# Sequence Embeddings

In [None]:
# Find the largest number of lines in any labelled transcript.
# Despite its name, max_line_length is a line COUNT, not a character length.
line_counts = [0]  # seeded with 0 so an empty x_files still yields 0
for textfile in x_files:
    with open(textfile, 'r', encoding='utf-8') as file:
        line_counts.append(sum(1 for _ in file))
max_line_length = max(line_counts)
print("max line length: ", max_line_length)

# One slot per labelled transcript, holding one embedding per line and
# zero-padded up to the longest transcript (max_line_length lines).
# The embedding width comes from the loaded model rather than a literal 768.
X = torch.zeros(len(y), max_line_length, model.config.hidden_size)
model.eval()
with torch.no_grad():  # inference only, no gradients needed
    for currIndex, data in enumerate(x_files):
        # Read with the same UTF-8 decoding used when counting lines, and
        # close the file as soon as its lines are collected.
        with open(data, 'r', encoding='utf-8') as transcript:
            sentence_split_data = [line.strip() for line in transcript]

        # A file with no lines has nothing to encode; its slot stays all zeros.
        if not sentence_split_data:
            continue

        # Each line is tokenized as its own sequence (truncated to 512 tokens).
        inputs = tokenizer(sentence_split_data, padding=True, truncation=True, return_tensors='pt', max_length=512)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Get raw token embeddings from the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Apply mean pooling to get one embedding per line
        embeddings = mean_pooling(outputs, attention_mask)

        print("number of lines in e: ", embeddings.shape[0])

        # Write the line embeddings into the leading rows; the remaining rows
        # keep their initial zeros and act as padding.
        X[currIndex, :embeddings.shape[0]] = embeddings

X = X.numpy()
# Save the sequence tensor built above (not the whole-text X_np), with the
# labels converted from the Python list y.
np.savez('X-y-sequential-text-embeddings.npz', X=X, y=np.asarray(y))