Run the first cell and the cells under Import. If the data files are already saved, skip the cells under Generate.

If generating data, always run the cells under the Load header in Generate first, then run the cells under one of the other headers in Generate.

If you don't want to save the files to Google Drive, change `DATA_FOLDER` to a local path.

In [None]:
# Alternative configuration: the "common10" subset with co-occurrence embeddings.
# DATA_FILE = "common10"
# EMBEDDING_TYPE = "cooc_pocket_100"

# Evidence subset to use: "common10", "nonAnalytic", or any other value for all evidence.
DATA_FILE = "nonAnalytic"
# Suffix of the saved embeddings file that the Model section loads.
EMBEDDING_TYPE = "mpnet_abs"

# Folder that every generated file is written to and read from.
DATA_FOLDER = '/content/drive/MyDrive/debateData/'
# Common path prefix for the files that belong to the selected subset.
DATA_PATH =  DATA_FOLDER + DATA_FILE

# Import

In [None]:
from pathlib import Path

# Mount Google Drive only when the data folder lives on it (i.e. when running in Colab).
if DATA_FOLDER.startswith("/content/drive/MyDrive/"):
  from google.colab import drive
  from os import mkdir

  drive.mount('/content/drive', force_remount=True)

# Create the data folder for local paths as well, including any missing parents,
# so that the save calls in later cells do not fail on a fresh machine.
Path(DATA_FOLDER).mkdir(parents=True, exist_ok=True)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from scipy import sparse
import itertools

%matplotlib inline

# Generate

## Load

In [None]:
%%capture
%pip install datasets

In [None]:
from datasets import load_dataset
# Load the "train" split as a DataFrame indexed by the 'id' column.
all_evidence = load_dataset("Yusuf5/OpenCaselistTMP", split="train").to_pandas().set_index('id')

In [None]:
# Select the evidence subset named by DATA_FILE.
if DATA_FILE == "common10":
  # Cards whose duplicateCount is at least 10.
  common10_evidence = all_evidence[all_evidence.duplicateCount >= 10]
  evidence = common10_evidence
elif DATA_FILE == "nonAnalytic":
  # Cards with a positive textLength.
  nonAnalytic_evidence = all_evidence.loc[all_evidence.textLength > 0]
  evidence = nonAnalytic_evidence
else:
  evidence = all_evidence
# Keep the first card of every bucket; this row order is the order the embeddings are built in.
bucketCards = evidence.drop_duplicates("bucketId")
# Persist the bucket order so the Model section can map bucket ids to embedding rows.
bucketCards.bucketId.to_csv(DATA_PATH + "_bucketIds.csv")
print(f"{len(bucketCards)} Buckets Loaded")

570452 Buckets Loaded


## LI Labels

In [None]:
# Labels are computed over every card, regardless of DATA_FILE.
# NOTE: this rebinds `evidence`, replacing the subset chosen in the Load cell; re-run that
# cell before running the embedding sections, which read `evidence` and `bucketCards`.
evidence = all_evidence

In [None]:
from functools import reduce
def checkHeaders(data: pd.DataFrame, search: str, tag=False):
  """Flag the rows whose header columns contain `search`, ignoring case.

  Args:
    data: Frame with string columns 'pocket', 'hat' and 'block' (and 'tag' when `tag` is set).
    search: Text to look for. It is lowercased and passed to `Series.str.contains`,
      so it is still interpreted as a regular expression.
    tag: Also search the 'tag' column.

  Returns:
    Boolean Series aligned with `data`, True where any searched column matches.
  """
  search = search.lower()
  columns = ['pocket', 'hat', 'block'] + (['tag'] if tag else [])
  # na=False: a missing header counts as "no match" instead of propagating NaN,
  # which would leave a non-boolean mask that cannot be used to select rows.
  matches = [data[header].str.lower().str.contains(search, na=False) for header in columns]
  return reduce(lambda a, b: a | b, matches)

In [None]:
# Flag each card whose pocket/hat/block headers mention "link" or "impact".
links = checkHeaders(evidence, 'link')
impacts = checkHeaders(evidence, 'impact')
# One row per card: its bucket plus the two flags.
liInfo = pd.DataFrame({
  'bucketId': evidence.bucketId,
  'link': links,
  'impact': impacts
})

In [None]:
# Per bucket, count how many of its cards were flagged as link and as impact.
liCounts = liInfo.groupby('bucketId').sum()
# Buckets that were flagged as both at least once.
duplicate = liCounts.loc[(liCounts.link != 0) & (liCounts.impact != 0)]
# log2 of the link-to-impact count ratio: positive leans link, negative leans impact.
duplicateRatios = np.log2(duplicate.link / duplicate.impact)

In [None]:
# Bucket ids of every card flagged as link / impact (one entry per flagged card).
linkBuckets = liInfo[liInfo.link].bucketId.values
impactBuckets = liInfo[liInfo.impact].bucketId.values

# A bucket counts as a link if it was only ever flagged as link, or if its link flags
# outnumber its impact flags by more than 2:1 (log2 ratio > 1). Impacts mirror this rule.
realLinks = np.union1d(np.setdiff1d(linkBuckets, impactBuckets), duplicateRatios[duplicateRatios > 1].index)
realImpacts = np.union1d(np.setdiff1d(impactBuckets, linkBuckets), duplicateRatios[duplicateRatios < -1].index)

In [None]:
# Attach the bucket-level decision to every card of that bucket.
evidence['link'] = evidence.bucketId.isin(realLinks)
evidence['impact'] = evidence.bucketId.isin(realImpacts)

In [None]:
# Save one row per bucket (link, impact, bucketId); the Model section reads this file.
evidence.drop_duplicates("bucketId").reset_index()[['link', 'impact', 'bucketId']].to_feather(DATA_FOLDER + "all_li.feather")
print(f"Saved {len(realLinks)} links and {len(realImpacts)} impacts")

Saved 67503 links and 57079 impacts


## Co-occurrence Embeddings

In [None]:
# Deepest header level used to group cards into one co-occurrence window.
window_level = "block"
# Number of SVD components kept for the embeddings.
dimension = 100

In [None]:
# Bucket ids in row order (position -> bucketId).
bucketIds = bucketCards.reset_index().bucketId
# Reverse lookup (bucketId -> row position).
bucketIndexes = pd.Series(bucketIds.index, bucketIds.values)

In [None]:
from scipy import sparse
import itertools

levels = ['fileId', 'pocket', 'hat', 'block']
def build_co_occurrence():
  """Count how often two buckets appear in the same window.

  A window is one group of `evidence` rows sharing every header level from
  'fileId' down to `window_level`. Reads the notebook globals `evidence`,
  `bucketIds`, `bucketIndexes` and `window_level`.

  Returns:
    Symmetric CSR matrix of int32 counts, indexed by bucket row position.
  """
  size = len(bucketIds)
  counts = sparse.lil_matrix((size, size), dtype=np.int32)

  # Group by every header level from the file down to the chosen window level.
  window_columns = levels[:levels.index(window_level) + 1]
  for _, window in tqdm(evidence.groupby(window_columns)):
    # Sparse matrices can't really vectorize this
    window_indexes = bucketIndexes[window.bucketId]
    for pair in itertools.combinations(window_indexes, 2):
      counts[pair] += 1
  # Each pair was recorded once; mirror it so the matrix is symmetric.
  symmetric = counts + counts.T
  return symmetric.tocsr()

In [None]:
# Progress bar shows more iterations than it should
# Reuse the cached matrix when it exists; otherwise build it and cache it for next time.
try:
  co_occurence = sparse.load_npz(f'{DATA_PATH}_cooc_{window_level}.npz').tocsr()
except FileNotFoundError:
  co_occurence = build_co_occurrence()
  sparse.save_npz(f'{DATA_PATH}_cooc_{window_level}.npz', co_occurence)

In [None]:
def total_to_divideMatrix(totals: np.ndarray):
  """Build a diagonal matrix holding the reciprocal of each total.

  Left-multiplying by the result divides rows by their total; right-multiplying
  divides columns. Zero totals map to 0 instead of infinity.

  Args:
    totals: 1-D array of totals.

  Returns:
    Sparse diagonal matrix with 1 / totals on the diagonal.
  """
  totals = np.asarray(totals, dtype=np.float64)
  inverse = np.zeros_like(totals)
  # Only divide where the total is non-zero; the other entries keep their initial 0.
  np.divide(1.0, totals, out=inverse, where=totals != 0)
  return sparse.diags(inverse)

def build_ppmi(M: sparse.csr_matrix):
  """Compute the positive pointwise mutual information (PPMI) matrix of `M`.

  Rows are treated as words and columns as contexts:

    pmi_wc = log2(P(w, c) / (P(w) * P(c)))
           = log2((#(w, c) * total) / (#(w) * #(c)))

  Negative values are clamped to 0.

  Args:
    M: Sparse matrix of co-occurrence counts. It is not modified.

  Returns:
    Sparse CSR float matrix of PPMI values with zero entries removed.
  """
  # Work in float64: the co-occurrence counts are int32 and count * total overflows that range.
  counts = M.astype(np.float64).tocsr()

  # #(w) is a row sum and #(c) a column sum; they coincide only when M is symmetric.
  word_totals = np.asarray(counts.sum(axis=1)).ravel()
  context_totals = np.asarray(counts.sum(axis=0)).ravel()
  total = word_totals.sum()

  word_divider = total_to_divideMatrix(word_totals)
  context_divider = total_to_divideMatrix(context_totals)
  # Scale by total, then divide each row by its word total and each column by its context total.
  ratios = (word_divider @ (counts * total) @ context_divider).tocsr()

  # Clamping the ratio to 1 clamps the log to 0, which is the "positive" part of PPMI.
  ratios.data = np.log2(np.maximum(ratios.data, 1))
  ratios.eliminate_zeros()
  return ratios

In [None]:
# PPMI-weighted version of the co-occurrence counts; input to the SVD below.
ppmi = build_ppmi(co_occurence)

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# Takes a few minutes
n_iters = 10
# Reduce the PPMI matrix to `dimension` components per bucket.
svd = TruncatedSVD(n_components=dimension, n_iter=n_iters)
embeddings = svd.fit_transform(ppmi)
# Rescale each row with sklearn's `normalize` (default settings).
embeddings = normalize(embeddings)

In [None]:
# File name encodes the window level and dimension so it can be selected through EMBEDDING_TYPE.
np.save(f'{DATA_PATH}_embeddings_cooc_{window_level}_{dimension}', embeddings)

## Transformer Embeddings

### Install

In [None]:
%%capture
%pip install sentence-transformers

### Index

In [None]:
from sentence_transformers import SentenceTransformer
# Pretrained sentence encoder used to embed the card tags.
model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# The time estimate will be way too long for a while
# Encode one tag per bucket, in `bucketCards` row order, with normalized output vectors.
sbertEmbeddings = model.encode(list(bucketCards.tag), show_progress_bar=True, normalize_embeddings=True)

In [None]:
# Saved under the "mpnet_abs" suffix, which is the value EMBEDDING_TYPE must have to load it.
np.save(DATA_PATH + "_embeddings_mpnet_abs.npy", sbertEmbeddings)

# Model

In [None]:
# Training hyperparameters.
batch_size = 256      # samples per training batch
hidden_size = 200     # width of the classifier's hidden layer
learning_rate = 1e-3  # AdamW learning rate
epochs = 40           # number of train + validate passes

## Load Data

In [None]:
import torch
from torch.utils.data import DataLoader, random_split

In [None]:
# Load the embeddings selected by EMBEDDING_TYPE and the bucket order saved in the Load cell.
embeddings = np.load(f'{DATA_PATH}_embeddings_{EMBEDDING_TYPE}.npy')
bucketIds = pd.read_csv(DATA_PATH + '_bucketIds.csv').bucketId
# Reverse lookup (bucketId -> embedding row).
bucketIndexes = pd.Series(bucketIds.index, bucketIds.values)

In [None]:
# Link/impact flags for all buckets, indexed by bucket id.
liLabels = pd.read_feather(DATA_FOLDER + "all_li.feather").set_index("bucketId")
# Restrict and reorder to the buckets of the selected dataset.
liLabels = liLabels.loc[bucketIds]

In [None]:
# Bucket ids that carry a label (link or impact).
labeled = liLabels[liLabels.link | liLabels.impact].index
# Lookup from bucket id to its position among the labeled buckets.
labeledIndexes = pd.Series(np.arange(len(labeled)), labeled)
# Embedding rows of the labeled buckets, in `labeled` order.
inputs = torch.tensor(embeddings[bucketIndexes[labeled]], dtype=torch.float32)

In [None]:
# Class ids aligned with `inputs`: 0 = link, 1 = impact.
labels = torch.zeros((len(labeled)), dtype=torch.long)
labels[labeledIndexes[liLabels.index[liLabels.link]].values] = 0
labels[labeledIndexes[liLabels.index[liLabels.impact]].values] = 1

In [None]:
# Pair every input vector with its label and split 80/10/10 with a fixed seed.
data = list(zip(inputs, labels))
train_split, validate_split, test_split = random_split(data, [0.8, 0.1, 0.1], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True, pin_memory=True)
# Validation is run one sample at a time; no loader is created for `test_split` here.
validate_loader = DataLoader(validate_split, batch_size=1)
# Notebook display of the input tensor's shape.
inputs.shape

## Model

In [None]:
import torch.nn as nn

class Classifier(nn.Module):
  """Feed-forward classifier with one hidden layer that outputs log-probabilities.

  Args:
    input_size: Number of features per input vector.
    hidden_size: Width of the hidden layer.
    num_classes: Number of output classes.
    dropout: Dropout probability applied after the hidden activation (default 0.75).
  """
  def __init__(self, input_size, hidden_size, num_classes, dropout=0.75):
    super().__init__()

    self.hidden_size = hidden_size
    # Layer order is unchanged, so the parameter names (model.0.*, model.3.*) stay the same.
    self.model = nn.Sequential(
      nn.Linear(input_size, hidden_size),
      nn.ReLU(),
      nn.Dropout(dropout),
      nn.Linear(hidden_size, num_classes),
      # Log-probabilities over dim 1, the form expected by nll_loss.
      nn.LogSoftmax(dim=1)
    )

  def forward(self, x):
    """Return log-probabilities of shape (batch, num_classes) for a (batch, input_size) input."""
    return self.model(x)

## Setup

In [None]:
import torch.optim as optim
import torch.nn.functional as F

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Per-epoch average losses, appended to by train_epoch and validate and plotted at the end.
train_loss_history = []
validate_loss_history = []

In [None]:
def train_epoch(model: nn.Module, optimizer: optim.Optimizer, epoch: int, log_freq=0):
  """Train `model` for one pass over `train_loader`.

  Appends the epoch's average per-sample loss to `train_loss_history`.

  Args:
    model: Network that outputs log-probabilities.
    optimizer: Optimizer that updates the parameters of `model`.
    epoch: Epoch number, used only in the progress messages.
    log_freq: Print progress every `log_freq` batches; 0 disables logging.
  """
  model.train()
  total_samples = len(train_loader.dataset)
  train_loss = 0
  seen = 0
  for i, (data, target) in enumerate(train_loader):
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()

    outputs = model(data)

    loss = F.nll_loss(outputs, target)
    loss.backward()
    optimizer.step()

    # nll_loss averages over the batch; weight by batch size so the epoch mean is per sample.
    train_loss += loss.item() * len(data)
    # Count samples directly: the last batch may be smaller than the others.
    seen += len(data)

    if log_freq and (i + 1) % log_freq == 0:
      percentage = 100 * seen / total_samples
      print(f'Epoch: {epoch} [{seen}/{total_samples} ({percentage:.3f}%)], Loss: {loss.item():.6f}')
  train_loss_history.append(train_loss / total_samples)

In [None]:
def validate(model: nn.Module, epoch: int):
  """Evaluate `model` on `validate_loader` and print the loss and accuracy.

  Appends the average per-sample loss to `validate_loss_history`.

  Args:
    model: Network that outputs log-probabilities.
    epoch: Epoch number, used only in the printed summary.
  """
  model.eval()
  total_samples = len(validate_loader.dataset)
  validate_loss = 0
  correct = 0
  with torch.no_grad():
    for data, target in validate_loader:
      data, target = data.to(device), target.to(device)
      output = model(data)
      # Sum the per-sample losses so the average below is right for any batch size.
      validate_loss += F.nll_loss(output, target, reduction='sum').item()
      # Keep predictions 1-D like `target`; a (B, 1) vs (B,) comparison would broadcast to (B, B).
      prediction = output.argmax(dim=1)
      correct += prediction.eq(target).sum().item()

  validate_loss /= total_samples
  validate_loss_history.append(validate_loss)

  print(f"Validate set (Epoch {epoch}): Average Loss {validate_loss:.4f}, Accuracy: {correct}/{total_samples} ({100 * correct / total_samples:.1f}%)")

## Run

In [None]:
# Two-class classifier (link vs impact) over the embedding dimension, trained with AdamW.
model = Classifier(inputs.shape[1], hidden_size, 2).to(device)
adamW = optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Start from empty histories so re-running this cell does not mix runs in the plot.
train_loss_history = []
validate_loss_history = []
# Use the same 1-based epoch number for training and validation messages.
for epoch in range(1, epochs + 1):
  train_epoch(model, adamW, epoch)
  validate(model, epoch)

In [None]:
# Plot the per-epoch training and validation losses on one figure.
plt.close()
plt.plot(train_loss_history, label='train')
plt.plot(validate_loss_history, label='validate')
plt.legend()
plt.show()