In [None]:
# Mount Google Drive into the Colab filesystem so that the CSV files under
# /content/drive/MyDrive can be read by the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
nltk.download('punkt')

import numpy as np
import pandas as pd
import re

from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize

from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
import torch.optim as optim


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the pretrained SciBERT (scivocab, uncased) tokenizer and encoder.
# NOTE: the name `model` is rebound to the LSTM classifier in the training cell
# further down, so the embedding-extraction cells must run before that cell.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

In [None]:
# Read the train / validation / test splits from the mounted Drive folder.
# Later cells use the "string", "sectionName" and "label" columns of each frame.
train_df = pd.read_csv("/content/drive/MyDrive/archive/train.csv")
val_df = pd.read_csv("/content/drive/MyDrive/archive/validation.csv")
test_df = pd.read_csv("/content/drive/MyDrive/archive/test.csv")

In [None]:
def vectorise(texts, w2v_model, max_length=40):
    """Turn tokenised texts into a fixed-size block of word vectors.

    Each text is truncated to ``max_length`` tokens. Tokens that are not in
    the model's vocabulary, and positions past the end of a short text, are
    represented by zero vectors.

    Args:
        texts: Iterable of token sequences; each one must support slicing
            (e.g. a list of strings). NOTE(review): a raw string would be
            walked character by character -- callers presumably pass
            already-tokenised text; confirm.
        w2v_model: Object exposing ``vector_size``, ``index_to_key`` and
            ``w2v_model[word]`` lookup (e.g. gensim ``KeyedVectors``).
        max_length: Number of token positions kept per text.

    Returns:
        ``np.ndarray`` of shape ``(number_of_texts, max_length, vector_size)``.
        An empty ``texts`` yields an empty array with that same 3-D layout,
        i.e. shape ``(0, max_length, vector_size)``.
    """
    vector_size = w2v_model.vector_size
    vocabulary = set(w2v_model.index_to_key)

    # A single shared zero vector stands in for unknown words and padding;
    # np.array() below copies the data, so sharing the object is safe.
    zero_vector = np.zeros(vector_size)

    texts_vec = []
    for text in texts:
        # Look up every token of the (truncated) text, falling back to zeros
        sentence_vectors = [
            w2v_model[word] if word in vocabulary else zero_vector
            for word in text[:max_length]
        ]
        # Pad short texts with zero vectors up to max_length
        sentence_vectors.extend([zero_vector] * (max_length - len(sentence_vectors)))
        texts_vec.append(sentence_vectors)

    # The explicit reshape is a no-op for regular input and pins the 3-D
    # layout when there are no texts (np.array([]) alone would be 1-D).
    return np.array(texts_vec).reshape(len(texts_vec), max_length, vector_size)

In [None]:
def process_strings(strings):
    """Clean one text string and split it into lowercase tokens.

    Steps:
      1. Remove bracketed groups that contain only digits, commas and spaces
         (numeric citation markers such as ``[3]`` or ``[1, 2]``).
      2. If the text starts with a literal ``...``, insert a space after it.
      3. Lowercase the text and tokenise it with ``word_tokenize``.

    Args:
        strings: A single text string (despite the plural parameter name).

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    strings = re.sub(r'\[[0-9, ]*\]', '', strings)
    # The dots must be escaped: an unescaped `^...` matches ANY three leading
    # characters and would overwrite the start of every string with "... ".
    strings = re.sub(r'^\.\.\.', '... ', strings)
    return word_tokenize(strings.lower())

def process_names(sectionNames):
    """Lowercase section names and strip a leading section number.

    A run of two or more digits/dots at the very start of a name (e.g. "1.",
    "2.3", "10") is removed. A single leading digit is left untouched, and any
    whitespace that followed the number is kept.

    Args:
        sectionNames: Iterable of section-name strings.

    Returns:
        A new list with the cleaned names, in input order.
    """
    # No per-item printing: this runs over whole dataset columns, and echoing
    # every name would flood the cell output.
    return [re.sub(r'^[0-9.]{2,}', '', case.lower()) for case in sectionNames]

In [None]:
def preprocess_sectionName(sectionName):
    """Collapse a free-form section heading into one canonical section label.

    The input is converted with ``str()`` and lowercased, then checked against
    an ordered list of keyword groups; the first group with a keyword that
    occurs anywhere in the heading decides the label. Headings matching no
    group (including stringified missing values such as "nan") become
    "unspecified".

    Args:
        sectionName: Raw section heading; any object is accepted.

    Returns:
        One of the canonical labels, e.g. "introduction", "results", "method".
    """
    heading = str(sectionName).lower()

    # Order matters: earlier rules take precedence when several keywords occur
    # (e.g. "results and discussion" is labelled "results").
    rules = (
        (("introduction", "preliminaries"), "introduction"),
        (("result", "finding"), "results"),
        (("method", "approach"), "method"),
        (("discussion",), "discussion"),
        (("background",), "background"),
        (("experiment", "setup", "set-up", "set up"), "experiment"),
        (("related work", "relatedwork", "prior work", "literature review"), "related work"),
        (("evaluation",), "evaluation"),
        (("implementation",), "implementation"),
        (("conclusion",), "conclusion"),
        (("limitation",), "limitation"),
        (("appendix",), "appendix"),
        # future-work / extension sections are folded into "appendix"
        (("future work", "extension"), "appendix"),
        (("analysis",), "analysis"),
    )

    for keywords, label in rules:
        if any(keyword in heading for keyword in keywords):
            return label
    return "unspecified"

In [None]:
# Replace every raw section heading with its canonical label from
# preprocess_sectionName. The columns are overwritten, so the original
# headings are no longer available in these frames afterwards.
train_df["sectionName"] = train_df["sectionName"].apply(preprocess_sectionName)
val_df["sectionName"] = val_df["sectionName"].apply(preprocess_sectionName)
test_df["sectionName"] = test_df["sectionName"].apply(preprocess_sectionName)

In [None]:
# Integer id for each canonical section label returned by preprocess_sectionName
# (one entry per label, including the "unspecified" fallback).
sec_name_mapping = {"discussion": 0, "introduction": 1, "unspecified": 2, "method": 3,
                    "results": 4, "experiment": 5, "background": 6, "implementation": 7,
                    "related work": 8, "analysis": 9, "conclusion": 10, "evaluation": 11,
                    "appendix": 12, "limitation": 13}

In [None]:
# Keep only the three columns used below: the text, its section label and the target.
train_df_clean = train_df[["string", "sectionName", "label"]]
val_df_clean = val_df[["string", "sectionName", "label"]]
test_df_clean = test_df[["string", "sectionName", "label"]]

In [None]:
# For each split build:
#   *_sec    -- section labels converted to integer ids via sec_name_mapping
#               (raises KeyError for a label missing from the mapping)
#   *_text   -- the raw text column (pandas Series)
#   *_labels -- the target column (pandas Series)
# NOTE(review): the *_sec lists are not referenced again in the visible part
# of the notebook, and the *_text names are later rebound to embedding tensors.
train_sec = train_df_clean['sectionName'].tolist()
train_sec = [sec_name_mapping[label] for label in train_sec]
train_text = train_df_clean['string']
train_labels = train_df_clean["label"]

val_sec = val_df_clean['sectionName'].tolist()
val_sec = [sec_name_mapping[label] for label in val_sec]
val_text = val_df_clean['string']
val_labels = val_df_clean["label"]

test_sec = test_df_clean['sectionName'].tolist()
test_sec = [sec_name_mapping[label] for label in test_sec]
test_text = test_df_clean['string']
test_labels = test_df_clean["label"]

In [None]:
# Token length every text is padded / truncated to by the tokenizer calls below.
max_seq_len = 75

In [None]:
# All three splits are encoded with identical settings: every text is padded
# or truncated to exactly max_seq_len tokens, and token type ids are omitted
# from the returned encoding.

# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length = max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

In [None]:
import torch
import torch.nn as nn

In [None]:
# Convert the tokenizer output (token ids and attention masks) of each split
# into tensors so they can be sliced into batches and fed to the encoder.
train_ids = torch.tensor(tokens_train["input_ids"])
train_masks = torch.tensor(tokens_train["attention_mask"])

val_ids = torch.tensor(tokens_val["input_ids"])
val_masks = torch.tensor(tokens_val["attention_mask"])

test_ids = torch.tensor(tokens_test["input_ids"])
test_masks = torch.tensor(tokens_test["attention_mask"])

In [None]:
# Batch size for the encoder forward passes. NOTE: `batch_size` is reassigned
# to 32 in the DataLoader cell further down, and `model` must still be the
# SciBERT encoder when this cell runs (it is rebound to the LSTM later).
batch_size = 256

total_samples = len(train_ids)

# Collects one last_hidden_state tensor per batch
all_embeddings = []

# Process the dataset in batches; no_grad() because the encoder is only used
# for feature extraction here and no gradients are needed
with torch.no_grad():
    for start_idx in range(0, total_samples, batch_size):
        print(start_idx)  # progress indicator: index of the first sample of the batch
        end_idx = min(start_idx + batch_size, total_samples)
        batch_ids = train_ids[start_idx:end_idx]
        batch_masks = train_masks[start_idx:end_idx]

        # Get embeddings for the current batch (one vector per token position)
        batch_embeddings = model(batch_ids, attention_mask=batch_masks).last_hidden_state

        # Append batch embeddings to the list
        all_embeddings.append(batch_embeddings)

# Concatenate embeddings from all batches along the batch dimension
# (presumably (n_samples, max_seq_len, hidden_size) -- TODO confirm)
train_embeddings = torch.cat(all_embeddings, dim=0)

In [None]:
# Same extraction as for the training split, applied to the validation split.
# Uses whatever `batch_size` is currently defined in the kernel.
total_samples = len(val_ids)

# Collects one last_hidden_state tensor per batch
all_embeddings = []

# Process the dataset in batches, without tracking gradients
with torch.no_grad():
    for start_idx in range(0, total_samples, batch_size):
        end_idx = min(start_idx + batch_size, total_samples)
        batch_ids = val_ids[start_idx:end_idx]
        batch_masks = val_masks[start_idx:end_idx]

        # Get embeddings for the current batch (one vector per token position)
        batch_embeddings = model(batch_ids, attention_mask=batch_masks).last_hidden_state

        # Append batch embeddings to the list
        all_embeddings.append(batch_embeddings)

# Concatenate embeddings from all batches along the batch dimension
val_embeddings = torch.cat(all_embeddings, dim=0)

In [None]:
# Same extraction as for the training split, applied to the test split.
# Uses whatever `batch_size` is currently defined in the kernel.
total_samples = len(test_ids)

# Collects one last_hidden_state tensor per batch
all_embeddings = []

# Process the dataset in batches, without tracking gradients
with torch.no_grad():
    for start_idx in range(0, total_samples, batch_size):
        end_idx = min(start_idx + batch_size, total_samples)
        batch_ids = test_ids[start_idx:end_idx]
        batch_masks = test_masks[start_idx:end_idx]

        # Get embeddings for the current batch (one vector per token position)
        batch_embeddings = model(batch_ids, attention_mask=batch_masks).last_hidden_state

        # Append batch embeddings to the list
        all_embeddings.append(batch_embeddings)

# Concatenate embeddings from all batches along the batch dimension
test_embeddings = torch.cat(all_embeddings, dim=0)

In [None]:
# Sanity check: show the raw training texts.
print(train_df_clean['string'])

In [None]:
# Sanity check: `train_text` is still the pandas Series of raw texts at this
# point; the next cell rebinds the name to the embedding tensor.
print(train_text)

In [None]:
# Build the model inputs (float32 embeddings) and targets (label tensors).
#
# The embeddings are already tensors, so they are converted with
# .detach().to(...) instead of torch.tensor(...): copy-constructing a tensor
# from a tensor emits a UserWarning and duplicates the (large) embedding
# arrays in memory. .detach() guarantees no autograd history is carried along
# and .to() only copies when the dtype actually differs.
# NOTE: from here on *_text names hold embedding tensors, not the raw strings.

# for train set
train_text = train_embeddings.detach().to(torch.float32)
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_text = val_embeddings.detach().to(torch.float32)
val_y = torch.tensor(val_labels.tolist())

# for test set
test_text = test_embeddings.detach().to(torch.float32)
test_y = torch.tensor(test_labels.tolist())

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# batch size used for training / validation
# (this overwrites the value 256 used by the embedding-extraction cells)
batch_size = 32

# pair each training embedding with its label
train_data = TensorDataset(train_text, train_y)

# random sampler: training batches are drawn in shuffled order
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# pair each validation embedding with its label
val_data = TensorDataset(val_text, val_y)

# sequential sampler: validation batches are visited in dataset order
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [None]:
class LSTM(nn.Module):
    """LSTM sequence classifier with attention pooling over time steps.

    The input sequence is run through a (unidirectional) ``nn.LSTM``; a linear
    layer assigns one score to each time step, the scores are normalised with
    a softmax over the sequence dimension, and the weighted sum of the LSTM
    outputs is passed to a linear classification layer.

    Args:
        input_size: Feature size of each time step of the input.
        hidden_size: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes (size of the returned logits).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        # batch_first=True: inputs/outputs are laid out (batch, seq_len, features)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for input x.

        ``x`` is expected as (batch, seq_len, input_size). All time steps take
        part in the attention; no padding mask is applied.
        """
        # Use the layer registered in __init__ (`self.lstm`); the previous
        # `self.bilstm` attribute was never defined and raised AttributeError.
        out, _ = self.lstm(x)

        # One scalar score per time step -> (batch, seq_len)
        attention_scores = self.attention(out).squeeze(-1)

        # torch.softmax avoids depending on `F`, which is only imported in a
        # later cell of the notebook.
        attention_weights = torch.softmax(attention_scores, dim=1)

        # Attention-weighted sum over the sequence dimension -> (batch, hidden_size)
        weighted_sum = torch.sum(out * attention_weights.unsqueeze(-1), dim=1)

        return self.fc(weighted_sum)

# Hyperparameters passed to the LSTM classifier constructor
input_size = 768  # Feature size per token; must match the embedding width fed to the LSTM (presumably SciBERT's hidden size -- confirm)
hidden_size = 128  # Number of neurons in each LSTM layer
num_layers = 1  # Number of LSTM layers
num_classes = 3  # Number of output classes

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

def train(model, train_loader, criterion, optimizer):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Module to optimise; switched to training mode.
        train_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.

    Returns:
        Sum of the per-batch loss values divided by ``len(train_loader)``.
    """
    model.train()
    batch_losses = []
    for inputs, labels in train_loader:
        # Clear gradients left over from the previous step before backprop
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses, 0.0) / len(train_loader)

def evaluate(model, val_loader, criterion):
    """Compute mean batch loss and accuracy over a labelled data loader.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        val_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the summed per-batch loss divided by
        ``len(val_loader)``, and the fraction of samples whose highest-scoring
        class equals the label.
    """
    model.eval()
    loss_total = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            logits = model(inputs)
            loss_total += criterion(logits, labels).item()
            # Index of the highest score along the class dimension
            predicted = torch.max(logits, 1)[1]
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()
    mean_loss = loss_total / len(val_loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

def predict(model, test_loader):
    """Collect predicted class indices for every batch from ``test_loader``.

    Each batch may be a bare input tensor, or a tuple/list whose first element
    is the input tensor -- which is what a ``DataLoader`` over a
    ``TensorDataset`` yields, with or without labels. Elements after the first
    (e.g. labels) are ignored.

    Args:
        model: Module mapping an input batch to per-class scores; switched to
            evaluation mode.
        test_loader: Iterable of batches as described above.

    Returns:
        Flat list with one predicted class index per sample, in loader order.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            # Unwrap (inputs,) / (inputs, labels) batches; passing the whole
            # list to the model would fail.
            inputs = batch[0] if isinstance(batch, (list, tuple)) else batch
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
    return predictions

# Build the classifier and its training setup.
# NOTE: this rebinds `model` -- from here on it is the LSTM classifier, no
# longer the SciBERT encoder loaded at the top of the notebook.
model = LSTM(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 5

# One pass over the training data per epoch, followed by a validation pass;
# the per-epoch losses and validation accuracy are printed for monitoring.
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, val_dataloader, criterion)
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

In [None]:
def predict(model, inputs):
    """Predict a class index for every row of ``inputs`` in a single forward pass.

    Defining this function replaces any earlier ``predict`` in the kernel
    namespace; this version takes the input tensor directly rather than a
    data loader.

    Args:
        model: Module mapping ``inputs`` to per-class scores; switched to
            evaluation mode.
        inputs: Tensor holding the whole batch to classify.

    Returns:
        NumPy array with the index of the highest-scoring class per sample.
    """
    model.eval()
    with torch.no_grad():
        # torch.max over dim 1 returns (values, indices); keep the indices
        predicted = torch.max(model(inputs), 1)[1]
    return predicted.cpu().numpy()

In [None]:
# Classify the whole test set in one forward pass of the trained LSTM.
preds = predict(model, test_text)

In [None]:
# Model performance on the test set: per-class precision / recall / F1,
# followed by the micro-averaged F1 score.
# `preds` already holds class indices, so no argmax is needed here.
print(classification_report(test_y, preds))

micro_f1_score = f1_score(test_y, preds, average='micro')
print(micro_f1_score)