In [None]:
#This is the notebook to run the ASC pipeline proposed in the thesis report
#The base architecture and many functions are adapted from the following tutorials:
#https://www.youtube.com/watch?v=c_nCjlSB1Zk
#https://www.youtube.com/watch?v=ogEalPMUCSY


Load Dependencies

In [None]:
!pip install langchain
!pip install InstructorEmbedding
!pip install faiss-cpu
!pip install sentence_transformers
!pip install bitsandbytes
!pip install bert-score
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
!pip install torchmetrics
!pip install accelerate
!pip install peft

Import Packages

In [None]:
import csv
import os
import pickle
import time

import accelerate
import bert_score
import bitsandbytes as bnb
import faiss
import nltk
import pandas as pd
import peft
import torch
from bert_score import score
from InstructorEmbedding import INSTRUCTOR
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from pandas import DataFrame
from peft import (PeftConfig,PeftModel)
from torchmetrics.text import TranslationEditRate
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from transformers import pipeline

Load basemodel

In [None]:
#Base-model variant of the pipeline: skip this cell when the fine-tuned model is used instead.
#The plain 'orca-2-7b' model is loaded through the llamacpp module because it provides efficient and fast inference.

#streams the generated tokens to stdout while the model is producing them
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

#all LlamaCpp settings collected in one place so they are easy to tune
llamacpp_settings = dict(
    model_path='model_path',  # replace with the path to the model file
    temperature=0.5,
    max_tokens=512,
    top_p=1,
    n_gpu_layers=60,
    n_batch=512,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llm = LlamaCpp(**llamacpp_settings)

 Load finetuned model

In [None]:
#skip this if using the basemodel
#the finetuned model cannot be loaded through llamacpp because it does not support the file type, the models have to be in gguf format.
#the finetuned model could be converted to gguf, but that is a time-consuming and intensive process; huggingface provides
#functions to load the adapter model directly, which is what is done below.
#`pipeline` (transformers) and `HuggingFacePipeline` (langchain.llms) come from the import cell at the top of the notebook.


PEFT_MODEL =  '/content/drive/MyDrive/thesis data/orca 2 7b fintuned 1000 chunk' #provide path of the saved finetuned model from the finetuning notebook

#create bnb config to load model in 4bit quantization to reduce computational load
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#load base model
config = PeftConfig.from_pretrained(PEFT_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path, #fetches the base model from the config file of the adapter model provided before, which is 'orca-2-7b' in this case
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
#load tokenizer for pipeline
tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token #reuse the end-of-sequence token as padding token

#combine base orca model with the adapter layers  from fine-tuning
model = PeftModel.from_pretrained(model, PEFT_MODEL)

#create pipeline for direct inference usage
#max_new_tokens limits only the GENERATED tokens. The previous max_length=100 counted prompt + generated tokens,
#and the prompt (instructions + subtitles + retrieved context chunk) is already longer than that,
#which left practically no room for the corrected subtitle passage.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100
)
#wrap the pipeline so it can be used as the llm of a langchain chain
llm = HuggingFacePipeline(pipeline=pipe)

Preparation of British Academic Written English Corpus (BAWE)

In [None]:
#The British Academic Written English corpus (BAWE) is used as a knowledge base for this pipeline.
#To reduce computational load only the relevant files are used, i.e. all files from the disciplines:
#Linguistics, Biological Sciences, Computer Science, Cybernetics Electronic Engineering, Engineering, Mathematics, Physics, Planning
#(these are the relevant disciplines for this study, to aid in the process with CSAI lectures).
#At first a folder called 'RELEVANT_CORPUS_XML' has to be created manually.
#It should contain all xml files of the aforementioned disciplines, which can easily be collected
#from the structured folder 'CORPUS_ByDisc' after downloading the BAWE.

directory = "RELEVANT_CORPUS_XML/" #provide path to the manually created 'RELEVANT_CORPUS_XML' folder

#identification codes of the relevant files, used to filter CORPUS_TXT which is not structured by discipline.
#The code is always the 5 characters in front of the 4-character file extension; only the file NAME is needed,
#so the files themselves are not opened.
relevantfilenames = sorted({filename[-9:-4] for filename in os.listdir(directory)})
relevant_ids = set(relevantfilenames) #set for fast membership tests in the loop below


#convert data(txt files of BAWE) into documents list

#define text_splitter to chunk the BAWE into bits of predefined size
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500, #set to 500 characters due to context length of 512 token
    chunk_overlap  = 20, #some overlap to preserve more coherence
    length_function = len,
    is_separator_regex = False,
)
#provide directory to "CORPUS_TXT" folder of BAWE Corpus
directory = "CORPUS_TXT/"

docs = [] #list which will hold all chunks from the BAWE

#iterate over the txt folder and use only relevant files by comparing the identification codes,
#then split the files into chunks and append them to the docs list.
#The listing is sorted so that the order of the chunks is the same on every run (os.listdir order is arbitrary).
for filename in sorted(os.listdir(directory)):
    if filename[:-4] not in relevant_ids: #file name without its 4-character extension is the identification code
        continue
    with open(os.path.join(directory, filename), encoding="utf8") as f:
        curtext = f.read()
    print(f"{filename} works")
    docs.extend(text_splitter.create_documents([curtext]))


In [None]:
#store chunked knowledge base txts from BAWE
#The file is written to the same location the "retrieve Corpus txt list" cell reads from, so that running the
#notebook top to bottom loads exactly the chunks that were just created (the relative path used before pointed
#to the current working directory instead).
with open("/content/drive/MyDrive/thesis data/WHOLE_RELEVANT_CORPUS_TXT", "wb") as fp:   #Pickling
    pickle.dump(docs, fp)

In [None]:
#read the pickled list of BAWE chunks back into memory
#NOTE: pickle.load can execute code contained in the file - only load pickles you created yourself
corpus_pickle_path = "/content/drive/MyDrive/thesis data/WHOLE_RELEVANT_CORPUS_TXT"
with open(corpus_pickle_path, "rb") as corpus_file:   # Unpickling
    documents = pickle.load(corpus_file)

Vectorization of knowledge base


In [None]:
#to turn the chunks of BAWE into dense vector embeddings, an instructor model has to be loaded which does that
#(HuggingFaceInstructEmbeddings is imported in the import cell at the top of the notebook)

#run the embedding model on the GPU when one is available, otherwise fall back to the CPU instead of crashing
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", #this is an opensource instructor model which yields good performance
                                                      model_kwargs={"device": embedding_device})

In [None]:
#directory in which the pickled vector store is kept; it is passed as `path` to both
#store_embeddings (writing) and load_embeddings (reading) further down in this notebook
Embedding_store_path = "/content/drive/MyDrive/thesis data/Embedding_store"

In [None]:
#define function which calculates embeddings and stores them
def store_embeddings(docs, embeddings, store_name, path):
    """Embed ``docs`` into a FAISS vector store and pickle it to disk.

    Args:
        docs: documents (chunks) handed to ``FAISS.from_documents``.
        embeddings: embedding model handed to ``FAISS.from_documents``.
        store_name: name used in the file name ``faiss_<store_name>.pkl``.
        path: directory the pickle file is written to; created if it does not exist.
    """
    # Create the target directory BEFORE the embedding step, so that a missing
    # folder cannot make the write fail after the whole computation has finished.
    os.makedirs(path, exist_ok=True)

    vectorStore = FAISS.from_documents(docs, embeddings) #builds the vector store from the documents and the embedding model

    #the vector store is written to disk so it can be loaded later without recalculating the embeddings every time
    with open(f"{path}/faiss_{store_name}.pkl", "wb") as f:
        pickle.dump(vectorStore, f)

In [None]:
#reads a previously stored vector store back into memory
def load_embeddings(store_name, path):
    """Unpickle and return the object stored in ``<path>/faiss_<store_name>.pkl``.

    Only use this on files you created yourself: ``pickle.load`` can execute
    code contained in the file.
    """
    pickle_location = f"{path}/faiss_{store_name}.pkl"
    with open(pickle_location, "rb") as pickle_file:
        return pickle.load(pickle_file)

In [None]:
#compute the embeddings for all loaded chunks, store them and report how long it took (in seconds)
start = time.time()

store_embeddings(
    docs=documents,
    embeddings=instructor_embeddings,
    store_name='instructEmbeddingsRELEVANT',  #provide name for embedding file
    path=Embedding_store_path,
)

end = time.time()
print(end - start)

In [None]:
#load the stored vector store; store_name has to match the name used when the embeddings were stored
db_instructEmbedd = load_embeddings(
    path=Embedding_store_path,
    store_name='instructEmbeddingsRELEVANT',
)

Similarity search in knowledge base

In [None]:
#build a retriever on the knowledge base to retrieve similar context chunks for the prompt later
#"k" is the number of chunks the retriever should output, in this case it is set to 1 due to the maximum context length
retriever_search_kwargs = dict(k=1)
retriever = db_instructEmbedd.as_retriever(search_kwargs=retriever_search_kwargs)

In [None]:
#look up text passages in the BAWE knowledge base that are similar to the query
def retrieve_context(query):
    """Return the text of every document the retriever finds for ``query``.

    Only ``page_content`` is kept; the remaining fields of the retrieved
    documents are dropped so that just the raw text ends up in the prompt.
    """
    passages = []
    for hit in retriever.get_relevant_documents(query):
        passages.append(hit.page_content)
    return passages

Set up LLM to the task

In [None]:
#define prompt template which will be filled with the subtitle sequence ({subtitles}) and the retrieved context chunk ({context_info})
#NOTE(review): the template text contains the typo "where" (for "were"). It is left untouched here because changing the
#wording changes what the model receives - confirm whether the fine-tuning notebook uses the same wording before editing it.
template = """
You are tasked with correcting subtitles, which where automatically generated and
therefore incorporate false transcriptions. Especially technical terms are often
incorrectly transcribed. Analyse the sentences and distill the incorrect words out of the sentences and replace
them with the correct terms. Do not make more changes.

Below is the subtitle passage you should correct now:
{subtitles}

Below here is some context information to understand the context of the false transcriptions better:
{context_info}

Please output ONLY the corrected subtitle passage
"""
#the two input variables have to match the placeholders used in the template above
prompt = PromptTemplate(
    input_variables=["subtitles", "context_info"],
    template=template
)
#chain the prompt and the llm together, so that running the chain fills the prompt and passes it to the llm
#(`llm` is whichever model was loaded in one of the two model cells above)
chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
#define function which takes in a subtitle sequence as argument
def generate_output(subtitles):
    """Run the correction chain on one subtitle passage and return the LLM output.

    Args:
        subtitles: the subtitle passage (string) that should be corrected.

    Returns:
        Whatever ``chain.run`` returns for the filled-in prompt.
    """
    # retrieve_context returns a LIST of passages. Join it into plain text, otherwise the
    # prompt would contain the Python list representation (brackets, quotes and escaped
    # newlines) instead of the passage itself.
    context_info = "\n".join(retrieve_context(subtitles))
    output = chain.run(subtitles=subtitles, context_info=context_info)
    return output

Load Data

In [None]:
#load subtitle txts line by line, so that the chunks are equivalent in their position in the lecture
#every second line is only a timestamp; these are dropped because they are always the same and would dilute the statistics

SUBTITLE_DIR = "/content/drive/MyDrive/thesis data/Subtitles" #folder which holds the subtitle txt files


def load_subtitle_lines(path):
    """Return the text lines of a subtitle file without the timestamp lines.

    Lines at even positions (0, 2, 4, ...) are kept with trailing whitespace,
    including the newline character, stripped; lines at odd positions are skipped.

    Args:
        path: path of the subtitle txt file.

    Returns:
        List of strings, one per kept line.
    """
    with open(path, encoding="utf8") as f:
        return [line.rstrip() for i, line in enumerate(f) if i % 2 == 0]


RNN_1_Panopto_NO_TS = load_subtitle_lines(f"{SUBTITLE_DIR}/RNN_1 Panopto - DL.txt")
RNN_1_Goldstandard_NO_TS = load_subtitle_lines(f"{SUBTITLE_DIR}/RNN_1 Goldstandard - DL.txt")
RNN_2_Panopto_NO_TS = load_subtitle_lines(f"{SUBTITLE_DIR}/RNN_2 Panopto - DL.txt")
RNN_2_Goldstandard_NO_TS = load_subtitle_lines(f"{SUBTITLE_DIR}/RNN_2 Goldstandard - DL.txt")

Iterate through transcriptions and execute correction task

In [None]:
#run the correction over the whole transcription, two lines at a time
subtitlecorpus = RNN_1_Panopto_NO_TS #load respective corpus version WITHOUT timestamps
RNN_1_ASC = [] #give appropriate name for output
length = len(subtitlecorpus) // 2 #number of 2-line chunks; a single leftover line at the end is not processed

#2 lines of text are combined into one chunk due to the context length
for chunk_number, first_line in enumerate(range(0, length * 2, 2), start=1):
    subtitles = ' '.join(subtitlecorpus[first_line:first_line + 2])
    ascoutput = generate_output(subtitles) #runs the correction on the provided subtitle sequence
    RNN_1_ASC.append(ascoutput) #collect the LLM output
    print(f"{chunk_number}/{length} processed...") #tracks progress



Evaluation metrics

In [None]:
#use this to load the transcriptions created by the pipeline if they are not in memory
#(for example when revisiting the notebook, so that the calculations do not have to be made again)
#NOTE: pickle.load can execute code contained in the file - only load pickles you created yourself
def _read_pickle(pickle_path):
    """Return the object stored in the pickle file at ``pickle_path``."""
    with open(pickle_path, "rb") as pickle_file:   #Unpickling
        return pickle.load(pickle_file)


RNN_2_ASC_finetuned = _read_pickle("/content/drive/MyDrive/thesis data/Subtitles/RNN_2_ASC_finetuned")
RNN_1_ASC_basemodel = _read_pickle("/content/drive/MyDrive/thesis data/Subtitles/RNN_1_ASC_basemodel")
RNN_2_ASC_basemodel = _read_pickle("/content/drive/MyDrive/thesis data/Subtitles/RNN_2_ASC_basemodel")
RNN_1_ASC_finetuned = _read_pickle("/content/drive/MyDrive/thesis data/Subtitles/RNN_1_ASC_finetuned")

In [None]:
#build a 2-line version of each goldstandard: the metrics compare the lists item by item and
#every item of the pipeline output covers 2 subtitle lines, so the goldstandard items have to cover 2 lines as well
subtitlecorpus = RNN_1_Goldstandard_NO_TS #load corpus version WITHOUT timestamps
length = len(subtitlecorpus) // 2 #number of 2-line chunks; a single leftover line at the end is not used
RNN_1_Goldstandard_2_line = [
    ' '.join(subtitlecorpus[2 * pair:2 * pair + 2]) #2 lines of text joined into one chunk
    for pair in range(length)
]


subtitlecorpus = RNN_2_Goldstandard_NO_TS #load corpus version WITHOUT timestamps
length = len(subtitlecorpus) // 2
RNN_2_Goldstandard_2_line = [
    ' '.join(subtitlecorpus[2 * pair:2 * pair + 2]) #2 lines of text joined into one chunk
    for pair in range(length)
]

In [None]:
#BLEU
#calculates how similar the pipeline output and the goldstandard are based on overlapping word n-grams
#insert the corresponding data for comparison; the 2 line version of the goldstandard is needed because
#the outputs of the pipeline also incorporate 2 lines per list item
#BLEU works on TOKENS: every passage is split into words, and every hypothesis gets a list of
#(here exactly one) tokenized references. Passing the lists of whole passages to sentence_bleu
#treated each passage as a single token, which is why the score came out as 0.
bleu_references = [[gold_passage.split()] for gold_passage in RNN_1_Goldstandard_2_line]
bleu_hypotheses = [asc_passage.split() for asc_passage in RNN_1_ASC_finetuned]
BLEUscore = nltk.translate.bleu_score.corpus_bleu(bleu_references, bleu_hypotheses)
print(BLEUscore)

0


In [None]:
#TER
#calculates the edit distance between the pipeline output and the goldstandard
#the metric is called as ter(preds, target): the pipeline output is the prediction and the goldstandard the target
#(the order matters because the number of edits is normalised by the length of the reference)
ter = TranslationEditRate()
ter(RNN_1_ASC_finetuned, [[gold_passage] for gold_passage in RNN_1_Goldstandard_2_line]) #insert the corresponding data for comparison

tensor(0.9240)

In [None]:
#BERTSCORE
#calculates how similar two sequences are on a semantic basis by making use of the embeddings created by BERT

#score(cands, refs, ...) expects the candidates (pipeline output) first and the references (goldstandard) second;
#with the arguments the other way round precision and recall are exchanged
P, R, F1 = score(RNN_1_ASC_finetuned, RNN_1_Goldstandard_2_line, lang="en", verbose=True)
#print the averages for all scores (F1, precision, recall)
print(F1.mean())
print(P.mean())
print(R.mean())