In [8]:
import os
import sys
import numpy as np

import utils
import config
import pandas as pd
from tqdm import tqdm
from rouge_score import rouge_scorer

from llm import llm_wrapper
from embedding import embedder
from chain import rag_chain

import torch
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

from matplotlib import pyplot as plt

In [2]:
import importlib
importlib.reload(utils)
importlib.reload(config)
importlib.reload(llm_wrapper)
importlib.reload(embedder)
importlib.reload(rag_chain)

import warnings
warnings.filterwarnings('ignore')

In [3]:
## Initialization of sub-components
my_device = "cuda" if torch.cuda.is_available() else "cpu"

## Dataset Loader
embedder_name = config.EMBED_MODEL
data_dir = config.DATA_DIR
documents = config.DOCUMENTS

# Embedder class, which is responsible to init, create, load dataset
my_embedder = embedder.Embedder(embedder_name, data_dir, documents, chunk_length=512, overlap=32, save_vec_db=True)

doc_index = 0 # the index of the document that we want to do RAG
loaded_doc = my_embedder.pdf_data_loader(doc_index)
cleaned_docs = utils.clean_business_conduct_policy(loaded_doc, n_remove_first_lines=3, n_discard_pages=[1, 2, 20])
chunks = my_embedder.text_splitter(cleaned_docs)
vector_db, retriever = my_embedder.create_vector_database(chunks, top_k_doc=5)

## LLM Loader
base_model_id = config.LLM_MODEL_ID
my_llm_wrapper = llm_wrapper.SmolLLMWrapper(base_model_id, max_length=256, 
                                            temperature=0.3, top_p=0.9, top_k=50, repetition_penalty=1.2, do_sample=True, truncation=True)
llm, llm_tokenizer = my_llm_wrapper.get_llm()

pipe = pipeline("text-generation", model=llm, tokenizer=llm_tokenizer, max_new_tokens=256,
                 temperature=0.3, top_p=0.9, top_k=50, repetition_penalty=1.2, do_sample=True, truncation=True)
llm_hg_pipeline = HuggingFacePipeline(pipeline=pipe)

## RAG Chain Loader
rag_chain_train = rag_chain.RAGChainBuilder(llm_hg_pipeline, retriever)
qa_bot_chain = rag_chain_train.build_chain()

Initializing Embedding Model
--------------------
Initializing LLM Model
--------------------


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initializing Retrieval Chain
--------------------


In [4]:
# Sample Test Question with RAG - LangChain Pipeline
question = "What should an employee do if they suspect a policy violation?"

# Perform a query using the QA bot chain
response = qa_bot_chain.run(question)
cleaned_response = utils.clean_gemma_chain_response(response, keyword='### Answer:')

print(f"{response}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



        ### Context:
        •  Speak up. If you see or hear of any violation of Apple’s Business Conduct Policy, other Apple policies, or legal or 
regulatory requirements, you must notify either your manager, People Team, Legal, or Business Conduct. 
•  Use good judgment and ask questions. Apply Apple’s principles of business conduct, and review our policies and legal 
requirements. When in doubt about how to proceed, discuss it with your manager, your People Business Partner, Legal, or 
Business Conduct.

•  Reported or participated in the investigation of a potential violation of our policies or the law; or
• Engaged in legally protected activity, including related to leaves of absence or job accommodations, or forming or joining 
(or refraining from joining) labor organizations of an employee’s choice in a lawful manner.

Everything we do is a reflection of Apple. We expect you to:
• Follow the Policy and exhibit appropriate workplace behavior. Comply with the letter and spirit o

In [5]:
## Read <Question-Answer> Pair
qa_doc_path = os.path.join(config.DATA_DIR, "apple_business_conduct_questions-answers_bilingual.csv")
qa_df = pd.read_csv(qa_doc_path)

qa_df.head(3)

Unnamed: 0,English,Turkish,English Answer,Turkish Answer
0,What are the core principles of Apple's Busine...,Apple'ın İş Ahlakı Politikasının temel ilkeler...,Apple's core principles are: Honesty - demonst...,Apple'ın temel ilkeleri şunlardır: Dürüstlük -...
1,What ethical behavior does Apple expect from i...,"Apple, çalışanlarından hangi etik davranışları...",Apple expects employees to follow the Policy a...,Apple çalışanlarından Politikaya uymalarını ve...
2,What responsibilities do employees have regard...,Çalışanların yasalara ve düzenlemelere uyum ko...,Employees must comply with Apple's Business Co...,Çalışanlar Apple'ın İş Ahlakı Politikası'na ve...


In [6]:
finalized_en_questions = []
finalized_en_gt_responses = []
finalized_en_chain_responses = []
finalized_en_raw_responses = []


for i in tqdm(range(len(qa_df))):
    eng_question = qa_df.iloc[i, 0]
    eng_answer = qa_df.iloc[i, 2]

    tr_question = qa_df.iloc[i, 1]
    tr_answer = qa_df.iloc[i, 3]

    ## Get an answer using RAG - LangChain Pipeline
    en_response_chain = qa_bot_chain.run(eng_question)
    en_cleaned_response_chain = utils.clean_gemma_chain_response(response, keyword='### Answer:')

    ## Get an answer using raw LLM model
    en_llm_templaate_raw = my_llm_wrapper.llm_template(eng_question, include_system=False)
    en_llm_output_raw = my_llm_wrapper.llm_generate_output(en_llm_templaate_raw)
    en_cleaned_response_raw = utils.clean_gemma_response(en_llm_output_raw, question)

    finalized_en_questions.append(eng_question)
    finalized_en_gt_responses.append(eng_answer)
    finalized_en_chain_responses.append(en_cleaned_response_chain)
    finalized_en_raw_responses.append(en_cleaned_response_raw)

 56%|█████▋    | 9/16 [00:56<00:44,  6.42s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 16/16 [02:03<00:00,  7.75s/it]


In [None]:
def calculate_rouge(prediction: str, reference: str):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, prediction)
    return scores

def batch_calculate_rouge(predictions: list, references: list):
    assert len(predictions) == len(references), "Predictions and references must be the same length."

    rouge1_p, rouge1_r, rouge1_f = [], [], []
    rouge2_p, rouge2_r, rouge2_f = [], [], []
    rougel_p, rougel_r, rougel_f = [], [], []

    for pred, ref in zip(predictions, references):
        scores = calculate_rouge(pred=pred, reference=ref)

        rouge1_p.append(scores['rouge1'].precision)
        rouge1_r.append(scores['rouge1'].recall)
        rouge1_f.append(scores['rouge1'].fmeasure)

        rouge2_p.append(scores['rouge2'].precision)
        rouge2_r.append(scores['rouge2'].recall)
        rouge2_f.append(scores['rouge2'].fmeasure)

        rougel_p.append(scores['rougeL'].precision)
        rougel_r.append(scores['rougeL'].recall)
        rougel_f.append(scores['rougeL'].fmeasure)

    results = {
        "ROUGE-1": {
            "Precision": np.mean(rouge1_p),
            "Recall": np.mean(rouge1_r),
            "F1": np.mean(rouge1_f)
        },
        "ROUGE-2": {
            "Precision": np.mean(rouge2_p),
            "Recall": np.mean(rouge2_r),
            "F1": np.mean(rouge2_f)
        },
        "ROUGE-L": {
            "Precision": np.mean(rougel_p),
            "Recall": np.mean(rougel_r),
            "F1": np.mean(rougel_f)
        }
    }

    return results



for gt_response, chain_response, raw_response in zip(finalized_en_gt_responses, finalized_en_chain_responses, finalized_en_raw_responses):

    chain_score = calculate_rouge(chain_response, gt_response)
    raw_score = calculate_rouge(raw_response, gt_response)