In [None]:
## Installation of libraries
!pip install docx2txt
!pip install --upgrade nltk
!pip install --upgrade llama-index
!pip install llama-index-llms-replicate
!pip install datasets
!pip install sentence_transformers bert-score nltk rouge-score
!pip install llama-index-finetuning
!pip install cohere
!pip install llama-index-llms-cohere
!pip install llama-index-postprocessor-cohere-rerank

## Initialization of RAG

In [None]:
"""
Script for creating or loading a Vector Store Index using OpenAI's GPT models and LlamaIndex.
The index is optimized for querying large collections of legal documents.

Modules:
    os: For file and directory management.
    openai: OpenAI API client for GPT operations.
    nltk: Natural Language Toolkit for text preprocessing.
    llama_index.core: Tools for creating and querying Vector Store Indexes.
    IPython.display: Displays Markdown content in Jupyter Notebooks.

Usage:
    - Set up the working directory and API keys.
    - Load or create a Vector Store Index from legal documents.
    - Use the query engine to perform similarity-based searches on indexed data.
"""

from google.colab import drive
import os
from openai import OpenAI
import nltk
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from IPython.display import Markdown, display
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
    Settings
)

#### Uploading of documents to drive
# Set working directory and environment variables
desired_directory = '/content/drive/MyDrive'

# Mount Google Drive only when the target folder is not reachable yet, so the
# cell also works on a fresh runtime instead of failing inside os.chdir().
if not os.path.isdir(desired_directory):
    drive.mount('/content/drive')
os.chdir(desired_directory)

# Credentials already present in the environment are kept; the placeholder
# values below are only applied when nothing has been configured, so a real
# key is never overwritten by the placeholder text.
os.environ.setdefault("OPENAI_API_KEY", "OPENAI_API_KEY")  # OpenAI API Key
os.environ.setdefault("REPLICATE_API_TOKEN", "YOUR_REPLICATE_TOKEN")  # Replicate API token (not used here)

# Download necessary NLTK resources
nltk.download('punkt')  # Sentence tokenizer
nltk.download('stopwords')  # Common stopwords

# Initialize OpenAI client (reads OPENAI_API_KEY from the environment)
client = OpenAI()

# Configure LlamaIndex settings
Settings.chunk_size = 1024  # Maximum tokens per chunk
Settings.chunk_overlap = 25  # Overlap between chunks for continuity

# Define directory for persistent storage
PERSIST_DIR = "aila_indices_legislation"

# Build the index only once; later runs reuse the copy persisted in PERSIST_DIR.
if not os.path.exists(PERSIST_DIR):
    # No persisted index yet:
    # - request a sample completion from GPT (the reply is only printed),
    # - load the documents from the `legislation` directory,
    # - create the Vector Store Index and persist it.
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a legal assistant, skilled in explaining complex legal concepts with simple words."},
            {"role": "user", "content": "Explain the concept of phishing."}
        ]
    )
    print(completion.choices[0].message)  # Print the chatbot response

    # Load documents and create the index
    documents = SimpleDirectoryReader("legislation").load_data()
    index = VectorStoreIndex.from_documents(documents)

    # Persist the index for future use
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print("Index created from scratch.")
else:
    # A persisted index exists: load it from the storage directory.
    print("Loading existing index...")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    print("Index loaded.")

# Configure the query engine to retrieve the 8 most similar chunks per query.
# This explanation is a comment on purpose: a bare string literal placed as the
# last statement of a cell is echoed by the notebook as the cell's output.
query_engine = index.as_query_engine(similarity_top_k=8)


Preprocessing of True-False questions

In [7]:
"""
Script for processing Excel data into a Hugging Face Dataset for True/False questions.
This script performs the following:
1. Reads data from an Excel file.
2. Cleans and organizes data into structured columns.
3. Saves the processed data as a JSON file.
4. Converts the JSON data into a Hugging Face Dataset.
5. Splits the dataset into training and testing subsets.

Modules:
    pandas: Data manipulation and analysis.
    json: Handles JSON file reading and writing.
    google.colab: Facilitates file upload in Google Colab.
    datasets: Provides tools for working with Hugging Face datasets.

Usage:
    - Ensure the Excel file "Teliko.xls" is in the working directory.
    - Run the script to generate and split a Hugging Face dataset.
"""

import pandas as pd
import json
from datasets import Dataset, concatenate_datasets, load_dataset

# Load the 'True false questions' sheet of the Excel workbook into a DataFrame.
df = pd.read_excel("Teliko.xls", sheet_name='True false questions')

# Define column names for the final DataFrame
columns = ['id', 'question', 'answer', 'metadata.justification']

# The first question is taken from the header of the second column; the
# remaining questions are collected from the rows below.
questions = [df.columns[1].replace('\n', ' ')]
true_false = []  # Stores answers for true/false questions
justification = []  # Stores justifications for answers

# Column 1 holds either a row label ('text', 'justification', 'rank') or the
# text of a new question; column 2 holds the value belonging to a label.
for label, value in zip(df.iloc[:, 1], df.iloc[:, 2]):
    if label == 'text':  # True/False answer
        true_false.append(str(value).lower())
    elif label == 'justification':  # Justification of the answer
        justification.append(value)
    elif label == 'rank':  # Rank rows are not used
        continue
    elif isinstance(label, str):  # Anything else that is text is a question
        questions.append(label.replace('\n', ' '))
    # Non-string cells (e.g. empty rows read as NaN) carry no question: skipped.

# Every question needs an answer and a justification; report a short sheet
# explicitly instead of failing with a bare IndexError below.
if len(true_false) < len(questions) or len(justification) < len(questions):
    raise ValueError(
        "Sheet 'True false questions' is incomplete: "
        f"{len(questions)} questions, {len(true_false)} answers, "
        f"{len(justification)} justifications."
    )

# Construct records by combining questions, answers, and justifications
indexs = [
    [i, questions[i], true_false[i], justification[i]]
    for i in range(len(questions))
]

# Convert the records into a pandas DataFrame
df = pd.DataFrame(indexs, columns=columns)

# Save the processed records as JSON in 'output_with_nested_metadata.json'.
json_data = df.to_json(orient='records', indent=4)
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_data)

# Load the JSON file back into a list of dicts (one per question).
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Convert the JSON data into a Hugging Face Dataset with two columns:
#   - `query`: the question text,
#   - `relevant_passages`: the corresponding true/false answer as a string.
dataset = Dataset.from_dict({
    "query": [item["question"] for item in json_data],
    "relevant_passages": [str(item["answer"]) for item in json_data]
})

# Split into training (70%) and testing (30%) subsets; the fixed seed makes
# the split reproducible.
split_dataset = dataset.train_test_split(test_size=0.3, seed=0)
train_dataset_tf = split_dataset['train']
test_dataset_tf = split_dataset['test']

# Reformat the training split into a plain list of records in which
# `relevant_passages` is a one-element list.
# Explanations in this cell are comments on purpose: a bare string literal as
# the last statement of a cell is echoed by the notebook as the cell's output.
train_dataset_tf = [
    {"query": query, "relevant_passages": [relevant_passage]}
    for query, relevant_passage in zip(train_dataset_tf["query"], train_dataset_tf["relevant_passages"])
]



Preprocessing of Open-Ended questions

In [9]:
import pandas as pd
import json
from datasets import Dataset, concatenate_datasets, load_dataset

# Load the 'open ended questions' sheet of the Excel workbook into a DataFrame.
df = pd.read_excel("Teliko.xls", sheet_name='open ended questions')

columns = ['id', 'question', 'answer']

# The first question is taken from the header of the second column; the
# remaining questions are collected from the rows below.
questions = [df.columns[1].replace('\n', ' ')]
answers = []

# Column 0 holds the row label, column 1 the question or answer text.
# The 'question ' label is matched with its trailing space, exactly as before.
# NOTE(review): line breaks are removed here without inserting a space (the
# header question above gets a space instead) — confirm this is intended.
for label, text in zip(df.iloc[:, 0], df.iloc[:, 1]):
    if label == 'answer':
        answers.append(text.replace('\n', ''))
    elif label == 'question ':
        questions.append(text.replace('\n', ''))

# Every question needs an answer; report a short sheet explicitly instead of
# failing with a bare IndexError below.
if len(answers) < len(questions):
    raise ValueError(
        "Sheet 'open ended questions' is incomplete: "
        f"{len(questions)} questions but only {len(answers)} answers."
    )

indexs = [[i, questions[i], answers[i]] for i in range(len(questions))]

df = pd.DataFrame(indexs, columns=columns)
df.to_excel('phishing_open_ended_questions_new_vfinal.xlsx')

# Save the processed records as JSON in 'output_with_nested_metadata.json'.
json_data = df.to_json(orient='records', indent=4)

with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_data)

# Load the JSON data from file as a list of dicts (one per question).
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Hugging Face Dataset with the question as `query` and the answer text as
# `relevant_passages`.
dataset = Dataset.from_dict({
    "query": [item["question"] for item in json_data],
    "relevant_passages": [str(item["answer"]) for item in json_data]
})

# 70% / 30% train/test split with a fixed seed for reproducibility.
split_dataset = dataset.train_test_split(test_size=0.3, seed=0)

train_dataset_oe = split_dataset['train']
test_dataset_oe = split_dataset['test']

# Reformat the training split into a plain list of records in which
# `relevant_passages` is a one-element list.
train_dataset_oe = [
    {"query": query, "relevant_passages": [relevant_passage]}
    for query, relevant_passage in zip(train_dataset_oe["query"], train_dataset_oe["relevant_passages"])
]


Preprocessing of Multiple-Choice questions

In [23]:
"""
Script for processing Excel data into a Hugging Face Dataset for Multiple-Choice Questions.
This script performs the following:
1. Reads data from an Excel file containing multiple-choice questions.
2. Cleans and organizes data into structured columns.
3. Extracts choices, ranks, and justifications for each question.
4. Generates JSON and Hugging Face datasets, handling nested metadata.
5. Splits the dataset into training and testing subsets.

Modules:
    pandas: For data manipulation and analysis.
    json: For handling JSON file reading and writing.
    datasets: Tools for creating and managing Hugging Face datasets.
    numpy: For numerical operations.
    math: For mathematical utilities.

Usage:
    - Place the Excel file "Teliko.xls" in the working directory.
    - Run the script to generate and split a Hugging Face dataset for multiple-choice questions.
"""

import json
from datasets import Dataset, concatenate_datasets, load_dataset
import numpy as np
import math
import pandas as pd

# Load the 'multiple choice questions' sheet of the Excel workbook into a DataFrame.
df = pd.read_excel("Teliko.xls", sheet_name='multiple choice questions')

# Define column names for the final DataFrame
columns = [
    'ID', 'question', 'Choice A', 'metadata A.rank', 'metadata A.justification',
    'Choice B', 'metadata B.rank', 'metadata B.justification',
    'Choice C', 'metadata C.rank', 'metadata C.justification',
    'Choice D', 'metadata D.rank', 'metadata D.justification'
]

MAX_OPTIONS = 4  # Choices A-D; must match the column layout above

# Per-question slots, pre-filled with the "None" placeholder so that questions
# with fewer than four options still produce a full record.
multiple_choices = []  # Stores processed multiple-choice questions
options = [["None"] * MAX_OPTIONS]
justifications = [["None"] * MAX_OPTIONS]
ranks = [["None"] * MAX_OPTIONS]
# The first question is taken from the header of the second column.
questions = [df.columns[1].replace('\n', '')]
# Index of the question currently being filled. Deliberately not called `id`,
# which would shadow the builtin `id()` for the rest of the session.
question_idx = 0
current_option_id = 0

# Extract multiple-choice data from the DataFrame.
# Column 1 holds the row label (some labels carry a trailing space), column 2
# the value. A 'rank' row closes the current option and moves to the next slot.
for i in range(len(df)):
    label = df.iloc[i, 1]
    is_option_row = label in ("text", "text ", "justification", "rank", "rank ")
    if is_option_row and current_option_id >= MAX_OPTIONS:
        # Previously surfaced as an opaque IndexError on the slot lists.
        raise ValueError(
            f"Row {i}: question {question_idx} has more than {MAX_OPTIONS} options."
        )
    if label in ("text", "text "):  # Option text
        options[question_idx][current_option_id] = str(df.iloc[i, 2]).replace('\n', '')
    elif label == 'justification':  # Option justification
        justifications[question_idx][current_option_id] = str(df.iloc[i, 2]).replace('\n', '')
    elif label in ('rank', 'rank '):  # Option rank
        ranks[question_idx][current_option_id] = df.iloc[i, 2]
        current_option_id += 1
    elif df.iloc[i, 0] == 'question':  # Start of a new question
        current_option_id = 0
        question_idx += 1
        options.append(["None"] * MAX_OPTIONS)
        justifications.append(["None"] * MAX_OPTIONS)
        ranks.append(["None"] * MAX_OPTIONS)
        questions.append(df.iloc[i, 1].replace('\n', ''))

# Combine extracted data into structured records, in the order of `columns`:
# ID, question, then (choice, rank, justification) for each of the four slots.
for i in range(len(questions)):
    record = [i, questions[i]]
    for slot in range(MAX_OPTIONS):
        record += [options[i][slot], ranks[i][slot], justifications[i][slot]]
    multiple_choices.append(record)

# Create a pandas DataFrame from the records
df = pd.DataFrame(multiple_choices, columns=columns)

# Convert the DataFrame to a list of per-question dicts for processing
dict_data = df.to_dict(orient='records')

# For every question, build `best_answer` from the option whose rank is 1:
# "<letter>) <choice text>. <justification>".
choices = ['A', 'B', 'C', 'D']
for row in dict_data:
    row['question'] = row['question'].replace('\n', "")
    for choice in choices:
        rank = row[f'metadata {choice}.rank']
        # Unused slots hold either NaN or the "None" placeholder.
        if pd.notna(rank) and rank != "None":
            if int(rank) == 1:  # Identify the correct answer
                answer_justification = row[f'metadata {choice}.justification'].replace('\n', "").replace('\"', "'")
                answer_choice = row[f'Choice {choice}'].replace('\n', "").replace('\"', "'")
                # Collapse the doubled full stops produced when the choice text
                # already ends with a period.
                answer = f"{choice}) {answer_choice}. {answer_justification}".replace('..', '.').replace('. . ', '.')
                row['best_answer'] = answer
    # Later steps read row['best_answer'] for every question; fail here with a
    # clear message rather than with a KeyError further down.
    if 'best_answer' not in row:
        raise ValueError(
            f"Question {row['ID']} has no option with rank 1, so no best answer can be built."
        )

# Write the per-question records to disk as indented JSON ...
json_data = json.dumps(dict_data, indent=4)
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_data)

# ... and read them straight back as plain Python objects.
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# One text column per answer option (Choice_A ... Choice_D).
choice_columns = {
    f"Choice_{letter}": [str(record[f'Choice {letter}']) for record in json_data]
    for letter in choices
}

# Hugging Face Dataset: the question, its best answer, and the four options.
dataset = Dataset.from_dict({
    "query": [record["question"] for record in json_data],
    "relevant_passages": [record['best_answer'] for record in json_data],
    **choice_columns,
})

# 70% / 30% train/test split; the fixed seed keeps the split reproducible.
split_dataset = dataset.train_test_split(test_size=0.3, seed=0)
train_dataset_mc, test_dataset_mc = split_dataset['train'], split_dataset['test']

# Expand every training question into one (question, option) pair per filled
# option. Pairs built from an option whose rank is not 1 get the suffix
# "(Not correct option <rank>)" on both the question and the option text.
questions = []
options = []
for sample in train_dataset_mc:
    matching_rows = [record for record in dict_data if record['question'] == sample['query']]
    for record in matching_rows:
        base_question = record['question'].replace('\n', "")
        question_variants = []
        option_variants = []
        for letter in choices:
            rank_value = record[f'metadata {letter}.rank']
            # Skip unused slots (NaN or the "None" placeholder).
            if not pd.notna(rank_value) or rank_value == "None":
                continue
            justification_text = record[f'metadata {letter}.justification'].replace('\n', "").replace('\"', "'")
            choice_text = record[f'Choice {letter}'].replace('\n', "").replace('\"', "'")
            formatted_option = f"{letter}) {choice_text}. {justification_text}".replace('..', '.').replace('. . ', '.')
            rank_number = int(rank_value)
            tag = "" if rank_number == 1 else f"(Not correct option {rank_number})"
            question_variants.append(base_question + tag)
            option_variants.append(formatted_option + tag)
        questions.append(question_variants)
        options.append(option_variants)

# Flatten the per-question groups into one record per (question, option) pair.
dict_data = [
    {"question": question_text, "option": option_text}
    for question_group, option_group in zip(questions, options)
    for question_text, option_text in zip(question_group, option_group)
]

# Write the flattened pairs to disk as indented JSON ...
json_data = json.dumps(dict_data, indent=4)
with open('output_with_nested_metadata.json', 'w') as json_file:
    json_file.write(json_data)

# ... and read them back for building the Hugging Face Dataset.
with open('output_with_nested_metadata.json', 'r') as json_file:
    json_data = json.load(json_file)

# Dataset of training pairs: question text and the matching option text.
dataset = Dataset.from_dict({
    "query": [pair["question"] for pair in json_data],
    "relevant_passages": [pair['option'] for pair in json_data],
})

# Final training set: plain records whose `relevant_passages` is a one-element list.
train_dataset_mc = [
    {"query": pair_query, "relevant_passages": [pair_passage]}
    for pair_query, pair_passage in zip(dataset["query"], dataset["relevant_passages"])
]


Finetuning of RAG model (only run it for finetuning)



In [None]:
"""
Script for fine-tuning a Cohere model for reranking and integrating it with LlamaIndex.
This script performs the following:
1. Loads a dataset for reranking tasks.
2. Creates and fine-tunes a Cohere model on the dataset.
3. Retrieves the fine-tuned model for reranking queries.
4. Integrates the fine-tuned Cohere model with a LlamaIndex query engine.

Modules:
    cohere: Python client for the Cohere API.
    llama_index: Provides tools for indexing and querying with advanced models.
    os: For environment and file handling.

Usage:
    - Set the `COHERE_API_KEY` to your valid Cohere API key.
    - Ensure the reranking dataset (`data.json`) is available in the specified path.
    - Run the script to fine-tune the model and integrate it with LlamaIndex.
"""

import cohere
from cohere.finetuning import BaseModel, FinetunedModel
# Imported under an alias: a bare `Settings` would overwrite the llama_index
# `Settings` object that the initialization cell of this notebook imports.
from cohere.finetuning import Settings as CohereFinetuneSettings
import os
from llama_index.llms.cohere import Cohere
from llama_index.postprocessor.cohere_rerank import CohereRerank

# Set up Cohere API key: taken from the COHERE_API_KEY environment variable
# when it is set; otherwise the placeholder below must be replaced by hand.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "YOUR API KEY")
co = cohere.Client(COHERE_API_KEY)

# Step 1: Create a dataset for reranking fine-tuning.
# `data.json` must already be in the format expected for reranker fine-tuning.
# The file is opened in a `with` block so the handle is closed after the upload.
with open("./data.json", "rb") as rerank_file:
    rerank_dataset = co.datasets.create(
        name="rerank-dataset",
        data=rerank_file,  # Load the dataset from the specified file
        type="reranker-finetune-input"  # Specify dataset type for reranker fine-tuning
    )

# Wait for the dataset to be processed
print(co.wait(rerank_dataset))

# Step 2: Create and start fine-tuning of a reranker model on the dataset
finetune = co.finetuning.create_finetuned_model(
    request=FinetunedModel(
        name="reranked_model",  # Name for the fine-tuned model
        settings=CohereFinetuneSettings(
            base_model=BaseModel(
                name="english",  # Base language model
                base_type="BASE_TYPE_RERANK",  # Specify reranking as the fine-tuning objective
            ),
            dataset_id=rerank_dataset.id,  # ID of the dataset created above
        ),
    )
)

# Step 3: Retrieve the fine-tuned model by the ID returned in step 2
# (previously a hard-coded "finetune-model-id" placeholder that had to be
# edited by hand before the cell could succeed).
# NOTE(review): this cell does not check the training status — confirm the
# model has finished training before sending queries through the reranker.
ft = co.finetuning.get_finetuned_model(finetune.finetuned_model.id)

# Step 4: Set up the Cohere reranker postprocessor with the fine-tuned model
cohere_rerank = CohereRerank(
    api_key=COHERE_API_KEY,
    model=ft.finetuned_model.id + '-ft'  # Use the fine-tuned model ID
)

# Step 5: Integrate the reranker with a LlamaIndex query engine so that
# retrieved nodes are post-processed by the reranker.
# NOTE(review): no `similarity_top_k` is passed here, unlike the engine built
# during initialization — confirm the default retrieval depth is intended.
# Explanations in this cell are comments on purpose: a bare string literal as
# the last statement of a cell is echoed by the notebook as the cell's output.
query_engine = index.as_query_engine(node_postprocessors=[cohere_rerank])
