<a href="https://github.com/Finboost/finboost-ml" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [None]:
!pip install transformers datasets torch faiss-cpu pandas

# Step 1: Fine-Tuning

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    RobertaForQuestionAnswering,
    RobertaTokenizer,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

## Load the dataset

In [None]:
# Read the question-answering CSV into a DataFrame and wrap it as a
# `datasets.Dataset` so it can be tokenized with `.map()` further down.
# The tokenization step reads the columns "question", "context", "answer"
# and "answer_start" from it.
# Alternative location (presumably for Colab runs), kept for reference:
# df = pd.read_csv('/content/data/dataset.csv')
df = pd.read_csv('../data/dataset.csv')
dataset = Dataset.from_pandas(df)

## Load the tokenizer and model

In [None]:
# Checkpoint used as the starting point for fine-tuning (per its name, a
# RoBERTa-base question-answering model).
model_name = "deepset/roberta-base-squad2"
# The fast tokenizer is required here: the tokenization step asks for
# `return_offsets_mapping=True` and calls `sequence_ids()`, both of which are
# only available on fast tokenizers. The slow `RobertaTokenizer` raises
# NotImplementedError on the offsets request.
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaForQuestionAnswering.from_pretrained(model_name)

## Tokenize dataset

In [None]:
def _locate_answer_tokens(offsets, sequence_ids, start_char, end_char):
    """Map a character-level answer span onto (start_token, end_token) indices.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for one example.
        sequence_ids: Per-token sequence ids for the same example; context
            tokens are marked ``1``.
        start_char: Character index in the context where the answer begins.
        end_char: Character index just past the end of the answer.

    Returns:
        A ``(start_token, end_token)`` tuple, both inclusive. ``(0, 0)`` is
        returned when the example has no context tokens or when the answer is
        not fully contained in the (possibly truncated) context.
    """
    n_tokens = len(sequence_ids)

    # Find the first and last token belonging to the context. The bounds
    # checks keep the scans from running off the end of the sequence.
    context_start = 0
    while context_start < n_tokens and sequence_ids[context_start] != 1:
        context_start += 1
    context_end = context_start
    while context_end < n_tokens and sequence_ids[context_end] == 1:
        context_end += 1
    context_end -= 1

    if context_start > context_end:
        return 0, 0

    # An answer that was cut off by truncation cannot be labelled correctly,
    # so it is treated like a missing answer.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Start token: the last context token that begins at or before start_char.
    start_idx = context_start
    while start_idx <= context_end and offsets[start_idx][0] <= start_char:
        start_idx += 1
    start_idx -= 1

    # End token: the first context token that ends at or after end_char.
    # Scanning backwards and stepping forward once avoids the off-by-one that
    # would otherwise place the end label one token before the answer's end.
    end_idx = context_end
    while end_idx >= context_start and offsets[end_idx][1] >= end_char:
        end_idx -= 1
    end_idx += 1

    return start_idx, end_idx


def preprocess_function(examples):
    """Tokenize a batch of QA examples and attach answer token labels.

    Args:
        examples: A batch (dict of column lists) with the keys "question",
            "context", "answer" and "answer_start". "answer_start" is used as
            a character offset into the context.

    Returns:
        The tokenizer output for the batch (padded/truncated to 384 tokens,
        truncating only the context) with two extra keys, "start_positions"
        and "end_positions", holding inclusive token indices of each answer.
        Examples whose answer does not lie entirely inside the tokenized
        context get ``0`` for both.

    Requires the module-level ``tokenizer`` to be a fast tokenizer, since
    offsets and sequence ids are needed.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Offsets are only needed to compute the labels; they are not model inputs.
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []
    rows = zip(offset_mapping, examples["answer"], examples["answer_start"])
    for i, (offsets, answer, start_char) in enumerate(rows):
        end_char = start_char + len(answer)
        start_idx, end_idx = _locate_answer_tokens(
            offsets, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_idx)
        end_positions.append(end_idx)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Tokenize and label the whole dataset. With batched=True the function
# receives a dict of column lists, which is the shape preprocess_function
# iterates over.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

## Define training arguments

In [None]:
# Hyperparameters for the fine-tuning run: 3 epochs, batch size 16 per device,
# learning rate 2e-5 with weight decay 0.01, evaluating once per epoch.
training_args = TrainingArguments(
    # Alternative output location (presumably for Colab runs), kept for reference:
    # output_dir="/content/models/fine_tuned_model",
    # NOTE(review): this is an absolute path at the filesystem root, whereas the
    # dataset is read from the relative '../data/dataset.csv' — confirm the
    # location is intended and writable.
    output_dir="/models/roberta/fine_tuned_model",
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

## Initialize Trainer

In [None]:
# Build the Trainer around the QA model and the tokenized dataset.
# NOTE(review): train_dataset and eval_dataset are the same object, so the
# per-epoch evaluation is measured on the training data and says nothing
# about generalization — consider holding out a split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    # Examples are already padded to a fixed length during tokenization, so
    # the default collator (no dynamic padding) is used.
    data_collator=default_data_collator,
)

## Fine-tune the model

In [None]:
# Run the fine-tuning loop with the arguments configured above; this updates
# the module-level `model` in place.
trainer.train()

## Save the model

In [None]:
# Persist the fine-tuned model.
# NOTE(review): this path differs from the `output_dir` given to
# TrainingArguments ("/models/roberta/fine_tuned_model") — confirm which one
# is intended. The Trainer was not given the tokenizer, so presumably only the
# model is written here; save the tokenizer separately if it is needed.
trainer.save_model("/models/fine_tuned_model")

# Step 2: Using RAG (Retrieval-Augmented Generation)

In [None]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

## Load the tokenizer, retriever, and model

In [None]:
rag_model_name = "facebook/rag-sequence-base"
rag_tokenizer = RagTokenizer.from_pretrained(rag_model_name)
# NOTE(review): with index_name="custom" the retriever is documented to expect
# `passages_path` to point at a dataset saved with `save_to_disk` (plus an
# `index_path` for the FAISS index), not a raw CSV — confirm this loads.
# The '/content/...' location also differs from the relative path used to
# read the dataset for fine-tuning.
retriever = RagRetriever.from_pretrained(rag_model_name, index_name="custom", passages_path="/content/data/dataset.csv")
# The retriever must be attached to the model; without it, calling
# `generate()` with only `input_ids` has no retrieved documents to condition on.
rag_model = RagSequenceForGeneration.from_pretrained(rag_model_name, retriever=retriever)

def generate_rag_response(question):
    """Generate an answer for *question* with the module-level RAG model.

    Args:
        question: The query text to encode and feed to the generator.

    Returns:
        The first decoded sequence produced by ``rag_model.generate``, with
        special tokens removed.
    """
    encoded_question = rag_tokenizer(question, return_tensors="pt")
    output_ids = rag_model.generate(
        input_ids=encoded_question["input_ids"],
        # NOTE(review): RagTokenizer wraps a question-encoder and a generator
        # tokenizer — confirm it exposes `pad_token_id` directly.
        decoder_start_token_id=rag_tokenizer.pad_token_id,
    )
    decoded = rag_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

## Example usage

In [None]:
# Smoke-test the RAG path with an Indonesian question
# ("What is stock investing?") and print the generated answer.
question = "Apa itu investasi saham?"
answer = generate_rag_response(question)
print(f"Q: {question}\nA: {answer}")

# Step 3: Integration

In [None]:
import torch

def generate_combined_response(question, context):
    """Answer *question* by extracting a span from *context*, then querying RAG.

    Args:
        question: The question text.
        context: Passage the extractive QA model searches for an answer span.

    Returns:
        The string produced by ``generate_rag_response`` for the question
        augmented with the extracted span. If the extractive model yields no
        usable text, the question alone is sent to RAG.
    """
    # Step 1: Use the fine-tuned model for initial answer extraction.
    # Truncate the context with the same limits used during fine-tuning so an
    # over-long passage cannot exceed the model's input size, and move the
    # tensors to the model's device (it may be on GPU after training).
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        truncation="only_second",
        return_tensors="pt",
    ).to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    # Drop special tokens so a "no answer" prediction (pointing at the leading
    # special token) does not leak markers such as "<s>" into the RAG query.
    # An end index before the start index yields an empty slice, hence "".
    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    initial_answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

    # Step 2: Use RAG for enriched answer generation.
    rag_question = f"{question} {initial_answer}" if initial_answer else question
    return generate_rag_response(rag_question)

## Example usage

In [None]:
# Smoke-test the combined pipeline: the Indonesian context defines stock
# investing and the question asks "What is stock investing?".
context = "Investasi saham adalah pembelian sebagian kecil kepemilikan di sebuah perusahaan yang diperdagangkan secara publik."
question = "Apa itu investasi saham?"
answer = generate_combined_response(question, context)
print(f"Q: {question}\nA: {answer}")
