# **Development of a Chatbot using LangChain with Predefined PDFs**
**Course Title: Artificial Intelligence**

 **Group Members:**

> **Amanda Ofori - 10201100146**

> **Yineteili Abii - 10201100115**

> **Kenneth Tetteh - 10201100073**

**Link to Git Repo: https://github.com/Amanda-Ofori/Langchain_Chatbot.git**

In [None]:
import os
import json
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import glob
import chardet  # Import chardet for encoding detection
import chardet
import sys


### PDFDocumentProcessor Class and Auxiliary Function




In [None]:

class PDFDocumentProcessor:
    """Extract the text of the PDFs in a directory and store it as text files.

    Every PDF found directly inside ``directory`` is opened with PyMuPDF, its
    page texts are joined, split into chunks by a default-configured
    ``RecursiveCharacterTextSplitter``, and written to ``storage_directory``.
    """

    def __init__(self, directory, storage_directory):
        """Remember both directories and make sure the storage one exists.

        Args:
            directory: Folder that is scanned for PDF files.
            storage_directory: Folder that receives the extracted text files;
                it is created when missing.
        """
        self.directory = directory
        self.storage_directory = storage_directory
        self.text_splitter = RecursiveCharacterTextSplitter()
        self.ensure_directory_exists(self.storage_directory)

    def ensure_directory_exists(self, directory):
        """Ensure the storage directory exists.

        Args:
            directory: Path to create, including missing parent folders.

        Raises:
            FileExistsError: If the path exists but is not a directory.
        """
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(directory, exist_ok=True)

    def extract_text(self):
        """Extract text from PDF files in the directory and store it."""
        for filename in os.listdir(self.directory):
            # Compare case-insensitively so names such as "REPORT.PDF" are
            # processed as well.
            if not filename.lower().endswith('.pdf'):
                continue
            path = os.path.join(self.directory, filename)
            # A sub-directory may also carry a ".pdf" suffix; only open files.
            if not os.path.isfile(path):
                continue
            with fitz.open(path) as doc:
                text = " ".join(page.get_text() for page in doc)
            chunks = self.text_splitter.split_text(text)
            self.store_document(chunks, filename)

    def store_document(self, document, filename):
        """Store the chunked document in a text file using UTF-8 encoding.

        Args:
            document: Iterable of text chunks; each one is written followed by
                a newline.
            filename: Name of the source file. A trailing ``.pdf`` extension
                (any letter case) is swapped for ``.txt``; any other name is
                used unchanged.
        """
        # Only the final extension is swapped: str.replace would also rewrite
        # a ".pdf" that appears in the middle of the name.
        stem, extension = os.path.splitext(filename)
        target_name = stem + '.txt' if extension.lower() == '.pdf' else filename
        file_path = os.path.join(self.storage_directory, target_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(f"{item}\n" for item in document)



def read_file_safely(file_path):
    """Return the text of *file_path*, decoded with a best-effort encoding guess.

    The file is read as bytes and ``chardet`` guesses the encoding. The bytes
    are decoded with that guess, or with UTF-8 when there is no guess. If that
    decoding fails, the bytes are decoded as UTF-8 with undecodable bytes
    dropped, so decoding problems never raise.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents as ``str``.
    """
    # Read the file as binary data for detection
    with open(file_path, 'rb') as file:
        raw_data = file.read()

    # Fall back to UTF-8 when chardet reports no encoding.
    encoding = chardet.detect(raw_data).get('encoding') or 'utf-8'

    try:
        # Attempt to read with detected encoding
        return raw_data.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # LookupError covers a detected encoding name that Python has no codec
        # for; without it the "safe" reader would still crash.
        # Fallback to UTF-8 and ignore errors
        return raw_data.decode('utf-8', errors='ignore')

### Explanation of the CustomRAGProcessor Class and Its Use



In [None]:
class CustomRAGProcessor:
    """Answer questions with a GPT-2 model, using stored text files as context.

    Retrieval is a plain case-insensitive substring match of the whole query
    against every ``.txt`` file in ``storage_directory``.
    """

    def __init__(self, model_name, storage_directory):
        """Load the tokenizer and model and remember where documents live.

        Args:
            model_name: Name or path handed to ``from_pretrained``.
            storage_directory: Folder containing the extracted ``.txt`` files.
        """
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.storage_directory = storage_directory

    def detect_encoding(self, file_path):
        """Return chardet's encoding guess for *file_path* (may be ``None``)."""
        with open(file_path, 'rb') as file:
            raw_data = file.read()
        return chardet.detect(raw_data)['encoding']

    def retrieve_documents(self, query):
        """Return the stored documents that contain *query*.

        Args:
            query: Text searched for, ignoring letter case.

        Returns:
            A list with the full text of every matching ``.txt`` file, ordered
            by file path.
        """
        needle = query.lower()
        # Escape the directory so characters such as "[" in its path are not
        # interpreted as glob wildcards.
        pattern = os.path.join(glob.escape(self.storage_directory), '*.txt')
        documents = []
        # Sorted so the context does not depend on directory listing order.
        for file_path in sorted(glob.glob(pattern)):
            document = read_file_safely(file_path)
            if needle in document.lower():
                documents.append(document)
        return documents

    @staticmethod
    def _build_prompt(query, documents, max_length):
        """Join *documents*, trim them to the character budget, build the prompt."""
        combined_context = " ".join(documents)
        # Reserve space for the query and extra tokens. The budget is clamped
        # at zero: a negative value used as a slice end would cut characters
        # off the tail of the context instead of truncating it.
        max_context_length = max(0, max_length - len(query) - 50)
        combined_context = combined_context[:max_context_length]
        return f"Context: {combined_context} Question: {query}"

    def generate_answer(self, query, max_length=100):
        """Generate an answer to *query* from the matching stored documents.

        Args:
            query: The user's question.
            max_length: Maximum prompt length in tokens; it also sets the
                character budget used to trim the retrieved context.

        Returns:
            The decoded output sequence of the model. For GPT-2 this contains
            the prompt followed by up to 150 newly generated tokens.
        """
        documents = self.retrieve_documents(query)
        input_text = self._build_prompt(query, documents, max_length)
        # Calling the tokenizer (instead of encode) also yields the attention
        # mask, which generate() needs to tell padding from content.
        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
        )

        # max_length is deliberately not forwarded to generate(): together
        # with max_new_tokens the two limits conflict, and max_new_tokens is
        # the one that takes effect.
        output_ids = self.model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=150,  # adjust as needed
            num_return_sequences=1,
            do_sample=True,  # without sampling, top_p/top_k/temperature are ignored
            top_p=0.92,  # nucleus sampling, less randomness
            top_k=50,  # top-k sampling
            temperature=0.7,  # lower temperature makes outputs less random
            # GPT-2 defines no padding token; reuse EOS for it.
            pad_token_id=self.tokenizer.eos_token_id,
        )
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example usage: extract the PDFs found in 'Documents' into chunked text files.
processor = PDFDocumentProcessor(directory='Documents', storage_directory='path/to/storage')
processor.extract_text()

# Load GPT-2 and answer a sample question using the stored text as context.
rag_processor = CustomRAGProcessor(model_name='gpt2', storage_directory='path/to/storage')
answer = rag_processor.generate_answer(query="What is the topic?")
print(answer)

