In [2]:
import os
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch

# Step 1: Preprocess the PDFs
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Extraction is best-effort: any failure while opening or reading the file
    is reported on stdout and an empty string is returned, so a single bad
    file does not abort a scan over many files.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, or ``""`` on failure.
    """
    try:
        # The context manager closes the document even if reading a page
        # fails; otherwise every processed PDF would stay open.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:  # deliberate catch-all: extraction is best-effort
        print(f"Error reading {pdf_path}: {e}")
        return ""

def extract_texts_from_folder(folder_path):
    """Collect the text of every PDF found under ``folder_path``.

    The folder is walked recursively with ``os.walk``. Files whose name ends
    in ``.pdf`` (case-insensitive) are passed to ``extract_text_from_pdf``;
    files that yield no text are skipped.

    Args:
        folder_path: Root directory to scan.

    Returns:
        One string holding the text of each readable PDF, each followed by a
        newline. Empty when nothing was extracted.
    """
    # Pieces are gathered in a list and joined once at the end, so the cost
    # stays linear in the total text size instead of re-copying on each `+=`.
    collected = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if not file.lower().endswith(".pdf"):
                continue
            text = extract_text_from_pdf(os.path.join(root, file))
            if text:
                collected.append(text + "\n")
    return "".join(collected)

# Step 2: Build a Knowledge Base
def create_embeddings(text, model):
    """Split ``text`` into non-blank lines and encode them with ``model``.

    Args:
        text: Raw text; each newline-separated line is treated as a sentence.
        model: Object exposing ``encode(list_of_str)``.

    Returns:
        A ``(sentences, embeddings)`` tuple, where ``sentences`` is the list
        of lines containing at least one non-whitespace character (kept
        verbatim) and ``embeddings`` is whatever ``model.encode`` returns
        for that list.
    """
    non_blank_lines = []
    for line in text.split('\n'):
        if line.strip():
            non_blank_lines.append(line)

    if len(non_blank_lines) == 0:
        # Only a notice: the (empty) list is still handed to the model.
        print("No valid sentences extracted.")

    return non_blank_lines, model.encode(non_blank_lines)

# Step 3: Implement the Chatbot
class Chatbot:
    """Wrapper pairing a GPT-2 language model with its tokenizer.

    Attributes are public so that callers can hand ``tokenizer`` and
    ``model`` to a training routine:

    * ``tokenizer`` - the ``GPT2Tokenizer`` used to encode and decode text.
    * ``model`` - the ``GPT2LMHeadModel`` used for generation.
    """

    def __init__(self, model_name='gpt2'):
        """Load tokenizer and model weights identified by ``model_name``."""
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self._ensure_pad_token()

    def _ensure_pad_token(self):
        """Use the end-of-sequence token as the padding token."""
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate_response(self, question):
        """Generate a continuation of ``question`` and return it as text.

        Args:
            question: Prompt string.

        Returns:
            The decoded first generated sequence with special tokens
            removed. ``max_length=100`` bounds the sequence returned by
            ``generate``.
        """
        encoded = self.tokenizer(question, return_tensors='pt')

        # The model may no longer live on the CPU (e.g. after a trainer moved
        # it to an accelerator); inputs must sit on the same device.
        device = self.model.device
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        # Training leaves the module in train mode; switch to eval so that
        # dropout does not perturb generation.
        self.model.eval()

        outputs = self.model.generate(
            input_ids,
            # Passing the mask and pad id explicitly avoids relying on
            # guessed defaults when pad and EOS are the same token.
            attention_mask=attention_mask,
            max_length=100,
            num_return_sequences=1,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def save_model(self, save_path):
        """Write model weights and tokenizer files to ``save_path``."""
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

    def load_model(self, load_path):
        """Replace the current model and tokenizer with those at ``load_path``."""
        self.model = GPT2LMHeadModel.from_pretrained(load_path)
        self.tokenizer = GPT2Tokenizer.from_pretrained(load_path)
        self._ensure_pad_token()

# Step 4: Train and Fine-tune the Model
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset turning raw strings into causal-LM training items.

    Each item is a dict of 1-D tensors produced by the tokenizer (padded or
    truncated to ``max_length`` tokens) plus a ``labels`` entry.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        """Store the non-blank entries of ``texts``.

        Args:
            texts: Iterable of strings; entries that are empty or only
                whitespace are dropped.
            tokenizer: Callable tokenizer returning a mapping of tensors with
                a leading batch dimension.
            max_length: Length every example is padded or truncated to.
        """
        self.texts = [text for text in texts if text.strip()]  # Ensure no empty strings
        if not self.texts:
            print("Dataset is empty after filtering.")
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of retained texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and attach training labels.

        Raises:
            IndexError: If ``idx`` is outside the dataset (after printing a
                diagnostic message).
        """
        try:
            text = self.texts[idx]
        except IndexError:
            print(f"Index {idx} out of range for dataset with length {len(self.texts)}")
            raise

        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )
        item = {key: val.squeeze(0) for key, val in encoded.items()}  # Remove batch dimension

        # Labels are a copy of the inputs with padding positions set to -100,
        # the index the language-model loss ignores. Without this the loss is
        # dominated by padding, because short texts are padded to max_length.
        labels = item['input_ids'].clone()
        attention_mask = item.get('attention_mask')
        if attention_mask is not None:
            labels[attention_mask == 0] = -100
        item['labels'] = labels
        return item

def fine_tune_model(texts, tokenizer, model, *, output_dir='./results',
                    num_train_epochs=1, per_device_train_batch_size=4):
    """Fine-tune ``model`` in place on ``texts`` using the ``Trainer`` API.

    Args:
        texts: Iterable of training strings; blank entries are ignored.
        tokenizer: Tokenizer handed to ``CustomDataset``.
        model: Model instance to train; it is updated in place.
        output_dir: Passed to ``TrainingArguments`` as ``output_dir``.
        num_train_epochs: Passed to ``TrainingArguments``.
        per_device_train_batch_size: Passed to ``TrainingArguments``.

    Returns:
        None. Training is skipped (with a message) when ``texts`` contains
        no non-blank entry.
    """
    texts = list(texts)
    # Building a Trainer over an empty dataset is pointless at best, so bail
    # out before any training setup happens.
    if not any(text.strip() for text in texts):
        print("No non-empty texts to fine-tune on; skipping training.")
        return

    dataset = CustomDataset(texts, tokenizer)
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )
    trainer.train()

# Main function to process PDFs and interact with the chatbot
def main(folder_path, save_model_path=None, load_model_path=None):
    """Prepare a chatbot model and run an interactive question loop.

    When ``load_model_path`` is given, the model stored there is loaded and
    the PDFs are not touched. Otherwise the PDFs under ``folder_path`` are
    read, the base model is fine-tuned on their non-blank lines and, if
    ``save_model_path`` is given, the result is saved there.

    Args:
        folder_path: Directory scanned for PDF files (fine-tuning path only).
        save_model_path: Optional directory to save the fine-tuned model to.
        load_model_path: Optional directory of a previously saved model.

    Returns:
        None. Returns early when fine-tuning is requested but no text could
        be extracted.
    """
    sentences = None
    if not load_model_path:
        # PDF parsing and embedding are only needed when fine-tuning, so they
        # are skipped entirely when a saved model is loaded.
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        all_texts = extract_texts_from_folder(folder_path)
        # NOTE(review): the embeddings are computed but not used anywhere in
        # this function - confirm whether retrieval was meant to consume them.
        sentences, _embeddings = create_embeddings(all_texts, embedding_model)

        if not sentences:
            print("No sentences to fine-tune the model on.")
            return

    # Initialize the chatbot
    chatbot = Chatbot()

    if load_model_path:
        chatbot.load_model(load_model_path)
    else:
        fine_tune_model(sentences, chatbot.tokenizer, chatbot.model)
        if save_model_path:
            chatbot.save_model(save_model_path)

    # Interactive loop; typing "exit" (any case) ends the session.
    while True:
        question = input("Ask a question: ").strip()
        if question.lower() == "exit":
            break
        if not question:
            # A blank prompt gives the model nothing to continue from.
            continue
        response = chatbot.generate_response(question)
        print(f"Chatbot: {response}")

# Folder that is scanned for PDF files.
folder_path = r'D:\data\scdata'

# Directory the fine-tuned model is saved to.
save_model_path = os.path.join(r'D:\data', 'fine_tuned_model')

# Point this at a previously saved fine-tuned model to load it instead of
# training; leave as None to fine-tune.
load_model_path = None

if __name__ == "__main__":
    main(
        folder_path,
        save_model_path=save_model_path,
        load_model_path=load_model_path,
    )


Error reading D:\data\scdata\-0___jonew__judis__11340.pdf: Cannot open empty file: filename='D:\\data\\scdata\\-0___jonew__judis__11340.pdf'.
Error reading D:\data\scdata\1-0958___jonew__judis__19685.pdf: Cannot open empty file: filename='D:\\data\\scdata\\1-0958___jonew__judis__19685.pdf'.
Error reading D:\data\scdata\1-1208___jonew__judis__11339.pdf: Cannot open empty file: filename='D:\\data\\scdata\\1-1208___jonew__judis__11339.pdf'.
Error reading D:\data\scdata\1-1478___jonew__judis__20040.pdf: Cannot open empty file: filename='D:\\data\\scdata\\1-1478___jonew__judis__20040.pdf'.
Error reading D:\data\scdata\1-1945___jonew__judis__19816.pdf: Cannot open empty file: filename='D:\\data\\scdata\\1-1945___jonew__judis__19816.pdf'.
Error reading D:\data\scdata\1-6558___jonew__judis__19988.pdf: Cannot open empty file: filename='D:\\data\\scdata\\1-6558___jonew__judis__19988.pdf'.
Error reading D:\data\scdata\10032-1997___jonew__judis__20876.pdf: Cannot open empty file: filename='D:\\data\\scdata\\10032-1997___jonew__judis__20876.pdf'.

KeyboardInterrupt: 