In [1]:
# Importing Libraries
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from tqdm.auto import tqdm
import torch
import spacy
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
from transformers import T5ForConditionalGeneration, T5Tokenizer
import os
from dotenv import load_dotenv


In [2]:
# Load environment variables
# load_dotenv() reads a local .env file (if any) into the process environment.
load_dotenv()

# Directory that holds the input parquet file and receives generated output.
DATA_PATH = os.getenv('DATA_PATH')
if DATA_PATH is None:
    # Fail here with an actionable message instead of letting a later
    # os.path.join(DATA_PATH, ...) raise an opaque TypeError about NoneType.
    raise RuntimeError(
        "DATA_PATH is not set. Define it in the environment or in a .env file "
        "before running this notebook."
    )

In [3]:
# Load spaCy model
# generate_qa_pairs() calls the module-level `nlp` object to extract named
# entities, so it must be defined on a fresh kernel (Restart & Run All).
nlp = spacy.load('en_core_web_sm')

# Load the data
# The parquet file is expected to live directly under DATA_PATH.
df = pd.read_parquet(os.path.join(DATA_PATH, 'vanderbilt_dsi_blog_posts.parquet'))

# Clean the text 
def clean_text(text):
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. remove URLs (anything starting with ``http`` up to the next whitespace),
      2. drop every non-ASCII character,
      3. drop every character that is not an ASCII letter or whitespace
         (digits and punctuation are removed),
      4. collapse whitespace runs to one space and trim both ends,
      5. lowercase the result.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or a NaN float
            from a missing DataFrame cell) are treated as empty text.

    Returns:
        str: The cleaned text; ``''`` when the input is not a string.
    """
    # Missing cells arrive as None/NaN when this is applied over a DataFrame
    # column; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)
    text = text.encode('ascii', 'ignore').decode()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

def is_english(text):
    """Report whether the language detector classifies `text` as English.

    Any exception raised by the detector is treated as "not English".

    Args:
        text: The text handed to ``detect``.

    Returns:
        bool: True only when ``detect(text)`` returns ``'en'``.
    """
    try:
        language = detect(text)
    except Exception:
        return False
    return language == 'en'

# Drop exact duplicate rows, then derive two columns:
#   clean_text - `content` normalized by clean_text()
#   is_english - whether the cleaned text is detected as English
df = (
    df.drop_duplicates()
      .assign(clean_text=lambda frame: frame['content'].apply(clean_text))
      .assign(is_english=lambda frame: frame['clean_text'].apply(is_english))
)

In [10]:
# Use MPS device if available, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the model and tokenizer (using a smaller model)
# NOTE(review): generate_qa_pairs builds prompts of the form
# "generate question: ... <hl> answer <hl> ..."; confirm that this checkpoint
# has been trained for that prompt format.
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=True)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

def highlight_answer(sentence, answer):
    """Wrap the first standalone occurrence of `answer` in ``<hl>`` markers.

    "Standalone" means the occurrence is not directly preceded or followed by
    a word character, so ``cat`` is not highlighted inside ``category``.

    Args:
        sentence: The sentence to search.
        answer: The literal answer text to mark (treated as plain text, not
            as a regular expression).

    Returns:
        str: `sentence` with the first match replaced by
        ``"<hl> {answer} <hl>"``; `sentence` unchanged when `answer` is empty
        or does not occur as a standalone span.
    """
    if not answer:
        return sentence

    # Lookarounds instead of \b: \b demands a word character on one side of
    # the boundary, so answers that begin or end with punctuation ("$5",
    # "U.S.") could never match. For answers that begin and end with word
    # characters the two forms are equivalent.
    pattern = r'(?<!\w){}(?!\w)'.format(re.escape(answer))
    highlighted = f"<hl> {answer} <hl>"

    # A callable replacement is inserted verbatim; a string replacement would
    # have backslashes in the answer interpreted as escapes/group references.
    return re.sub(pattern, lambda _match: highlighted, sentence, count=1)

def _release_device_cache():
    """Release cached accelerator memory on the backend selected by `device`."""
    if device.type == 'cuda':
        torch.cuda.empty_cache()
    elif device.type == 'mps' and hasattr(torch, 'mps'):
        torch.mps.empty_cache()


def generate_qa_pairs(text, max_answers_per_sentence=2):
    """Generate question/answer pairs from the named entities in `text`.

    The text is split into sentences; for each sentence the named entities
    found by the module-level `nlp` pipeline are used as candidate answers.
    Each answer is highlighted in its sentence and the module-level
    `tokenizer`/`model` (on `device`) generate one question per answer.
    Only generated strings containing a ``?`` are kept.

    Args:
        text: The document text to process.
        max_answers_per_sentence: Maximum number of distinct entities used as
            answers for a single sentence (default 2).

    Returns:
        list[dict]: One dict per kept question with keys ``'question'``,
        ``'answer'`` (the entity text) and ``'context'`` (the source
        sentence). Empty when no sentence yields a usable question.
    """
    qa_pairs = []

    for sentence in sent_tokenize(text):
        doc = nlp(sentence)

        # De-duplicate while keeping first-seen order: highlight_answer marks
        # only the first occurrence, so a repeated entity would otherwise
        # produce an identical prompt (and a duplicate question).
        answers = list(dict.fromkeys(ent.text for ent in doc.ents))
        answers = answers[:max_answers_per_sentence]
        if not answers:
            continue

        inputs = [
            f"generate question: {highlight_answer(sentence, answer)}"
            for answer in answers
        ]

        # All prompts of one sentence are encoded and generated as one batch.
        input_encodings = tokenizer(inputs, return_tensors='pt', padding=True, truncation=True, max_length=512)
        input_encodings = {key: tensor.to(device) for key, tensor in input_encodings.items()}

        outputs = model.generate(
            input_ids=input_encodings['input_ids'],
            attention_mask=input_encodings['attention_mask'],
            max_length=48,
            num_beams=2,
            early_stopping=True
        )

        # outputs[i] corresponds to inputs[i], hence to answers[i].
        for answer, output in zip(answers, outputs):
            question = tokenizer.decode(output, skip_special_tokens=True)
            if question and '?' in question:
                qa_pairs.append({
                    'question': question,
                    'answer': answer,
                    'context': sentence
                })

        # Drop the tensor references first so the cache release can reclaim them.
        del input_encodings, outputs
        _release_device_cache()

    return qa_pairs

# Process a sample of the data: at most 500 rows, drawn reproducibly.
# NOTE(review): the `is_english` column computed earlier is not used to filter
# this sample - confirm whether non-English posts should be excluded.
sample_size = min(500, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# NOTE(review): `clean_text` has punctuation removed and is lowercased (see
# clean_text), so the sentence splitter and entity recognizer inside
# generate_qa_pairs receive unpunctuated lowercase text - confirm this is intended.
all_qa_pairs = []
for text in tqdm(df_sample['clean_text'], total=len(df_sample), desc="Generating QA pairs"):
    all_qa_pairs.extend(generate_qa_pairs(text))

# Convert to DataFrame (one row per generated question/answer/context record)
qa_df = pd.DataFrame(all_qa_pairs)

# Save the QA pairs as JSON Lines: one JSON object per line
output_path = os.path.join(DATA_PATH, 'qa_pairs.jsonl')
qa_df.to_json(output_path, orient='records', lines=True)


Generating QA pairs:   0%|          | 0/168 [00:00<?, ?it/s]



KeyboardInterrupt: 

## Method 2: question-generation pipeline followed by extractive question answering

In [8]:
from transformers import pipeline
import torch
from tqdm import tqdm  # Import tqdm for progress tracking

# Load models
# Pick the accelerator once and share it between both pipelines:
# CUDA GPU 0 if present, otherwise Apple MPS (matching the device choice made
# for the T5 model elsewhere in this notebook), otherwise CPU (-1).
if torch.cuda.is_available():
    pipeline_device = 0
elif torch.backends.mps.is_available():
    pipeline_device = "mps"
else:
    pipeline_device = -1

# Text-to-text model that produces a question from a text prompt.
question_generator = pipeline("text2text-generation",
                              model="valhalla/t5-small-qg-prepend",
                              device=pipeline_device)

# Extractive QA model that selects an answer span from a context.
qa_extractor = pipeline("question-answering",
                        model="distilbert-base-cased-distilled-squad",
                        device=pipeline_device)

qa_pairs = []

# Chunking / sampling limits (hoisted out of the loop; they never change).
chunk_size = 300    # characters per chunk
overlap = 50        # characters shared between consecutive chunks
max_chunks = 5      # only the first chunks of each text are processed
max_questions = 5   # cap on questions answered per text
chunk_step = chunk_size - overlap

# Iterate over content with tqdm for progress tracking
for text in tqdm(df['content'], desc="Processing Content"):
    try:
        # Split text into fixed-size character windows that overlap by `overlap`
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_step)]

        # A dict is used as an insertion-ordered set so the questions that get
        # answered are the same, in the same order, on every run (set
        # iteration order for strings varies between interpreter sessions).
        unique_questions = {}

        for chunk in chunks[:max_chunks]:  # Limit processing for efficiency
            # NOTE(review): confirm that the "generate questions:" prefix matches
            # the input format the question-generation checkpoint expects.
            input_text = f"generate questions: {chunk}"

            # Generate questions for the current chunk
            questions = question_generator(
                input_text,
                max_length=512,
                truncation=True,
                num_beams=5,
                num_return_sequences=1  # Generate one question per chunk
            )

            # Collect unique questions (only outputs that contain a '?')
            for q in questions:
                generated = q['generated_text']
                if '?' in generated:
                    unique_questions.setdefault(generated, None)

        # Extract answers from the full text for up to `max_questions` questions
        for question in list(unique_questions)[:max_questions]:
            answer = qa_extractor(question=question, context=text)
            qa_pairs.append({"question": question, "answer": answer['answer']})
    except Exception as e:
        # Best effort: report the failure and move on to the next text.
        print(f"Error processing text: {e}")
        continue

# Create a DataFrame from the QA pairs
qa_df = pd.DataFrame(qa_pairs)

# Display the first few rows
qa_df.head()


ImportError: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
