In [None]:
!pip install --upgrade  PyPDF2 pdfplumber sentence-transformers faiss-gpu-cu12 transformers torch nltk



In [None]:
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from google.colab import files

In [None]:
import sentence_transformers, transformers
# Report the installed version of each core dependency so that a run can be
# reproduced later. Labels are printed exactly as "<label> <version>".
for _label, _module in (
    ("PyPDF2:", PyPDF2),
    ("NumPy:", np),
    ("Sentence Transformers:", sentence_transformers),
    ("FAISS:", faiss),
    ("Torch:", torch),
    ("Transformers:", transformers),
):
    print(_label, _module.__version__)

PyPDF2: 3.0.1
NumPy: 1.26.4
Sentence Transformers: 4.0.1
FAISS: 1.10.0
Torch: 2.6.0+cu124
Transformers: 4.50.2


In [None]:
def extract_text_from_pdfs(pdf_files):
    """Extract the text of every non-empty page from the given PDF files.

    Args:
        pdf_files: Iterable of paths to PDF files, each opened in binary mode.

    Returns:
        A list with one dict per page that produced non-whitespace text, in
        file order then page order. Each dict has the keys:
            'text'     -- the extracted page text
            'metadata' -- {'pdf_name': <path as given>,
                           'page_number': <1-based page number>}

    Raises:
        OSError: if a file cannot be opened.
        Any exception raised by PyPDF2 while parsing is propagated; the file
        handle is still closed in that case.
    """
    documents = []
    for pdf_name in pdf_files:
        # The context manager closes the handle even if parsing raises.
        with open(pdf_name, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                # Guard against an extractor that yields None for a page
                # (treated the same as an empty page).
                text = page.extract_text() or ""
                if not text.strip():
                    continue
                documents.append({
                    'text': text,
                    'metadata': {'pdf_name': pdf_name, 'page_number': page_number},
                })
    return documents

In [None]:
# PDFs to index; paths are relative to the notebook's working directory.
pdf_files = [
    "pdfs/paper1.pdf",
    "pdfs/paper2.pdf",
    "pdfs/paper3.pdf",
    "pdfs/paper4.pdf",
    "pdfs/paper5.pdf",
    "pdfs/paper6.pdf",
]

# `documents` holds one entry per page that yielded text, so the count printed
# below is a number of pages rather than a number of PDF files.
documents = extract_text_from_pdfs(pdf_files)
print(f"Extracted from {len(documents)} documents")

Extracted from 80 documents


In [None]:
# Preview only the first few extracted pages; displaying the whole list would
# write the full text of every page into the notebook output.
documents[:3]

[{'text': 'Overview of PAN 2024:\nMulti-Author Writing Style Analysis,\nMultilingual Text Detoxification,\nOppositional Thinking Analysis, and\nGenerative AI Authorship Verification\nCondensed Lab Overview\nAbinew Ali Ayele,1Nikolay Babakov,2Janek Bevendorff,3\nXavier Bonet Casals,4Berta Chulvi,5Daryna Dementieva,6Ashaf Elnagar,7\nDayne Freitag,8Maik Fröbe,9Damir Korenčić,10Maximilian Mayerl,11\nDaniil Moskovskiy,12Animesh Mukherjee,13Alexander Panchenko,12\nMartin Potthast,14Francisco Rangel,15Naquee Rizwan,13Paolo Rosso,5,16\nFlorian Schneider,1Alisa Smirnova,17Efstathios Stamatatos,18\nElisei Stakovskii, Benno Stein,19Mariona Taulé,4Dmitry Ustalov,20\nXintong Wang,1Matti Wiegmann,19Seid Muhie Yimam,1and Eva Zangerle21\n1Universität Hamburg, Germany,2Universidade de Santiago de Compostela, Spain\n3Leipzig University, Germany,4Universitat de Barcelona, Spain,5Univ. Politècnica\nde València, Spain,6Technical University of Munich, Germany,7University of\nSharjah, United Arab Emirates,8S

In [None]:
# Embed every extracted page with a sentence-transformer model.
# Use the GPU when one is available; otherwise fall back to CPU so the cell
# still runs on a CPU-only runtime instead of raising.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=embedding_device)
texts = [doc['text'] for doc in documents]
embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
print(f"Generated embeddings with shape: {embeddings.shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Generated embeddings with shape: (80, 384)


In [None]:
# Build an exact (brute-force) L2 index over the page embeddings.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Move the index to GPU 0 when this FAISS build exposes GPU support and a GPU
# is visible; otherwise keep searching on the CPU index. The name `gpu_index`
# is kept in both cases because the search function refers to it.
if hasattr(faiss, 'StandardGpuResources') and faiss.get_num_gpus() > 0:
    res = faiss.StandardGpuResources()  # Use GPU
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
else:
    res = None
    gpu_index = index

# FAISS expects C-contiguous float32 input; this is a no-op when the
# embeddings already have that layout.
gpu_index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
print(f"FAISS index created with {gpu_index.ntotal} vectors.")

FAISS index created with 80 vectors.


In [None]:
def semantic_search(query, top_k=5):
    """Return the indexed pages closest to `query` in embedding space.

    The query is embedded with the module-level `model` and looked up in the
    module-level `gpu_index`; hits are mapped back to entries of `documents`.

    Args:
        query: Query string.
        top_k: Maximum number of hits to return (default 5).

    Returns:
        A list of at most `top_k` dicts, in the order returned by the index,
        each with the keys 'text', 'metadata' (copied from the matching
        `documents` entry) and 'distance' (the distance reported by the index).
        Returns an empty list when `top_k` is not positive or the index is
        empty.
    """
    if top_k <= 0 or gpu_index.ntotal == 0:
        return []

    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = gpu_index.search(query_embedding, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # The index pads missing neighbours (fewer than top_k vectors stored)
        # with -1. Indexing `documents[-1]` would silently return the last
        # page as a bogus hit, so such slots are skipped.
        if idx < 0:
            continue
        document = documents[int(idx)]
        results.append({
            'text': document['text'],
            'metadata': document['metadata'],
            'distance': distance
        })
    return results

In [None]:
def rerank_results(query, results):
    """Order search hits by cosine similarity between query and hit text.

    Every hit's text is re-embedded with the module-level `model` and compared
    with the query embedding.

    Args:
        query: Query string.
        results: List of dicts, each with at least a 'text' key (as produced
            by `semantic_search`). Each dict is updated in place with a
            'cosine_similarity' entry.

    Returns:
        A new list containing the same dicts sorted by 'cosine_similarity',
        highest first. An empty input yields an empty list.
    """
    if not results:
        return []

    query_embedding = model.encode([query], convert_to_numpy=True)[0]
    # Encode all hit texts in one batched call instead of one call per hit.
    doc_embeddings = model.encode(
        [result['text'] for result in results], convert_to_numpy=True
    )

    dot_products = doc_embeddings @ query_embedding
    denominators = np.linalg.norm(doc_embeddings, axis=1) * np.linalg.norm(query_embedding)
    # A zero-norm vector would give 0/0 = NaN, which makes the sort order
    # undefined; score such hits 0.0 instead.
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators > 0,
    )

    for result, similarity in zip(results, similarities):
        result['cosine_similarity'] = similarity
    return sorted(results, key=lambda x: x['cosine_similarity'], reverse=True)

In [None]:
# Load the GPT-2 tokenizer and language model used for answer generation.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the GPU when one is available; otherwise fall back to CPU so that
# loading does not raise on a CPU-only runtime.
gpt2_device = 'cuda' if torch.cuda.is_available() else 'cpu'
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').to(gpt2_device)

In [None]:
def generate_answer(context, query, max_context_chars=500, max_length=1024):
    """Generate an answer to `query` with GPT-2, conditioned on `context`.

    Builds a "Question / Context / Answer:" prompt, samples a continuation
    from the module-level `gpt2_model`, and returns only the newly generated
    text.

    Args:
        context: Retrieved text used as grounding.
        query: The user's question.
        max_context_chars: Number of leading characters of `context` placed in
            the prompt (default 500).
        max_length: Upper bound on prompt plus generated tokens passed to
            `generate` (default 1024).

    Returns:
        The generated continuation with surrounding whitespace stripped.
    """
    input_text = f"Question: {query}\nContext: {context[:max_context_chars]}...\nAnswer:"  # Limit context length
    # Place the inputs on whatever device the model lives on rather than
    # assuming 'cuda'.
    inputs = tokenizer(
        input_text, return_tensors='pt', truncation=True, max_length=512
    ).to(gpt2_model.device)
    outputs = gpt2_model.generate(
        **inputs,  # forwards the attention mask together with the input ids
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    # Decode only the tokens after the prompt. Splitting the full decoded text
    # on "Answer:" returns the wrong segment when the question or context
    # contains that marker, and raises IndexError if the marker is truncated.
    prompt_length = inputs['input_ids'].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [None]:
# Demo questions run through the retrieve -> rerank -> generate pipeline:
# two aimed at a single paper and one that spans the whole collection.
example_queries = [
    "Summarize the results of Paper 1?",
    "What methodology did Paper 2 use?",
    "Across all the papers, which paper had the best results and what techniques did they use?",
]

In [None]:
# Run each demo question through the full pipeline and print the outcome.
for query in example_queries:
    print(f"\nQuery: {query}")

    # Retrieve candidate pages from the index, then order them by cosine
    # similarity to the query.
    initial_results = semantic_search(query, top_k=5)
    reranked_results = rerank_results(query, initial_results)

    # Only the best-ranked page is handed to the generator as context.
    top_context = reranked_results[0]['text']
    answer = generate_answer(top_context, query)
    print(f"Generated Answer: {answer}")

    # Show where the three best-ranked pages came from.
    print("Top-3 Retrieved Results:")
    for i, result in enumerate(reranked_results[:3], 1):
        # Long pages are cut to 200 characters and marked with an ellipsis.
        if len(result['text']) > 200:
            excerpt = result['text'][:200] + "..."
        else:
            excerpt = result['text']
        print(f"{i}. PDF: {result['metadata']['pdf_name']}, Page: {result['metadata']['page_number']}")
        print(f"   Similarity Score: {result['cosine_similarity']:.4f}")
        print(f"   Excerpt: {excerpt}")


Query: Summarize the results of Paper 1?
Generated Answer: in this context, the number of authors in a do... hashemi is also an indicator of the number of authors in a do...
3 4 5
Number of Authors0.00.20.40.60.81.0ScoreT ask 2 F1-Score Over Number of Authors
participant
sengkungsukawaka
hyoungmo
sang
sangchi

young

sangchi

young

4 5 4
Group 1 (1st) 5.7 1.7 2.0 5.7 4.0 4.0 3.0 3.0 4.0 3.0 4.0 3.0 4.0 3.0
5 6 7 8
Group 2 (2nd) 7.6 3.2 4.0 6.0 6.0 4.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100

Loading... Loading...

Quotes are not sourced from all markets and may be delayed up to 20 minutes. Information is provided 'as is' and solely for informational purposes, not for trading purposes or advice.Disclai