# 1. Reading PDFs

In [1]:
import os
path = "../../../data/examples/module_4/"
os.listdir(path)

['.DS_Store',
 'manifesto_english_speaking.parquet',
 'eu_press_releases_ghg.csv',
 'pisa',
 'manifesto_us.parquet']

In [2]:
path = "../../../data/examples/module_4/pisa/"
os.listdir(path)

['PISA 2015 Results (Volume I).pdf',
 'PISA 2022 Results (Volume I).pdf',
 'PISA 2006 Results (Volume I).pdf']

## 1.1 Read: Pypdf2

In [3]:
import os
from PyPDF2 import PdfReader

def read_pdf(file_path):
    result = {}
    with open(file_path, 'rb') as f:
        pdf = PdfReader(f)
        for page_num, page in enumerate(pdf.pages, start=1):
            result[f"Page {page_num}"] = page.extract_text()
    return result

# Usage
pdf_pisa_2015 = read_pdf(f"{path}/PISA 2015 Results (Volume I).pdf")

## 1.2 Read: pdfplumber

In [4]:
import pdfplumber
import os

def read_pdf(file_path):
    result = {}
    
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            result[f"Page {page_num}"] = text if text else ""
    
    return result

# Usage
pdf_pisa_2015 = read_pdf(f"{path}/PISA 2015 Results (Volume I).pdf")

# 2. Rag

## 2.1 RAG: Basic word counts and weighted word counts (TF-IDF)

### 2.1.1 Generate understanding of document: Word counts

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

def setup_rag(pdf_content, vectorizer_type='tfidf', max_words=500):
    # Extract the documents from the pdf_content dictionary
    documents = list(pdf_content.values())
    # Extract the page keys from the pdf_content dictionary
    page_keys = list(pdf_content.keys())

    # Check the type of vectorizer to use
    if vectorizer_type == 'tfidf':
        # Initialize a TfidfVectorizer with English stop words and 1000 max features
        vectorizer = TfidfVectorizer(stop_words='english', max_features=max_words)
    elif vectorizer_type == 'word_counts':
        # Initialize a CountVectorizer with English stop words and 1000 max features
        vectorizer = CountVectorizer(stop_words='english', max_features=max_words)
    else:
        # Raise an error if the vectorizer type is not 'tfidf' or 'word_counts'
        raise ValueError("vectorizer_type must be 'tfidf' or 'word_counts'")

    # Fit the vectorizer to the documents and transform them into vectors
    doc_vectors = vectorizer.fit_transform(documents)

    # Create a DataFrame from the document vectors
    df = pd.DataFrame(
        # Convert the document vectors to a numpy array
        doc_vectors.toarray(), 
        # Use the feature names (words) as column names
        columns=vectorizer.get_feature_names_out(), 
    )
    # Set the index of the DataFrame to the page numbers (starting from 1)
    df.index = [num+1 for num in df.index]
    # Set the name of the index to 'pdf_page_number'
    df.index.name = 'pdf_page_number'
    df.columns.name = 'word frequency from document'
    # Return the vectorizer, document vectors, documents, page keys, qa_pipeline, and DataFrame
    return vectorizer, doc_vectors, documents, page_keys, df

# Call the setup_rag function with the pdf_pisa_2015 content and 'word_counts' vectorizer type
vectorizer, doc_vectors, documents, page_keys, df = setup_rag(
    pdf_content = pdf_pisa_2015,
    vectorizer_type = 'word_counts',
    max_words = 500
)
# Display the first two rows of the DataFrame
df.head(2)

word frequency from document,00,000,01,02,03,04,05,06,07,08,...,values,variation,viet,vol,volume,web,work,year,years,zealand
pdf_page_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2.1.2 Query undersanding of document: Look for similar word counts

In [6]:
def retrieve_documents(query, vectorizer, doc_vectors, documents, page_keys, number_of_results=3):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, doc_vectors).flatten()
    top_doc_indices = similarities.argsort()[-number_of_results:][::-1]

    retrieved = []
    for idx in top_doc_indices:
        if similarities[idx] > 0:
            retrieved.append({
                'page': page_keys[idx],
                'content': documents[idx],
                'similarity': similarities[idx]
            })
    return pd.DataFrame(retrieved)

pd.set_option('max_colwidth', 400)


retrieve_documents(
    query = "How do economic conditions impact a country's pisa score?",
    vectorizer = vectorizer,
    doc_vectors = doc_vectors,
    documents = documents,
    page_keys = page_keys,
    number_of_results = 5
)

Unnamed: 0,page,content,similarity
0,Page 180,"5\nMATHEMATICS PERFORMANCE AMONG 15‑YEAR‑OLDS\nWhen interpreting mean performance, only statistically significant differences among countries and economies should\nbe taken into account (see Box I.2.2 in Chapter 2). Figure I.5.1 shows each country’s/economy’s mean score and\nalso indicates for which pairs of countries/economies the differences between the means are statistically significant.\n...",0.42119
1,Page 189,5\nMATHEMATICS PERFORMANCE AMONG 15‑YEAR‑OLDS\nFigure I.5.5 • RReellaattiioonnsshhiipp bbeettwweeeenn cchhaannggee iinn mmaatthheemmaattiiccss ppeerrffoorrmmaannccee\naanndd aavveerraaggee PPIISSAA 22001122 mmaatthheemmaattiiccss ssccoorreess\nNotes: Score-point difference in mathematics between PISA 2012 and PISA 2015 that are statistically significant are indicated in a darker tone (see Anne...,0.411665
2,Page 216,"6\nSOCIO‑ECONOMIC STATUS, STUDENT PERFORMANCE AND STUDENTS’ ATTITUDES TOWARDS SCIENCE\nBox I.6.3. Assessing the skills of non‑enrolled students in PISA for Development\nThe PISA for Development (PISA-D) initiative launched by the OECD and its partners aims to make PISA more\naccessible and relevant to low- and middle-income countries. PISA-D is enabling a wider range of countries to use\nPISA ...",0.409294
3,Page 12,TABLE OF CONTENTS\nANNEX B PISA 2015 DATA ...................................................................................................................................................................................................319\nAnnex B1 Results for countries and economies ................................................................................................................,0.408403
4,Page 67,"2\nSCIENCE PERFORMANCE AMONG 15‑YEAR‑OLDS\nFifteen-year-old students who sit the PISA test may be enrolled in one of two or more grade levels. Based on this\nvariation, past reports have estimated the average score-point difference across adjacent grades for countries in which\na sizeable number of 15-year-olds are enrolled in at least two different grades. These estimates take into account so...",0.408154


## 2.2 RAG: LLM word embeddings

### 2.2.1 Generate understanding of document: word embeddings. 


#### Word embeddings are more sophisticated than word counts because they take into account relationship between words! 



In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import warnings
warnings.filterwarnings('ignore')

def setup_rag(pdf_content, model_name='distilbert-base-uncased', max_length=512):
    # Extract documents and page keys
    documents = list(pdf_content.values())
    page_keys = list(pdf_content.keys())
    
    # Load DistilBERT model and tokenizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    
    # Generate embeddings for all documents
    doc_embeddings = []
    
    print(f"Generating embeddings for {len(documents)} documents...")
    
    with torch.no_grad():
        for i, doc in enumerate(documents):
            # Tokenize document
            inputs = tokenizer(
                doc, 
                return_tensors='pt', 
                truncation=True, 
                padding=True, 
                max_length=max_length
            ).to(device)
            
            # Get embeddings
            outputs = model(**inputs)
            # Use mean pooling of last hidden states
            embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            doc_embeddings.append(embeddings.flatten())
            
            if (i + 1) % 100 == 0:
                print(f"Processed {i + 1}/{len(documents)} documents")
    
    # Convert to numpy array
    doc_embeddings = np.array(doc_embeddings)
    
    # Create DataFrame for visualization (showing embedding dimensions)
    df = pd.DataFrame(
        doc_embeddings,
        columns=[f'dim_{i}' for i in range(doc_embeddings.shape[1])],
        index=[num+1 for num in range(len(documents))]
    )
    df.index.name = 'pdf_page_number'
    df.columns.name = 'embedding_dimensions'
    
    print(f"Setup complete! Generated {doc_embeddings.shape[1]}-dimensional embeddings")
    
    return tokenizer, model, doc_embeddings, documents, page_keys, df

In [8]:
tokenizer, model, doc_embeddings, documents, page_keys, df = setup_rag(
    pdf_content=pdf_pisa_2015
)
df.head(4)

Generating embeddings for 494 documents...
Processed 100/494 documents
Processed 200/494 documents
Processed 300/494 documents
Processed 400/494 documents
Setup complete! Generated 768-dimensional embeddings


embedding_dimensions,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
pdf_page_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.075916,0.238033,0.548112,-0.246915,0.16713,-0.026873,0.585249,0.408631,-0.443284,0.192898,...,-0.121749,0.184919,0.249631,-0.441683,0.059029,-0.640974,0.172856,-0.432192,-0.230333,0.331368
2,0.326434,0.015919,-0.135683,0.032392,-0.217147,-0.323166,0.175598,-0.138356,0.153253,-0.052729,...,0.002634,0.053211,-0.095615,-0.398742,0.063583,-0.431636,-0.040238,0.104947,-0.273906,-0.039879
3,-0.415022,-0.198494,0.258769,0.165987,0.204136,-0.001144,0.066349,0.320715,-0.141585,-0.246771,...,0.370273,0.016788,0.196641,-0.490617,0.112855,-0.518742,-0.070396,-0.21531,0.073205,0.226541
4,-0.279163,0.133799,0.257101,-0.066509,0.373346,-0.284673,-0.073481,0.540508,-0.024694,-0.063591,...,0.075241,-0.005007,0.10337,-0.486346,-0.064123,-0.421961,-0.181182,-0.140332,-0.034797,0.077958


### 2.2.2 Query undersanding of document: Look for embeddings

In [9]:
def retrieve_documents(query, tokenizer, model, doc_embeddings, documents, page_keys, number_of_results=3):
    device = next(model.parameters()).device    
    # Generate query embedding
    with torch.no_grad():
        inputs = tokenizer(
            query, 
            return_tensors='pt', 
            truncation=True, 
            padding=True, 
            max_length=512
        ).to(device)
        outputs = model(**inputs)
        query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten()
    
    # Calculate cosine similarities
    similarities = cosine_similarity([query_embedding], doc_embeddings).flatten()
    # Get top results
    top_doc_indices = similarities.argsort()[-number_of_results:][::-1]
    # Build results
    retrieved = []
    for idx in top_doc_indices:
        if similarities[idx] > 0:  # Only include positive similarities
            retrieved.append({
                'page': page_keys[idx],
                'content': documents[idx],
                'similarity': similarities[idx]
            })
    
    return pd.DataFrame(retrieved)


# Retrieve documents
pd.set_option('max_colwidth', 400)
df_results = retrieve_documents(
    query="How do economic conditions impact a country's pisa score?",
    tokenizer=tokenizer,
    model=model,
    doc_embeddings=doc_embeddings,
    documents=documents,
    page_keys=page_keys,
    number_of_results=5
)
df_results

Unnamed: 0,page,content,similarity
0,Page 203,"6\nSocio‑economic status,\nstudent performance and students’\nattitudes towards science\nThis chapter defines the dimensions of equity in education: inclusiveness\nand fairness. It first discusses 15-year-olds’ access to schooling in PISA-\nparticipating countries and economies, and then describes how the\nsocio-economic status of students and schools is related to student\nperformance and stu...",0.817688
1,Page 243,"7\nImmigrant background,\nstudent performance and students’\nattitudes towards science\nThis chapter examines differences in performance and attitudes towards\nscience in PISA 2015 by students’ immigrant background. It discusses\nrecent trends in immigration in PISA-participating countries and economies,\nand highlights factors associated with low performance among immigrant\nstudents, includi...",0.811674
2,Page 214,"6\nSOCIO‑ECONOMIC STATUS, STUDENT PERFORMANCE AND STUDENTS’ ATTITUDES TOWARDS SCIENCE\nHong Kong (China), Korea and Viet Nam are all high performers in PISA, with average scores ranging from 515 to\n525 points in science; but while coverage rates stand around 90% in both Hong Kong (China) and Korea, they are only\n64% in B-S-J-G (China) and 49% in Viet Nam (Table I.6.1).\nMoreover, when compar...",0.811078
3,Page 147,"4\nReading performance\namong 15‑year‑olds\nHow well can 15-year-old students understand, use, reflect on and engage\nwith written texts? This chapter compares countries’ and economies’\nperformance in reading in 2015 and analyses changes over the various\nPISA assessments. It highlights the differences between girls’ and boys’\nperformance.\nA note regarding Israel\nThe statistical data for I...",0.810752
4,Page 177,"5\nMathematics performance\namong 15‑year‑olds\nThis chapter compares countries’ and economies’ performance in\nmathematics in 2015 and analyses the changes in performance since\n2003. Changes since the PISA 2012 assessment, when mathematics\nwas most recently the major domain, are highlighted. The chapter also\ndiscusses differences in mathematics performance related to gender.\nA note regard...",0.810587


### 2.2.3 Add a column that summarizes the content using hugging face summarization

In [10]:
import warnings
warnings.filterwarnings("ignore")

from transformers import pipeline
summarizer_bart = pipeline("summarization", model="facebook/bart-large-cnn")

df_results['llm_summary_bart'] = df_results['content'].apply(
        lambda text: summarizer_bart(text, #max_length=256, max_new_tokens = 256,
                                min_length=10, do_sample=False)[0]['summary_text']
    )
df_results

Device set to use mps:0


Unnamed: 0,page,content,similarity,llm_summary_bart
0,Page 203,"6\nSocio‑economic status,\nstudent performance and students’\nattitudes towards science\nThis chapter defines the dimensions of equity in education: inclusiveness\nand fairness. It first discusses 15-year-olds’ access to schooling in PISA-\nparticipating countries and economies, and then describes how the\nsocio-economic status of students and schools is related to student\nperformance and stu...",0.817688,This chapter defines the dimensions of equity in education: inclusiveness and fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies. Then describes how the economic status of students and schools is related to student performance and students’ attitudes towards science.
1,Page 243,"7\nImmigrant background,\nstudent performance and students’\nattitudes towards science\nThis chapter examines differences in performance and attitudes towards\nscience in PISA 2015 by students’ immigrant background. It discusses\nrecent trends in immigration in PISA-participating countries and economies,\nand highlights factors associated with low performance among immigrant\nstudents, includi...",0.811674,This chapter examines differences in performance and attitudes towards science in PISA 2015 by students’ immigrant background. It discusses recent trends in immigration in countries and economies and highlights factors associated with low performance among immigrant students. The statistical data for Israel are supplied by and under the responsibility of the relevant Israeli authorities.
2,Page 214,"6\nSOCIO‑ECONOMIC STATUS, STUDENT PERFORMANCE AND STUDENTS’ ATTITUDES TOWARDS SCIENCE\nHong Kong (China), Korea and Viet Nam are all high performers in PISA, with average scores ranging from 515 to\n525 points in science; but while coverage rates stand around 90% in both Hong Kong (China) and Korea, they are only\n64% in B-S-J-G (China) and 49% in Viet Nam (Table I.6.1).\nMoreover, when compar...",0.811078,"PISA coverage rates stand around 90% in both Hong Kong (China) and Korea, but only 49% in Viet Nam. As previously omitted student populations gain access to school, a larger proportion of low-performing students will be included in PISA samples. Low coverage can also have an impact on the analysis of equity outcomes within or between countries and economies."
3,Page 147,"4\nReading performance\namong 15‑year‑olds\nHow well can 15-year-old students understand, use, reflect on and engage\nwith written texts? This chapter compares countries’ and economies’\nperformance in reading in 2015 and analyses changes over the various\nPISA assessments. It highlights the differences between girls’ and boys’\nperformance.\nA note regarding Israel\nThe statistical data for I...",0.810752,This chapter compares countries’ and economies’performance in reading in 2015 and analyses changes over the various PISA assessments. The statistical data for Israel are supplied by the relevant Israeli authorities.
4,Page 177,"5\nMathematics performance\namong 15‑year‑olds\nThis chapter compares countries’ and economies’ performance in\nmathematics in 2015 and analyses the changes in performance since\n2003. Changes since the PISA 2012 assessment, when mathematics\nwas most recently the major domain, are highlighted. The chapter also\ndiscusses differences in mathematics performance related to gender.\nA note regard...",0.810587,"This chapter compares countries’ and economies’ performance in mathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted. The chapter also discusses differences in mathematics performance related to gender."


In [11]:
summarizer_t5 = pipeline("summarization", model="t5-small")

df_results['llm_summary_t5'] = df_results['content'].apply(
        lambda text: summarizer_t5(text, #max_length=256, max_new_tokens = 256,
                                min_length=10, do_sample=False)[0]['summary_text']
    )
df_results

Device set to use mps:0
Your max_length is set to 200, but your input_length is only 180. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 200, but your input_length is only 184. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=92)
Token indices sequence length is longer than the specified maximum sequence length for this model (1282 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 200, but your input_length is only 167. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=83)
Your max_length is set to 200, but your inp

Unnamed: 0,page,content,similarity,llm_summary_bart,llm_summary_t5
0,Page 203,"6\nSocio‑economic status,\nstudent performance and students’\nattitudes towards science\nThis chapter defines the dimensions of equity in education: inclusiveness\nand fairness. It first discusses 15-year-olds’ access to schooling in PISA-\nparticipating countries and economies, and then describes how the\nsocio-economic status of students and schools is related to student\nperformance and stu...",0.817688,This chapter defines the dimensions of equity in education: inclusiveness and fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies. Then describes how the economic status of students and schools is related to student performance and students’ attitudes towards science.,the chapter defines the dimensions of equity in education: inclusiveness and fairness . it first discusses 15-year-olds’ access to schooling in PISA- participating countries and economies . then describes how the socio-economic status of students and schools is related to student performance and students’ attitudes towards science . the use of such data by the OECD is without prejudice to the ...
1,Page 243,"7\nImmigrant background,\nstudent performance and students’\nattitudes towards science\nThis chapter examines differences in performance and attitudes towards\nscience in PISA 2015 by students’ immigrant background. It discusses\nrecent trends in immigration in PISA-participating countries and economies,\nand highlights factors associated with low performance among immigrant\nstudents, includi...",0.811674,This chapter examines differences in performance and attitudes towards science in PISA 2015 by students’ immigrant background. It discusses recent trends in immigration in countries and economies and highlights factors associated with low performance among immigrant students. The statistical data for Israel are supplied by and under the responsibility of the relevant Israeli authorities.,"students’ immigrant background, student performance and students’ attitudes towards science in PISA 2015 . this chapter examines differences in performance and attitudes toward science in OECD 2016 . it discusses recent trends in immigration in pISA-participating countries ."
2,Page 214,"6\nSOCIO‑ECONOMIC STATUS, STUDENT PERFORMANCE AND STUDENTS’ ATTITUDES TOWARDS SCIENCE\nHong Kong (China), Korea and Viet Nam are all high performers in PISA, with average scores ranging from 515 to\n525 points in science; but while coverage rates stand around 90% in both Hong Kong (China) and Korea, they are only\n64% in B-S-J-G (China) and 49% in Viet Nam (Table I.6.1).\nMoreover, when compar...",0.811078,"PISA coverage rates stand around 90% in both Hong Kong (China) and Korea, but only 49% in Viet Nam. As previously omitted student populations gain access to school, a larger proportion of low-performing students will be included in PISA samples. Low coverage can also have an impact on the analysis of equity outcomes within or between countries and economies.",low coverage can lead to an underestimation of real improvements achieved by education systems that expanded access to schooling and/or improved performance over time . there is a range of analytical strategies to estimate the impact that using proxy results for out-of-school 15-year-olds can have on an education system’s mean performance in PISA . this means that disadvantaged youth are more ...
3,Page 147,"4\nReading performance\namong 15‑year‑olds\nHow well can 15-year-old students understand, use, reflect on and engage\nwith written texts? This chapter compares countries’ and economies’\nperformance in reading in 2015 and analyses changes over the various\nPISA assessments. It highlights the differences between girls’ and boys’\nperformance.\nA note regarding Israel\nThe statistical data for I...",0.810752,This chapter compares countries’ and economies’performance in reading in 2015 and analyses changes over the various PISA assessments. The statistical data for Israel are supplied by the relevant Israeli authorities.,this chapter compares countries’ and economies’ performance in reading in 2015 and analyses changes over the various PISA assessments . it highlights the differences between girls’ and boys’ performance .
4,Page 177,"5\nMathematics performance\namong 15‑year‑olds\nThis chapter compares countries’ and economies’ performance in\nmathematics in 2015 and analyses the changes in performance since\n2003. Changes since the PISA 2012 assessment, when mathematics\nwas most recently the major domain, are highlighted. The chapter also\ndiscusses differences in mathematics performance related to gender.\nA note regard...",0.810587,"This chapter compares countries’ and economies’ performance in mathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted. The chapter also discusses differences in mathematics performance related to gender.",this chapter compares countries’ and economies’ performance in mathematics in 2015 and analyses changes in performance since 2003 . changes in mathematics performance related to gender are highlighted .


In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def summarize_text(text, model_name="t5-small", min_length=10):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    input_ids = tokenizer("summarize: " + text, return_tensors="pt").input_ids
    summary_ids = model.generate(input_ids, min_length=min_length, do_sample=False)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

df_results['llm_summary_t5'] = df_results['content'].apply(summarize_text)
df_results

Token indices sequence length is longer than the specified maximum sequence length for this model (1282 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,page,content,similarity,llm_summary_bart,llm_summary_t5
0,Page 203,"6\nSocio‑economic status,\nstudent performance and students’\nattitudes towards science\nThis chapter defines the dimensions of equity in education: inclusiveness\nand fairness. It first discusses 15-year-olds’ access to schooling in PISA-\nparticipating countries and economies, and then describes how the\nsocio-economic status of students and schools is related to student\nperformance and stu...",0.817688,This chapter defines the dimensions of equity in education: inclusiveness and fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies. Then describes how the economic status of students and schools is related to student performance and students’ attitudes towards science.,the chapter defines the dimensions of equity in education: inclusiveness and fairness. it first discusse
1,Page 243,"7\nImmigrant background,\nstudent performance and students’\nattitudes towards science\nThis chapter examines differences in performance and attitudes towards\nscience in PISA 2015 by students’ immigrant background. It discusses\nrecent trends in immigration in PISA-participating countries and economies,\nand highlights factors associated with low performance among immigrant\nstudents, includi...",0.811674,This chapter examines differences in performance and attitudes towards science in PISA 2015 by students’ immigrant background. It discusses recent trends in immigration in countries and economies and highlights factors associated with low performance among immigrant students. The statistical data for Israel are supplied by and under the responsibility of the relevant Israeli authorities.,"students’ immigrant background, student performance and students’ attitudes towards science in PISA 2015"
2,Page 214,"6\nSOCIO‑ECONOMIC STATUS, STUDENT PERFORMANCE AND STUDENTS’ ATTITUDES TOWARDS SCIENCE\nHong Kong (China), Korea and Viet Nam are all high performers in PISA, with average scores ranging from 515 to\n525 points in science; but while coverage rates stand around 90% in both Hong Kong (China) and Korea, they are only\n64% in B-S-J-G (China) and 49% in Viet Nam (Table I.6.1).\nMoreover, when compar...",0.811078,"PISA coverage rates stand around 90% in both Hong Kong (China) and Korea, but only 49% in Viet Nam. As previously omitted student populations gain access to school, a larger proportion of low-performing students will be included in PISA samples. Low coverage can also have an impact on the analysis of equity outcomes within or between countries and economies.",a range of analytical strategies to estimate the impact of low coverage can have on an education system’
3,Page 147,"4\nReading performance\namong 15‑year‑olds\nHow well can 15-year-old students understand, use, reflect on and engage\nwith written texts? This chapter compares countries’ and economies’\nperformance in reading in 2015 and analyses changes over the various\nPISA assessments. It highlights the differences between girls’ and boys’\nperformance.\nA note regarding Israel\nThe statistical data for I...",0.810752,This chapter compares countries’ and economies’performance in reading in 2015 and analyses changes over the various PISA assessments. The statistical data for Israel are supplied by the relevant Israeli authorities.,this chapter compares countries’ and economies’ performance in reading in 2015. it analyses changes over
4,Page 177,"5\nMathematics performance\namong 15‑year‑olds\nThis chapter compares countries’ and economies’ performance in\nmathematics in 2015 and analyses the changes in performance since\n2003. Changes since the PISA 2012 assessment, when mathematics\nwas most recently the major domain, are highlighted. The chapter also\ndiscusses differences in mathematics performance related to gender.\nA note regard...",0.810587,"This chapter compares countries’ and economies’ performance in mathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted. The chapter also discusses differences in mathematics performance related to gender.","the OECD uses the data without prejudice to the status of the Golan Heights, East Jerusalem"


# 3. Generative AI

## 3.1 On your local machine: THIS MODEL IS SO LARGE IT MAY CAUSE YOUR COMPUTER TO CRASH

In [14]:
from transformers import pipeline
pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", device=-1)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

Device set to use cpu


[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': "Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have.\n</think>\n\nGreetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have."}]}]

In [22]:
from transformers import pipeline
import time

INPUT_QUESTION = 'What is the meaning of life?'

print("Starting text generation pipeline setup...")
start_time = time.time()
pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", device=-1)
setup_end_time = time.time()
print(f"Pipeline setup completed in {setup_end_time - start_time:.2f} seconds.")

messages = [
    {"role": "user", "content": INPUT_QUESTION},
]

print("\nStarting text generation...")
generation_start_time = time.time()
output = pipe(messages)
generation_end_time = time.time()
print(f"Text generation completed in {generation_end_time - generation_start_time:.2f} seconds.")

print("\nGenerated Output:")
print(output)

Starting text generation pipeline setup...


Device set to use cpu


Pipeline setup completed in 18.87 seconds.

Starting text generation...
Text generation completed in 62.77 seconds.

Generated Output:
[{'generated_text': [{'role': 'user', 'content': 'What is the meaning of life?'}, {'role': 'assistant', 'content': 'Okay, so I\'m trying to figure out what the question is asking. It\'s asking, "What is the meaning of life?" That\'s a pretty broad and deep question. I know that the meaning of life is a big topic in philosophy and theology, but I\'m not entirely sure about all the different viewpoints. Let me think through this step by step.\n\nFirst, I remember that there are different branches of philosophy that approach this question. I think the main ones are existentialism, virtue ethics, and theism. Maybe there\'s also the concept of meaning as happiness or fulfillment. I should consider each of these perspectives and see how they approach the question.\n\nExistentialism, as I recall, emphasizes the individual\'s search for meaning in their own lif

## 3.2 Using GPU in the cloud

In [17]:
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
load_dotenv()

client = InferenceClient(
    provider="together",
    api_key=os.getenv("HF_API")
)

completion = client.chat.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
)

print(completion.choices[0].message)

ChatCompletionOutputMessage(role='assistant', content=' The capital of France is Paris. It is one of the most famous cities in the world, known for its rich history, art, culture, and landmarks such as the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral. Paris is also the political, economic, and cultural center of France.', tool_call_id=None, tool_calls=[])


## 3.3 Using closed source model: Claude

- https://docs.anthropic.com/en/docs/about-claude/models/overview

In [18]:
import anthropic
import os
from dotenv import load_dotenv

load_dotenv()
client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API"))

def llm_claude(input_text, claude_model="claude-3-haiku"):
    messages = [
        {"role": "user", "content": input_text}
    ]
    chat_completion = client.messages.create(
        model=claude_model,
        max_tokens=100,
        messages=messages
    )
    return chat_completion

# Fixed function call - using the correct function name
claude_response = llm_claude(
    input_text="Tell me a fun fact about Japan.", 
    claude_model="claude-3-5-sonnet-20241022"
)

# Access the response content
print(claude_response.content[0].text)

Here's a fun fact: In Japan, there is an island called Ōkunoshima, also known as "Rabbit Island," which is home to hundreds of friendly wild rabbits. The island was once used as a secret military site for chemical weapons during World War II, but today it's a popular tourist destination where visitors can feed and interact with the numerous rabbits that roam freely throughout the island. No one knows exactly how the rabbits got there, but one theory suggests


# 4. Frontier

## 4.A Agents overview


## Agency Levels Overview

| Level | Description | Pattern Name | Example |
|-------|-------------|--------------|---------|
| ☆☆☆ | No program flow impact | Simple processor | `process_llm_output()` |
| ★☆☆ | Basic control flow | Router | `if llm_decision(): path_a()` |
| ★★☆ | Function execution | Tool caller | `run_function(tool, args)` |
| ★★★ | Iteration control | Multi-step Agent | `while llm_continue()` |
| ★★★ | Workflow orchestration | Multi-Agent | `if trigger(): execute_agent()` |

## Simple Examples with Smolagents

### ☆☆☆ Simple Processor
```python
from smolagents import CodeAgent

agent = CodeAgent()
result = agent.run("Analyze this text: 'Hello world'")
# Agent just processes input, no control flow impact
print(f"Analysis: {result}")
```

### ★☆☆ Router
```python
from smolagents import CodeAgent

agent = CodeAgent()
decision = agent.run("Should I use method A or B for this task?")

if "method A" in decision.lower():
    execute_method_a()
else:
    execute_method_b()
```

### ★★☆ Tool Caller
```python
from smolagents import CodeAgent, Tool

class Calculator(Tool):
    name = "calculator"
    def forward(self, expression: str):
        return eval(expression)

agent = CodeAgent(tools=[Calculator()])
result = agent.run("Calculate 25 * 4 + 10")
# Agent chooses tool and arguments automatically
```

### ★★★ Multi-step Agent
```python
from smolagents import CodeAgent

agent = CodeAgent()
task = "Write a complete data analysis report"

while True:
    step = agent.run(f"Next step for: {task}")
    if "complete" in step.lower():
        break
    execute_step(step)
```

### ★★★ Multi-Agent Orchestration
```python
from smolagents import CodeAgent

main_agent = CodeAgent()
data_agent = CodeAgent()
viz_agent = CodeAgent()

decision = main_agent.run("What analysis do we need?")

if "data processing" in decision:
    data_result = data_agent.run("Process the dataset")
    
if "visualization" in decision:
    viz_result = viz_agent.run("Create charts from data")
```

## Useful Links

- **Smolagents Documentation**: https://huggingface.co/docs/smolagents
- **GitHub Repository**: https://github.com/huggingface/smolagents
- **Getting Started Guide**: https://huggingface.co/docs/smolagents/quickstart
- **Tool Creation Tutorial**: https://huggingface.co/docs/smolagents/tools
- **Agent Examples**: https://huggingface.co/docs/smolagents/examples

## 4.1 Agent: Smolagents on PISA PDF

### 4.1.1 Chunk document

In [23]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
# Read your PDF
pdf_pisa_2015 = read_pdf(f"{path}/PISA 2015 Results (Volume I).pdf")


# Convert PDF pages to Document objects
source_docs = [
    Document(
        page_content=text, 
        metadata={
            "source": "PISA_2015", 
            "page": page_key,
            "file_name": "PISA 2015 Results (Volume I).pdf"
        }
    )
    for page_key, text in pdf_pisa_2015.items()
    if text and text.strip()  # Only include pages with actual content
]

In [25]:
# Split documents into smaller chunks for better retrieval
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Characters per chunk
    chunk_overlap=50,  # Overlap between chunks to maintain context
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],  # Priority order for splitting
)
docs_processed = text_splitter.split_documents(source_docs)

print(f"PDF knowledge base prepared with {len(docs_processed)} document chunks from {len(source_docs)} pages")

# Now you can create your retriever with the PDF content
retriever = BM25Retriever.from_documents(docs_processed)

PDF knowledge base prepared with 5364 document chunks from 486 pages


### 4.1.2 Create retriver tool

In [27]:
from smolagents import Tool

class RetrieverTool(Tool):
    name = "retriever"
    description = "Uses semantic search to retrieve the parts of the PISA 2015 Results document that could be most relevant to answer your query about PISA findings, educational outcomes, and student performance data."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target content in the PISA 2015 document. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        super().__init__(**kwargs)
        # Initialize the retriever with our processed documents
        self.retriever = BM25Retriever.from_documents(
            docs, k=10  # Return top 10 most relevant documents
        )

    def forward(self, query: str) -> str:
        """Execute the retrieval based on the provided query."""
        assert isinstance(query, str), "Your search query must be a string"

        # Retrieve relevant documents
        docs = self.retriever.invoke(query)

        # Format the retrieved documents for readability with page information
        return "\nRetrieved documents from PISA 2015:\n" + "".join(
            [
                f"\n\n===== Document {str(i)} (from {doc.metadata.get('page', 'Unknown page')}) =====\n" + doc.page_content
                for i, doc in enumerate(docs)
            ]
        )

# Initialize our retriever tool with the processed documents
retriever_tool = RetrieverTool(docs_processed)

### 4.1.3 Place Retriver tool in agent

In [31]:
from smolagents import InferenceClientModel, CodeAgent

# Initialize the agent with our retriever tool
agent = CodeAgent(
    tools=[retriever_tool],  # List of tools available to the agent
    model=InferenceClientModel(api_key = os.getenv("HF_API")),  # Default model "Qwen/Qwen2.5-Coder-32B-Instruct"
    max_steps=2,  # Limit the number of reasoning steps
    verbosity_level=2,  # Show detailed agent reasoning
)

# To use a specific model, you can specify it like this:
model=InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct")

### 4.1.4 Run agent!

In [32]:
# Ask a question that requires retrieving information
question = "How do economic conditions impact a country's pisa score?"

# Run the agent to get an answer
agent_output = agent.run(question)

# Display the final answer
print("\nFinal answer:")
print(agent_output)


Final answer:
The impact of economic conditions on a country's PISA scores is nuanced and generally shows a moderate relationship, according to the PISA 2015 data. Specifically:

1. **Moderate Correlation**: Overall, there are moderate correlation coefficients (with values r<0.5), indicating that differences in countries’ socio-economic conditions play a relatively minor role in explaining levels of equity in education and PISA performance.

2. **Variation Across Countries**: Countries with similar levels of economic development, investment in education, and socio-economic diversity can have both more and less equitable school systems and PISA scores. This suggests that while economic conditions have an influence, they are not the sole determinant.

3. **Socio-economic Impact on Opportunities**: Despite overall economic and educational investments, socio-economic status continues to have an impact on students' performance. In many countries, even when the education system performs wel

## 4.2 Agent: Claude agent on ARXIV

In [33]:
import os
from langchain_anthropic import ChatAnthropic
from langchain_community.tools import ArxivQueryRun
from langchain_community.utilities import ArxivAPIWrapper
from langchain.agents import create_react_agent, AgentExecutor
from langchain import hub
from langchain.callbacks import get_openai_callback
from dotenv import load_dotenv

load_dotenv()

def search_with_claude_tracked(query):
    """
    Enhanced arXiv search using Claude with token tracking
    """
    api_key = os.getenv("CLAUDE_API")
    
    # Initialize Claude LLM
    llm = ChatAnthropic(
        model="claude-3-5-sonnet-20241022",
        anthropic_api_key=api_key,
        temperature=0.2,
        max_tokens=1000
    )
    
    # Initialize arXiv tool
    arxiv_tool = ArxivQueryRun(
        api_wrapper=ArxivAPIWrapper(
            top_k_results=3, 
            doc_content_chars_max=800
        )
    )
    
    # Get the ReAct prompt template
    prompt = hub.pull("hwchase17/react")
    
    # Create agent
    agent = create_react_agent(llm, [arxiv_tool], prompt)
    
    # Create agent executor
    agent_executor = AgentExecutor(
        agent=agent,
        tools=[arxiv_tool],
        verbose=True,
        handle_parsing_errors=True
    )
    
    # Track tokens using callback (note: this works better with OpenAI models)
    try:
        with get_openai_callback() as cb:
            result = agent_executor.invoke({
                "input": f"Search arXiv for papers about: {query}. Provide a summary of the most relevant findings."
            })
            
            # Print token usage info
            print(f"\nToken Usage:")
            print(f"Total Tokens: {cb.total_tokens}")
            print(f"Prompt Tokens: {cb.prompt_tokens}")
            print(f"Completion Tokens: {cb.completion_tokens}")
            print(f"Total Cost (USD): ${cb.total_cost}")
            
        return result["output"]
    except Exception as e:
        return f"Error occurred: {str(e)}"

# Run the search
result = search_with_claude_tracked("quantum computing")
print("\nSearch Results:")
print("=" * 50)
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll search arXiv for papers about quantum computing and summarize the key findings.

Action: arxiv
Action Input: quantum computing
[0m[36;1m[1;3mNo good Arxiv Result was found[0m[32;1m[1;3mLet me try a more specific search query to get better results.

Action: arxiv
Action Input: "quantum computation recent advances"
[0m[36;1m[1;3mNo good Arxiv Result was found[0m[32;1m[1;3mI apologize for the technical difficulties with the arXiv search. While I'm unable to retrieve specific papers at the moment, I can provide a general overview of quantum computing based on established knowledge:

Final Answer: 
Quantum computing is a rapidly evolving field that leverages quantum mechanical phenomena such as superposition and entanglement to perform computations. Key aspects include:

1. Quantum Bits (Qubits): Unlike classical bits that can be either 0 or 1, qubits can exist in multiple states simultaneously due to superpositio

## 4.3 Model Context Protocol (MCP)
MCP is a standardized protocol that enables AI assistants to securely connect to external data sources and tools through a unified interface.
Core Concept
Instead of managing multiple APIs, authentication methods, and data formats, MCP acts as a universal translator between AI systems and external resources. Think of it as a standardized plug-and-play system for AI integrations.
How MCP Works

MCP Servers - Data providers (like FRED, World Bank, or custom databases) expose their resources through MCP servers
AI Assistant Integration - AI systems like Claude connect to these MCP servers using the standard protocol
Unified Access - Applications can request data through the AI assistant's MCP connections seamlessly

Key Benefits
🔒 Security

Centralized authentication and secure data transmission
No need to manage individual API keys or security protocols

🔌 Standardization

Consistent interface across different data providers
Reduces integration complexity for developers

⚡ Real-time Access

Live data feeds from multiple sources
Automatic updates without manual intervention

🎯 Simplified Development

Focus on analysis rather than data integration
Eliminates API management overhead

Practical Applications

Economic Analysis: Access FRED, World Bank, IMF data through one interface
Research: Query multiple academic databases simultaneously
Business Intelligence: Connect to internal and external data sources
Financial Modeling: Real-time market data integration

The Value Proposition
MCP transforms data integration from a complex technical challenge into a simple connection process, allowing users to focus on analysis and insights rather than API management and data wrangling.