# 1. Reading PDFs

In [1]:
import os
path = "../../../data/examples/module_4/"
os.listdir(path)

['.DS_Store',
 'manifesto_english_speaking.parquet',
 'eu_press_releases_ghg.csv',
 'pisa',
 'manifesto_us.parquet']

In [2]:
path = "../../../data/examples/module_4/pisa/"
os.listdir(path)

['PISA 2015 Results (Volume I).pdf',
 'PISA 2022 Results (Volume I).pdf',
 'PISA 2006 Results (Volume I).pdf']

## 1.1 Read: Pypdf2

In [3]:
import os
from PyPDF2 import PdfReader

def read_pdf(file_path):
    result = {}
    with open(file_path, 'rb') as f:
        pdf = PdfReader(f)
        for page_num, page in enumerate(pdf.pages, start=1):
            result[f"Page {page_num}"] = page.extract_text()
    return result

# Usage
pdf_pisa_2015 = read_pdf(f"{path}/PISA 2015 Results (Volume I).pdf")

## 1.2 Read: pdfplumber

In [4]:
import pdfplumber
import os

def read_pdf(file_path):
    result = {}
    
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            result[f"Page {page_num}"] = text if text else ""
    
    return result

# Usage
pdf_pisa_2015 = read_pdf(f"{path}/PISA 2015 Results (Volume I).pdf")

# 2. Rag

## 2.1 RAG: Basic word counts and weighted word counts (TF-IDF)

### 2.1.1 Generate understanding of document: Word counts

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

def setup_rag(pdf_content, vectorizer_type='tfidf', max_words=500):
    # Extract the documents from the pdf_content dictionary
    documents = list(pdf_content.values())
    # Extract the page keys from the pdf_content dictionary
    page_keys = list(pdf_content.keys())

    # Check the type of vectorizer to use
    if vectorizer_type == 'tfidf':
        # Initialize a TfidfVectorizer with English stop words and 1000 max features
        vectorizer = TfidfVectorizer(stop_words='english', max_features=max_words)
    elif vectorizer_type == 'word_counts':
        # Initialize a CountVectorizer with English stop words and 1000 max features
        vectorizer = CountVectorizer(stop_words='english', max_features=max_words)
    else:
        # Raise an error if the vectorizer type is not 'tfidf' or 'word_counts'
        raise ValueError("vectorizer_type must be 'tfidf' or 'word_counts'")

    # Fit the vectorizer to the documents and transform them into vectors
    doc_vectors = vectorizer.fit_transform(documents)

    # Create a DataFrame from the document vectors
    df = pd.DataFrame(
        # Convert the document vectors to a numpy array
        doc_vectors.toarray(), 
        # Use the feature names (words) as column names
        columns=vectorizer.get_feature_names_out(), 
    )
    # Set the index of the DataFrame to the page numbers (starting from 1)
    df.index = [num+1 for num in df.index]
    # Set the name of the index to 'pdf_page_number'
    df.index.name = 'pdf_page_number'
    df.columns.name = 'word frequency from document'
    # Return the vectorizer, document vectors, documents, page keys, qa_pipeline, and DataFrame
    return vectorizer, doc_vectors, documents, page_keys, df

# Call the setup_rag function with the pdf_pisa_2015 content and 'word_counts' vectorizer type
vectorizer, doc_vectors, documents, page_keys, df = setup_rag(
    pdf_content = pdf_pisa_2015,
    vectorizer_type = 'word_counts',
    max_words = 500
)
# Display the first two rows of the DataFrame
df.head(2)

word frequency from document,00,000,01,02,03,04,05,06,07,08,...,values,variation,viet,vol,volume,web,work,year,years,zealand
pdf_page_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2.1.2 Query undersanding of document: Look for similar word counts

In [5]:
def retrieve_documents(query, vectorizer, doc_vectors, documents, page_keys, number_of_results=3):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, doc_vectors).flatten()
    top_doc_indices = similarities.argsort()[-number_of_results:][::-1]

    retrieved = []
    for idx in top_doc_indices:
        if similarities[idx] > 0:
            retrieved.append({
                'page': page_keys[idx],
                'content': documents[idx],
                'similarity': similarities[idx]
            })
    return pd.DataFrame(retrieved)

pd.set_option('max_colwidth', 400)


retrieve_documents(
    query = "How do economic conditions impact a country's pisa score?",
    vectorizer = vectorizer,
    doc_vectors = doc_vectors,
    documents = documents,
    page_keys = page_keys,
    number_of_results = 5
)

Unnamed: 0,page,content,similarity
0,Page 180,"MATHEMATICS PERFORMANCE AMONG 15‑YEAR‑OLDS\n5\n178\n© OECD 2016 PISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION When interpreting mean performance, only statistically significant differences among countries and economies should \nbe taken into account (see Box I.2.2 in Chapter 2). Figure I.5.1 shows each country’s/economy’s mean score and also indicates for which pairs of...",0.420583
1,Page 189,MATHEMATICS PERFORMANCE AMONG 15‑YEAR‑OLDS\n5\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016\n187\nFigure I.5.5 • Relationship between c Relationship between c hange in mathematics performance hange in mathematics performance \nand average PISA 2012 mathematics scoresand average PISA 2012 mathematics scores\nNotes: Score-point difference in mathematics betw...,0.41957
2,Page 216,"SOCIO‑ECONOMIC STATUS , STUDENT PERFORMANCE AND STUDENTS ’ ATTITUDES TOWARDS SCIENCE\n6\n214\n© OECD 2016 PISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION Box I.6.3. Assessing the skills of non ‑enrolled students in PISA for Development\nThe PISA for Development (PISA-D) initiative launched by the OECD and its partners aims to make PISA more \naccessible and relevant to...",0.409294
3,Page 12,TABLE OF CONTENTS10\n© OECD 2016 PISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION ANNEX B PISA 2015 DATA ................................................................................................................................................................................................... 319\nAnnex B1 Results for countries and economies ..........................,0.408403
4,Page 67,"SCIENCE PERFORMANCE AMONG 15‑YEAR ‑OLDS\n2\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016\n65\nFifteen-year-old students who sit the PISA test may be enrolled in one of two or more grade levels. Based on this \nvariation, past reports have estimated the average score-point difference across adjacent grades for countries in which a sizeable number of 15-year-...",0.407966


## 2.2 RAG: LLM word embeddings

### 2.2.1 Generate understanding of document: word embeddings. 


#### Word embeddings are more sophisticated than word counts because they take into account relationship between words! 



In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import warnings
warnings.filterwarnings('ignore')

def setup_rag(pdf_content, model_name='distilbert-base-uncased', max_length=512):
    # Extract documents and page keys
    documents = list(pdf_content.values())
    page_keys = list(pdf_content.keys())
    
    # Load DistilBERT model and tokenizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    
    # Generate embeddings for all documents
    doc_embeddings = []
    
    print(f"Generating embeddings for {len(documents)} documents...")
    
    with torch.no_grad():
        for i, doc in enumerate(documents):
            # Tokenize document
            inputs = tokenizer(
                doc, 
                return_tensors='pt', 
                truncation=True, 
                padding=True, 
                max_length=max_length
            ).to(device)
            
            # Get embeddings
            outputs = model(**inputs)
            # Use mean pooling of last hidden states
            embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            doc_embeddings.append(embeddings.flatten())
            
            if (i + 1) % 100 == 0:
                print(f"Processed {i + 1}/{len(documents)} documents")
    
    # Convert to numpy array
    doc_embeddings = np.array(doc_embeddings)
    
    # Create DataFrame for visualization (showing embedding dimensions)
    df = pd.DataFrame(
        doc_embeddings,
        columns=[f'dim_{i}' for i in range(doc_embeddings.shape[1])],
        index=[num+1 for num in range(len(documents))]
    )
    df.index.name = 'pdf_page_number'
    df.columns.name = 'embedding_dimensions'
    
    print(f"Setup complete! Generated {doc_embeddings.shape[1]}-dimensional embeddings")
    
    return tokenizer, model, doc_embeddings, documents, page_keys, df

In [7]:
tokenizer, model, doc_embeddings, documents, page_keys, df = setup_rag(
    pdf_content=pdf_pisa_2015
)
df.head(4)

Generating embeddings for 494 documents...
Processed 100/494 documents
Processed 200/494 documents
Processed 300/494 documents
Processed 400/494 documents
Setup complete! Generated 768-dimensional embeddings


embedding_dimensions,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
pdf_page_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.195254,0.007688,0.235429,0.082032,0.223767,0.007843,0.257681,0.326725,-0.150979,-0.20555,...,0.125929,0.00421,0.082262,-0.255817,0.087162,-0.391298,-0.108082,-0.165288,0.067531,0.135007
2,0.326434,0.015919,-0.135683,0.032392,-0.217147,-0.323166,0.175598,-0.138356,0.153253,-0.052729,...,0.002634,0.053211,-0.095615,-0.398742,0.063583,-0.431636,-0.040238,0.104947,-0.273906,-0.039879
3,-0.448043,-0.167676,0.263947,0.140525,0.168139,-0.003707,0.071323,0.256815,-0.12837,-0.246,...,0.312181,0.048495,0.083277,-0.508402,0.119995,-0.437686,-0.081176,-0.210847,0.069687,0.151142
4,-0.326302,0.186165,0.271942,-0.091883,0.412076,-0.306582,-0.079881,0.532862,-0.028512,-0.072974,...,0.084986,-0.019565,0.096858,-0.487536,-0.099612,-0.407267,-0.19206,-0.152539,-0.045045,0.124258


### 2.2.2 Query undersanding of document: Look for embeddings

In [8]:
def retrieve_documents(query, tokenizer, model, doc_embeddings, documents, page_keys, number_of_results=3):
    device = next(model.parameters()).device    
    # Generate query embedding
    with torch.no_grad():
        inputs = tokenizer(
            query, 
            return_tensors='pt', 
            truncation=True, 
            padding=True, 
            max_length=512
        ).to(device)
        outputs = model(**inputs)
        query_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten()
    
    # Calculate cosine similarities
    similarities = cosine_similarity([query_embedding], doc_embeddings).flatten()
    # Get top results
    top_doc_indices = similarities.argsort()[-number_of_results:][::-1]
    # Build results
    retrieved = []
    for idx in top_doc_indices:
        if similarities[idx] > 0:  # Only include positive similarities
            retrieved.append({
                'page': page_keys[idx],
                'content': documents[idx],
                'similarity': similarities[idx]
            })
    
    return pd.DataFrame(retrieved)


# Retrieve documents
pd.set_option('max_colwidth', 400)
df_results = retrieve_documents(
    query="How do economic conditions impact a country's pisa score?",
    tokenizer=tokenizer,
    model=model,
    doc_embeddings=doc_embeddings,
    documents=documents,
    page_keys=page_keys,
    number_of_results=5
)
df_results

Unnamed: 0,page,content,similarity
0,Page 203,"6\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016201\nSocio ‑economic status, \nstudent performance and students’ \nattitudes towards science\nThis chapter defines the dimensions of equity in education: inclusiveness \nand fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies, and then describes how the soci...",0.820918
1,Page 211,"SOCIO‑ECONOMIC STATUS , STUDENT PERFORMANCE AND STUDENTS ’ ATTITUDES TOWARDS SCIENCE\n6\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016\n209\nFigure I.6.3 shows correlations between these contextual factors and the indicators of equity in education presented above. \nAs expected, wealthier countries and economies, and those spending more on their education...",0.815804
2,Page 243,"7\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016241\nImmigrant background, \nstudent performance and students’ \nattitudes towards science\nThis chapter examines differences in performance and attitudes towards \nscience in PISA 2015 by students’ immigrant background. It discusses \nrecent trends in immigration in PISA-participating countries and economies, \na...",0.814725
3,Page 147,"4\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016145\nReading performance \namong 15 ‑year‑olds\nHow well can 15-year-old students understand, use, reflect on and engage \nwith written texts? This chapter compares countries’ and economies’ performance in reading in 2015 and analyses changes over the various PISA assessments. It highlights the differences between...",0.813263
4,Page 177,"5\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016175\nMathematics performance \namong 15 ‑year‑olds\nThis chapter compares countries’ and economies’ performance in \nmathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted. The chapter also...",0.811971


### 2.2.3 Add a column that summarizes the content using hugging face summarization

In [9]:
import warnings
warnings.filterwarnings("ignore")

from transformers import pipeline
summarizer_bart = pipeline("summarization", model="facebook/bart-large-cnn")

df_results['llm_summary_bart'] = df_results['content'].apply(
        lambda text: summarizer_bart(text, #max_length=256, max_new_tokens = 256,
                                min_length=10, do_sample=False)[0]['summary_text']
    )
df_results

Device set to use mps:0


Unnamed: 0,page,content,similarity,llm_summary_bart
0,Page 203,"6\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016201\nSocio ‑economic status, \nstudent performance and students’ \nattitudes towards science\nThis chapter defines the dimensions of equity in education: inclusiveness \nand fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies, and then describes how the soci...",0.820918,This chapter defines the dimensions of equity in education: inclusiveness and fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies. It then describes how the socio-economic status of students and schools is related to student performance and attitudes towards science.
1,Page 211,"SOCIO‑ECONOMIC STATUS , STUDENT PERFORMANCE AND STUDENTS ’ ATTITUDES TOWARDS SCIENCE\n6\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016\n209\nFigure I.6.3 shows correlations between these contextual factors and the indicators of equity in education presented above. \nAs expected, wealthier countries and economies, and those spending more on their education...",0.815804,"PISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016. Access to schooling is a prerequisite for achieving inclusion and equity in education. According to UNESCO, 16.0% of the world’s youth of lower secondary school age were out of school."
2,Page 243,"7\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016241\nImmigrant background, \nstudent performance and students’ \nattitudes towards science\nThis chapter examines differences in performance and attitudes towards \nscience in PISA 2015 by students’ immigrant background. It discusses \nrecent trends in immigration in PISA-participating countries and economies, \na...",0.814725,This chapter examines differences in performance and attitudes towards science in PISA 2015 by students’ immigrant background. It discusses recent trends in immigration in countries and economies and highlights factors associated with low performance among immigrant students.
3,Page 147,"4\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016145\nReading performance \namong 15 ‑year‑olds\nHow well can 15-year-old students understand, use, reflect on and engage \nwith written texts? This chapter compares countries’ and economies’ performance in reading in 2015 and analyses changes over the various PISA assessments. It highlights the differences between...",0.813263,This chapter compares countries’ and economies’ performance in reading in 2015 and analyses changes over the various PISA assessments. The statistical data for Israel are supplied by and under the responsibility of the relevant Israeli authorities. The use of such data by the OECD is without prejudice.
4,Page 177,"5\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016175\nMathematics performance \namong 15 ‑year‑olds\nThis chapter compares countries’ and economies’ performance in \nmathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted. The chapter also...",0.811971,"This chapter compares countries’ and economies’ performance in mathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted."


In [10]:
summarizer_t5 = pipeline("summarization", model="t5-small")

df_results['llm_summary_t5'] = df_results['content'].apply(
        lambda text: summarizer_t5(text, #max_length=256, max_new_tokens = 256,
                                min_length=10, do_sample=False)[0]['summary_text']
    )
df_results

Device set to use mps:0
Your max_length is set to 200, but your input_length is only 189. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)
Token indices sequence length is longer than the specified maximum sequence length for this model (1097 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 200, but your input_length is only 184. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=92)
Your max_length is set to 200, but your input_length is only 167. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=83)
Your max_length is set to 200, but your inp

Unnamed: 0,page,content,similarity,llm_summary_bart,llm_summary_t5
0,Page 203,"6\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016201\nSocio ‑economic status, \nstudent performance and students’ \nattitudes towards science\nThis chapter defines the dimensions of equity in education: inclusiveness \nand fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies, and then describes how the soci...",0.820918,This chapter defines the dimensions of equity in education: inclusiveness and fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies. It then describes how the socio-economic status of students and schools is related to student performance and attitudes towards science.,OECD 2016201 defines the dimensions of equity in education: inclusiveness and fairness . it first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies . then describes how the socio -economic s tatus of students and schools is related to student performance .
1,Page 211,"SOCIO‑ECONOMIC STATUS , STUDENT PERFORMANCE AND STUDENTS ’ ATTITUDES TOWARDS SCIENCE\n6\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016\n209\nFigure I.6.3 shows correlations between these contextual factors and the indicators of equity in education presented above. \nAs expected, wealthier countries and economies, and those spending more on their education...",0.815804,"PISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016. Access to schooling is a prerequisite for achieving inclusion and equity in education. According to UNESCO, 16.0% of the world’s youth of lower secondary school age were out of school.","a student is classified as resilient if he or she is in the bottom quarter of the PISA index of economic, social and cultural status in the country/economies of assessment . in many countries, the goal of universal enrolment in lower and upper secondary education is far from becoming a reality . some countries with similar levels of economic development, investment in education and socio-econo..."
2,Page 243,"7\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016241\nImmigrant background, \nstudent performance and students’ \nattitudes towards science\nThis chapter examines differences in performance and attitudes towards \nscience in PISA 2015 by students’ immigrant background. It discusses \nrecent trends in immigration in PISA-participating countries and economies, \na...",0.814725,This chapter examines differences in performance and attitudes towards science in PISA 2015 by students’ immigrant background. It discusses recent trends in immigration in countries and economies and highlights factors associated with low performance among immigrant students.,"students’ immigrant background, student performance and students’ attitudes towards science in OECD 2016241 . it discusses recent trends in immigration in PISA-participating countries and economies, and highlights factors associated with low performance . the statistical data is supplied by and under the responsibility of the relevant Israeli authorities ."
3,Page 147,"4\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016145\nReading performance \namong 15 ‑year‑olds\nHow well can 15-year-old students understand, use, reflect on and engage \nwith written texts? This chapter compares countries’ and economies’ performance in reading in 2015 and analyses changes over the various PISA assessments. It highlights the differences between...",0.813263,This chapter compares countries’ and economies’ performance in reading in 2015 and analyses changes over the various PISA assessments. The statistical data for Israel are supplied by and under the responsibility of the relevant Israeli authorities. The use of such data by the OECD is without prejudice.,"OECD 2016145 Reading performance among 15 yearolds How well can 15-year-old students understand, use, reflect on and engage with written texts? this chapter compares countries’ and economies’ performance in reading in 2015 . it highlights the differences between girls’ and boys’ performance ."
4,Page 177,"5\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016175\nMathematics performance \namong 15 ‑year‑olds\nThis chapter compares countries’ and economies’ performance in \nmathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted. The chapter also...",0.811971,"This chapter compares countries’ and economies’ performance in mathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted.",OECD 2016175 Mathematics performance among 15 yearolds This chapter compares countries' and economies' performance in mathematics in 2015 . changes in performance since 2003 are highlighted .


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def summarize_text(text, model_name="t5-small", min_length=10):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    input_ids = tokenizer("summarize: " + text, return_tensors="pt").input_ids
    summary_ids = model.generate(input_ids, min_length=min_length, do_sample=False)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

df_results['llm_summary_t5'] = df_results['content'].apply(summarize_text)
df_results

Token indices sequence length is longer than the specified maximum sequence length for this model (1097 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,page,content,similarity,llm_summary_bart,llm_summary_t5
0,Page 203,"6\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016201\nSocio ‑economic status, \nstudent performance and students’ \nattitudes towards science\nThis chapter defines the dimensions of equity in education: inclusiveness \nand fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies, and then describes how the soci...",0.820918,This chapter defines the dimensions of equity in education: inclusiveness and fairness. It first discusses 15-year-olds’ access to schooling in PISA-participating countries and economies. It then describes how the socio-economic status of students and schools is related to student performance and attitudes towards science.,OECD 2016201 defines the dimensions of equity in education: inclusiveness and fairness.
1,Page 211,"SOCIO‑ECONOMIC STATUS , STUDENT PERFORMANCE AND STUDENTS ’ ATTITUDES TOWARDS SCIENCE\n6\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016\n209\nFigure I.6.3 shows correlations between these contextual factors and the indicators of equity in education presented above. \nAs expected, wealthier countries and economies, and those spending more on their education...",0.815804,"PISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016. Access to schooling is a prerequisite for achieving inclusion and equity in education. According to UNESCO, 16.0% of the world’s youth of lower secondary school age were out of school.",OECD 2016 209 shows correlations between these contextual factors and indicators of equity in education
2,Page 243,"7\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016241\nImmigrant background, \nstudent performance and students’ \nattitudes towards science\nThis chapter examines differences in performance and attitudes towards \nscience in PISA 2015 by students’ immigrant background. It discusses \nrecent trends in immigration in PISA-participating countries and economies, \na...",0.814725,This chapter examines differences in performance and attitudes towards science in PISA 2015 by students’ immigrant background. It discusses recent trends in immigration in countries and economies and highlights factors associated with low performance among immigrant students.,OECD 2016241 focuses on student performance and attitudes towards science. a note
3,Page 147,"4\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016145\nReading performance \namong 15 ‑year‑olds\nHow well can 15-year-old students understand, use, reflect on and engage \nwith written texts? This chapter compares countries’ and economies’ performance in reading in 2015 and analyses changes over the various PISA assessments. It highlights the differences between...",0.813263,This chapter compares countries’ and economies’ performance in reading in 2015 and analyses changes over the various PISA assessments. The statistical data for Israel are supplied by and under the responsibility of the relevant Israeli authorities. The use of such data by the OECD is without prejudice.,OECD 2016145 Reading performance among 15 yearolds. a note
4,Page 177,"5\nPISA 2015 RESULTS (VOLUME I): EXCELLENCE AND EQUITY IN EDUCATION © OECD 2016175\nMathematics performance \namong 15 ‑year‑olds\nThis chapter compares countries’ and economies’ performance in \nmathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted. The chapter also...",0.811971,"This chapter compares countries’ and economies’ performance in mathematics in 2015 and analyses the changes in performance since 2003. Changes since the PISA 2012 assessment, when mathematics was most recently the major domain, are highlighted.",OECD 2016175 Mathematics performance among 15 yearolds. changes in


# 3. Generative AI

## 3.1 On your local machine: THIS MODEL IS SO LARGE IT MAY CAUSE YOUR COMPUTER TO CRASH

In [None]:
%%timeit
from transformers import pipeline

pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", device=-1)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

## 3.2 Using GPU in the cloud

In [None]:
from huggingface_hub import InferenceClient

client = InferenceClient(
    provider="together",
    api_key="YOUR_HF_TOKEN",
)

completion = client.chat.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
)

print(completion.choices[0].message)

## 3.3 Using closed source model: Claude

- https://docs.anthropic.com/en/docs/about-claude/models/overview

In [1]:
import anthropic
import os
from dotenv import load_dotenv

load_dotenv()
client = anthropic.Anthropic(api_key=os.getenv("CLAUDE_API"))

def llm_claude(input_text, claude_model="claude-3-haiku"):
    messages = [
        {"role": "user", "content": input_text}
    ]
    chat_completion = client.messages.create(
        model=claude_model,
        max_tokens=100,
        messages=messages
    )
    return chat_completion

# Fixed function call - using the correct function name
claude_response = llm_claude(
    input_text="Tell me a fun fact about Japan.", 
    claude_model="claude-3-5-sonnet-20241022"
)

# Access the response content
print(claude_response.content[0].text)

Here's a fun fact: In Japan, there's a unique forest called Aokigahara near Mount Fuji where compasses don't work properly! This is because the forest grew on top of volcanic rock that contains high levels of magnetic iron, which interferes with magnetic compass readings. The forest is often called the "Sea of Trees" (Jukai) because of its dense foliage.


# 4. Frontier

## 4.1 Agents

In [5]:
import os
from langchain_anthropic import ChatAnthropic
from langchain_community.tools import ArxivQueryRun
from langchain_community.utilities import ArxivAPIWrapper
from langchain.agents import create_react_agent, AgentExecutor
from langchain import hub
from langchain.callbacks import get_openai_callback
from dotenv import load_dotenv

load_dotenv()

def search_with_claude_tracked(query):
    """
    Enhanced arXiv search using Claude with token tracking
    """
    api_key = os.getenv("CLAUDE_API")
    
    # Initialize Claude LLM
    llm = ChatAnthropic(
        model="claude-3-5-sonnet-20241022",
        anthropic_api_key=api_key,
        temperature=0.2,
        max_tokens=1000
    )
    
    # Initialize arXiv tool
    arxiv_tool = ArxivQueryRun(
        api_wrapper=ArxivAPIWrapper(
            top_k_results=3, 
            doc_content_chars_max=800
        )
    )
    
    # Get the ReAct prompt template
    prompt = hub.pull("hwchase17/react")
    
    # Create agent
    agent = create_react_agent(llm, [arxiv_tool], prompt)
    
    # Create agent executor
    agent_executor = AgentExecutor(
        agent=agent,
        tools=[arxiv_tool],
        verbose=True,
        handle_parsing_errors=True
    )
    
    # Track tokens using callback (note: this works better with OpenAI models)
    try:
        with get_openai_callback() as cb:
            result = agent_executor.invoke({
                "input": f"Search arXiv for papers about: {query}. Provide a summary of the most relevant findings."
            })
            
            # Print token usage info
            print(f"\nToken Usage:")
            print(f"Total Tokens: {cb.total_tokens}")
            print(f"Prompt Tokens: {cb.prompt_tokens}")
            print(f"Completion Tokens: {cb.completion_tokens}")
            print(f"Total Cost (USD): ${cb.total_cost}")
            
        return result["output"]
    except Exception as e:
        return f"Error occurred: {str(e)}"

# Run the search
result = search_with_claude_tracked("quantum computing")
print("\nSearch Results:")
print("=" * 50)
print(result)





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mLet me search arXiv for papers about quantum computing.

Action: arxiv
Action Input: quantum computing
[0m


KeyboardInterrupt



## 4.2 MCP

Key MCP Benefits for Economists:
🔌 Unified Data Access: Instead of managing multiple API keys and different data formats, MCP provides a standardized way to connect to:

FRED (Federal Reserve Economic Data)
World Bank APIs
IMF databases
Bureau of Labor Statistics
Custom institutional datasets

🔒 Security: MCP handles authentication and secure data transmission, so economists don't need to worry about API key management or data security protocols.
⚡ Real-time Analysis: The app can automatically fetch the latest economic indicators and update models in real-time through MCP connections.
How the Interaction Works:

MCP Server Setup: Economic institutions expose their data through MCP servers
AI Assistant Integration: The AI assistant (like Claude) connects to these MCP servers
Analysis App: The Streamlit/Gradio app requests data through the AI assistant's MCP connections
Seamless Flow: Economists get unified access to multiple data sources without managing individual APIs

Real-world Use Cases:

Policy Analysis: Fetch real-time inflation, employment, and GDP data for policy impact modeling
Market Research: Compare economic indicators across countries using standardized MCP connections
Academic Research: Access multiple datasets for econometric analysis without API complexity
Risk Assessment: Real-time monitoring of economic indicators for financial risk models

The MCP protocol essentially acts as a universal translator between AI assistants and economic data sources, making it much easier for economists to build sophisticated analysis tools without getting bogged down in data integration challenges.

- https://www.youtube.com/watch?v=p4q6LI-2yZ8&ab_channel=HuggingFace

In [None]:
# MCP-Enabled Economics Analysis App with Streamlit
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import numpy as np

# Simulated MCP Server Integration
class MCPEconomicsServer:
    """
    Simulated MCP server that would connect to real economic data sources
    In practice, this would use the MCP protocol to connect to:
    - FRED (Federal Reserve Economic Data)
    - World Bank API
    - IMF Data
    - Bureau of Labor Statistics
    - Custom institutional databases
    """
    
    def __init__(self):
        self.available_resources = {
            "fred": ["GDP", "unemployment_rate", "inflation_rate", "interest_rates"],
            "world_bank": ["gdp_per_capita", "population", "trade_balance"],
            "bls": ["employment_data", "wage_data", "productivity"]
        }
    
    def get_resource_list(self):
        """List all available economic indicators via MCP"""
        return self.available_resources
    
    def fetch_economic_data(self, indicator, country="US", start_date="2020-01-01", end_date="2024-01-01"):
        """
        Simulate fetching data through MCP protocol
        In reality, this would make secure API calls through MCP servers
        """
        # Simulated data generation for demo
        date_range = pd.date_range(start=start_date, end=end_date, freq='M')
        
        # Generate realistic economic data patterns
        if indicator == "GDP":
            # GDP with growth trend and COVID dip
            base_value = 21000  # Billion USD
            trend = np.linspace(0, 1000, len(date_range))
            covid_impact = np.where((date_range >= '2020-03-01') & (date_range <= '2020-12-01'), -800, 0)
            noise = np.random.normal(0, 100, len(date_range))
            values = base_value + trend + covid_impact + noise
            
        elif indicator == "unemployment_rate":
            # Unemployment with COVID spike
            base_rate = 4.0
            covid_spike = np.where((date_range >= '2020-03-01') & (date_range <= '2021-06-01'), 
                                 np.linspace(0, 10, sum((date_range >= '2020-03-01') & (date_range <= '2021-06-01'))), 0)
            recovery = np.where(date_range > '2021-06-01', 
                              np.linspace(14, 0, sum(date_range > '2021-06-01')), 0)
            noise = np.random.normal(0, 0.2, len(date_range))
            values = base_rate + covid_spike + recovery + noise
            values = np.clip(values, 0, 15)  # Keep realistic bounds
            
        elif indicator == "inflation_rate":
            # Inflation with recent surge
            base_rate = 2.0
            recent_surge = np.where(date_range >= '2021-01-01', 
                                  np.linspace(0, 6, sum(date_range >= '2021-01-01')), 0)
            noise = np.random.normal(0, 0.3, len(date_range))
            values = base_rate + recent_surge + noise
            values = np.clip(values, -1, 10)
            
        else:
            # Generic indicator
            values = np.random.normal(100, 10, len(date_range))
        
        return pd.DataFrame({
            'date': date_range,
            'value': values,
            'indicator': indicator,
            'country': country
        })

# Initialize MCP connection
@st.cache_resource
def initialize_mcp():
    """Initialize MCP server connection"""
    return MCPEconomicsServer()

def main():
    st.set_page_config(page_title="MCP Economics Analysis", layout="wide")
    
    st.title("📊 MCP-Enabled Economic Analysis Dashboard")
    st.markdown("""
    This app demonstrates how economists can use **Model Context Protocol (MCP)** to seamlessly access 
    economic data from multiple sources and perform real-time analysis.
    """)
    
    # Initialize MCP server
    mcp_server = initialize_mcp()
    
    # Sidebar for MCP resource selection
    st.sidebar.header("🔌 MCP Data Sources")
    
    # Display available MCP resources
    resources = mcp_server.get_resource_list()
    st.sidebar.write("**Connected MCP Servers:**")
    for server, indicators in resources.items():
        st.sidebar.write(f"• {server.upper()}: {len(indicators)} indicators")
    
    # Data selection interface
    st.sidebar.header("📈 Analysis Configuration")
    
    # Flatten indicator list for selection
    all_indicators = []
    for server_indicators in resources.values():
        all_indicators.extend(server_indicators)
    
    selected_indicators = st.sidebar.multiselect(
        "Select Economic Indicators",
        options=all_indicators,
        default=["GDP", "unemployment_rate", "inflation_rate"]
    )
    
    country = st.sidebar.selectbox("Country", ["US", "UK", "Germany", "Japan", "China"])
    
    # Date range selection
    col1, col2 = st.sidebar.columns(2)
    with col1:
        start_date = st.date_input("Start Date", datetime(2020, 1, 1))
    with col2:
        end_date = st.date_input("End Date", datetime(2024, 1, 1))
    
    if st.sidebar.button("🔄 Fetch Data via MCP", type="primary"):
        if selected_indicators:
            # Main analysis area
            st.header("📊 Economic Indicators Analysis")
            
            # Fetch data through MCP
            all_data = []
            progress_bar = st.progress(0)
            
            for i, indicator in enumerate(selected_indicators):
                with st.spinner(f"Fetching {indicator} data via MCP..."):
                    data = mcp_server.fetch_economic_data(
                        indicator=indicator,
                        country=country,
                        start_date=start_date.strftime("%Y-%m-%d"),
                        end_date=end_date.strftime("%Y-%m-%d")
                    )
                    all_data.append(data)
                progress_bar.progress((i + 1) / len(selected_indicators))
            
            # Combine all data
            combined_data = pd.concat(all_data, ignore_index=True)
            
            # Create visualizations
            tab1, tab2, tab3, tab4 = st.tabs(["📈 Time Series", "📊 Correlation", "🔍 Statistics", "📋 Raw Data"])
            
            with tab1:
                st.subheader("Time Series Analysis")
                
                # Individual plots for each indicator
                for indicator in selected_indicators:
                    indicator_data = combined_data[combined_data['indicator'] == indicator]
                    
                    fig = px.line(
                        indicator_data, 
                        x='date', 
                        y='value',
                        title=f"{indicator.replace('_', ' ').title()} - {country}",
                        labels={'value': 'Value', 'date': 'Date'}
                    )
                    fig.update_layout(height=400)
                    st.plotly_chart(fig, use_container_width=True)
            
            with tab2:
                st.subheader("Correlation Analysis")
                
                if len(selected_indicators) > 1:
                    # Pivot data for correlation analysis
                    pivot_data = combined_data.pivot(index='date', columns='indicator', values='value')
                    correlation_matrix = pivot_data.corr()
                    
                    # Heatmap
                    fig = px.imshow(
                        correlation_matrix,
                        title="Indicator Correlation Matrix",
                        color_continuous_scale='RdBu_r',
                        aspect='auto'
                    )
                    st.plotly_chart(fig, use_container_width=True)
                    
                    # Correlation insights
                    st.write("**Key Correlations:**")
                    for i, ind1 in enumerate(correlation_matrix.columns):
                        for j, ind2 in enumerate(correlation_matrix.columns):
                            if i < j:  # Avoid duplicates
                                corr_val = correlation_matrix.loc[ind1, ind2]
                                if abs(corr_val) > 0.5:
                                    st.write(f"• {ind1} ↔ {ind2}: {corr_val:.3f}")
                else:
                    st.info("Select multiple indicators to see correlations")
            
            with tab3:
                st.subheader("Statistical Summary")
                
                # Statistics for each indicator
                for indicator in selected_indicators:
                    indicator_data = combined_data[combined_data['indicator'] == indicator]
                    
                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.metric("Mean", f"{indicator_data['value'].mean():.2f}")
                    with col2:
                        st.metric("Std Dev", f"{indicator_data['value'].std():.2f}")
                    with col3:
                        st.metric("Min", f"{indicator_data['value'].min():.2f}")
                    with col4:
                        st.metric("Max", f"{indicator_data['value'].max():.2f}")
                    
                    # Distribution plot
                    fig = px.histogram(
                        indicator_data, 
                        x='value', 
                        title=f"{indicator} Distribution",
                        nbins=20
                    )
                    st.plotly_chart(fig, use_container_width=True)
            
            with tab4:
                st.subheader("Raw Data from MCP Sources")
                st.dataframe(combined_data, use_container_width=True)
                
                # Download functionality
                csv = combined_data.to_csv(index=False)
                st.download_button(
                    label="📥 Download Data as CSV",
                    data=csv,
                    file_name=f"economic_data_{country}_{datetime.now().strftime('%Y%m%d')}.csv",
                    mime="text/csv"
                )
        else:
            st.warning("Please select at least one economic indicator")
    
    # MCP Information Panel
    with st.expander("ℹ️ About MCP Integration"):
        st.markdown("""
        **Model Context Protocol (MCP) Benefits for Economists:**
        
        🔒 **Security**: Secure, authenticated access to economic databases
        
        🔌 **Standardization**: Unified interface across different data providers
        
        🚀 **Real-time**: Live data feeds from central banks, statistical offices
        
        🔄 **Automation**: Automated data updates and model refreshing
        
        📊 **Integration**: Seamless connection between AI analysis and data sources
        
        **Connected MCP Servers in this example:**
        - FRED (Federal Reserve Economic Data)
        - World Bank Open Data
        - Bureau of Labor Statistics
        - Custom institutional databases
        """)

if __name__ == "__main__":
    main()