In [None]:
# 📦 INSTALLATION & SETUP
# IMPORTANT: Enable GPU in Colab (Runtime → Change runtime type → T4 GPU)

!pip install feedparser tiktoken sentence-transformers chromadb langchain langchain-community datasets transformers torch matplotlib pandas seaborn scikit-learn accelerate -q

# Import all libraries
import feedparser
import tiktoken
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import torch
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Report the runtime environment: confirm the imports above succeeded and show
# the PyTorch build plus whether CUDA is usable.
cuda_ready = torch.cuda.is_available()
print("✅ All packages installed successfully!")
print(f"🔥 PyTorch version: {torch.__version__}")
print(f"🎯 CUDA available: {cuda_ready}")

# Tell the user whether a GPU is attached; when it is not, print the steps
# needed to enable one in Colab.
if cuda_ready:
    print(f"\n✅ GPU ENABLED: {torch.cuda.get_device_name(0)}")
    print("🚀 Training will be fast!")
else:
    banner = "=" * 80
    print("\n" + banner)
    print("⚠️  WARNING: GPU NOT ENABLED - TRAINING WILL BE VERY SLOW!")
    print(banner)
    for help_line in (
        "\n🔧 HOW TO ENABLE T4 GPU:",
        "   1. Click 'Runtime' in top menu",
        "   2. Click 'Change runtime type'",
        "   3. Select 'T4 GPU' from Hardware accelerator",
        "   4. Click 'Save' and re-run this cell",
        "\n⏱️  WITHOUT GPU: 2-4 hours | WITH GPU: 15-20 minutes",
    ):
        print(help_line)
    print(banner)

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.7/20.7 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
# TASK 1: NEWS RETRIEVAL SYSTEM FROM RPP RSS FEED
print("🔄 Fetching news from RPP Perú RSS feed...")

# Parse the RSS feed
rss_url = "https://rpp.pe/rss"
feed = feedparser.parse(rss_url)

# Stop here with a clear message when nothing came back. Otherwise the cell
# would report success for 0 articles and the following cells would fail with
# an unrelated KeyError/IndexError on an empty DataFrame.
# NOTE(review): feedparser is expected to record fetch/parse problems under
# 'bozo_exception' on the result instead of raising — confirm.
if not feed.entries:
    raise RuntimeError(
        f"No entries retrieved from {rss_url}: "
        f"{feed.get('bozo_exception', 'feed returned no items')}"
    )

# Extract the first 50 items of the feed; fields missing from an entry
# default to an empty string.
news_data = [
    {
        'title': entry.get('title', ''),
        'description': entry.get('summary', ''),
        'link': entry.get('link', ''),
        'published': entry.get('published', ''),
    }
    for entry in feed.entries[:50]
]

# Convert to DataFrame (explicit columns keep the schema stable)
df_rpp = pd.DataFrame(
    news_data,
    columns=['title', 'description', 'link', 'published'],
)

print(f"✅ Successfully loaded {len(df_rpp)} news articles from RPP")
print(f"\nColumns: {list(df_rpp.columns)}")
print(f"\n📊 First 3 articles:")
# Last expression of the cell so the notebook renders the preview table.
df_rpp.head(3)

🔄 Fetching news from RPP Perú RSS feed...
✅ Successfully loaded 50 news articles from RPP

Columns: ['title', 'description', 'link', 'published']

📊 First 3 articles:


Unnamed: 0,title,description,link,published
0,Tabla de posiciones de Liga 1 Te Apuesto 2025 ...,Sigue el movimiento de las posiciones y cómo v...,https://rpp.pe/futbol/descentralizado/tabla-ac...,"Thu, 23 Oct 2025 22:10:26 -0500"
1,Cusco: niña de tres años sufre grave lesión en...,"La menor, tras ser sometida a la cirugía, fue ...",https://rpp.pe/peru/actualidad/cusco-nina-de-t...,"Thu, 23 Oct 2025 22:02:55 -0500"
2,¡Con sabor a 'tri'! Universitario venció 1-0 a...,Alex Valera le dio la victoria a Universitario...,https://rpp.pe/futbol/descentralizado/universi...,"Thu, 23 Oct 2025 22:00:08 -0500"


In [None]:
# 1️⃣ TOKENIZATION
print("🔤 Tokenizing sample article...")

# Encoder used only to count tokens.
# NOTE(review): cl100k_base counts may differ from the tokenizer of the
# embedding model used later in the notebook — treat the 512 check below as an
# estimate and confirm against that model's own limit.
encoding = tiktoken.get_encoding("cl100k_base")

# The first article serves as the sample: title and description joined by a space.
first_article = df_rpp.iloc[0]
sample_text = first_article['title'] + " " + first_article['description']

# Encode the sample and count the resulting tokens.
tokens = encoding.encode(sample_text)
num_tokens = len(tokens)

print(f"\n📝 Sample Article:")
print(f"Title: {first_article['title']}")
print(f"\nDescription: {first_article['description'][:200]}...")
print(f"\n🔢 Token count: {num_tokens}")

# Compare the sample against the context budget to decide whether chunking is needed.
context_limit = 512
fits_in_context = num_tokens <= context_limit
if fits_in_context:
    print(f"✅ Text fits within {context_limit} token limit")
else:
    print(f"⚠️  Text exceeds {context_limit} tokens - chunking recommended")

# Build the combined text for every article and record its token count.
df_rpp['full_text'] = df_rpp['title'].str.cat(df_rpp['description'], sep=" ")
df_rpp['num_tokens'] = [
    len(encoding.encode(article_text)) for article_text in df_rpp['full_text']
]

token_counts = df_rpp['num_tokens']
print(f"\n📊 Token Statistics:")
print(f"Mean tokens: {token_counts.mean():.2f}")
print(f"Max tokens: {token_counts.max()}")
print(f"Min tokens: {token_counts.min()}")

🔤 Tokenizing sample article...

📝 Sample Article:
Title: Tabla de posiciones de Liga 1 Te Apuesto 2025 EN VIVO: Acumulado y resultados tras la victoria de Universitario

Description: Sigue el movimiento de las posiciones y cómo va la Tabla Acumulada de la Liga1 Te Apuesto 2025. Universitario derrotó a Sporting Cristal y quedó cerca de llevarse el Clausura....

🔢 Token count: 81
✅ Text fits within 512 token limit

📊 Token Statistics:
Mean tokens: 83.84
Max tokens: 129
Min tokens: 55


In [None]:
# 2️⃣ EMBEDDING
print("🧬 Generating embeddings using SentenceTransformers...")

# Instantiate the sentence-transformers model; `model_name` is reused by the
# ChromaDB and LangChain cells further down.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(model_name)

print(f"✅ Loaded model: {model_name}")
embedding_dim = embedding_model.get_sentence_embedding_dimension()
print(f"📐 Embedding dimension: {embedding_dim}")

# Encode the combined title + description text of every article in one call.
texts_to_embed = list(df_rpp['full_text'])
embeddings = embedding_model.encode(texts_to_embed, show_progress_bar=True)

n_vectors = len(embeddings)
print(f"\n✅ Generated {n_vectors} embeddings")
print(f"📊 Embedding shape: {embeddings.shape}")

# Store one vector per row alongside the article it was computed from.
df_rpp['embedding'] = list(embeddings)

🧬 Generating embeddings using SentenceTransformers...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Loaded model: sentence-transformers/all-MiniLM-L6-v2
📐 Embedding dimension: 384


Batches:   0%|          | 0/2 [00:00<?, ?it/s]


✅ Generated 50 embeddings
📊 Embedding shape: (50, 384)


In [None]:
# 3️⃣ CREATE CHROMADB COLLECTION
print("💾 Creating ChromaDB collection...")

# Initialize ChromaDB client
chroma_client = chromadb.Client()

collection_name = "rpp_news_collection"

# Drop any collection left over from a previous run of this cell so that
# create_collection below starts from a clean slate.
try:
    chroma_client.delete_collection(name=collection_name)
except Exception:
    # Best effort: on a first run there is nothing to delete.
    # `Exception` replaces the former bare `except:` so KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    # NOTE(review): the exception type for a missing collection appears to vary
    # between chromadb releases, hence no narrower class — confirm.
    pass
else:
    print("🗑️  Deleted existing collection")

# Embedding function the collection uses to embed documents and query texts;
# it is built from the same model name as the embedding cell.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_name
)

collection = chroma_client.create_collection(
    name=collection_name,
    embedding_function=sentence_transformer_ef
)

# Prepare documents for ChromaDB: the combined text is the document body and
# title/link/published travel along as per-document metadata.
documents = df_rpp['full_text'].tolist()
metadatas = df_rpp[['title', 'link', 'published']].to_dict('records')
ids = [f"doc_{i}" for i in range(len(df_rpp))]

# Add documents to collection
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

print(f"✅ Collection '{collection_name}' created successfully")
print(f"📊 Total documents in collection: {collection.count()}")

💾 Creating ChromaDB collection...
✅ Collection 'rpp_news_collection' created successfully
📊 Total documents in collection: 50


In [None]:
# 4️⃣ QUERY RESULTS
print("🔍 Querying the collection...")

# Query with Spanish economic news
query = "Últimas noticias de economía"
print(f"\n📝 Query: '{query}'")

# Perform similarity search, asking for the 5 closest documents
results = collection.query(
    query_texts=[query],
    n_results=5
)

# Index 0 selects the hits for the single query text submitted above.
hit_metadatas = results['metadatas'][0]
hit_documents = results['documents'][0]

# Format results as DataFrame rows
retrieved_docs = []
for metadata, document in zip(hit_metadatas, hit_documents):
    title = metadata['title']
    # The indexed document is "<title> <description>", so using it verbatim
    # repeated the title inside the description column. Strip the title prefix
    # to recover the description; fall back to the full document otherwise.
    if document.startswith(title):
        description = document[len(title):].lstrip()
    else:
        description = document
    retrieved_docs.append({
        'title': title,
        'description': description,
        'link': metadata['link'],
        'date_published': metadata['published'],
    })

# Explicit columns keep the schema stable even when there are no hits.
df_results = pd.DataFrame(
    retrieved_docs,
    columns=['title', 'description', 'link', 'date_published'],
)

print(f"\n✅ Found {len(df_results)} relevant articles\n")
print("="*80)
for rank, hit in enumerate(retrieved_docs, start=1):
    print(f"\n🔹 Result {rank}:")
    print(f"Title: {hit['title']}")
    print(f"Published: {hit['date_published']}")
    print(f"Link: {hit['link']}")
    print("-"*80)

# Last expression of the cell so the notebook renders the results table.
df_results

🔍 Querying the collection...

📝 Query: 'Últimas noticias de economía'

✅ Found 5 relevant articles


🔹 Result 1:
Title: MEF y el presidente Jerí revisan el Presupuesto Público 2026: “Cada sol invertido debe traducirse en bienestar”
Published: Thu, 23 Oct 2025 19:18:25 -0500
Link: https://rpp.pe/economia/economia/mef-y-el-presidente-jeri-revisan-el-presupuesto-publico-2026-cada-sol-invertido-debe-traducirse-en-bienestar-noticia-1660716
--------------------------------------------------------------------------------

🔹 Result 2:
Title: La Libertad: detienen a excabo del Ejército acusado de minería ilegal en Pataz
Published: Thu, 23 Oct 2025 21:55:58 -0500
Link: https://rpp.pe/peru/actualidad/la-libertad-detienen-a-excabo-del-ejercito-por-mineria-ilegal-en-pataz-noticia-1660733
--------------------------------------------------------------------------------

🔹 Result 3:
Title: Congreso elimina sesiones virtuales a partir de julio del 2026
Published: Thu, 23 Oct 2025 20:26:11 -0500
Link: h

Unnamed: 0,title,description,link,date_published
0,MEF y el presidente Jerí revisan el Presupuest...,MEF y el presidente Jerí revisan el Presupuest...,https://rpp.pe/economia/economia/mef-y-el-pres...,"Thu, 23 Oct 2025 19:18:25 -0500"
1,La Libertad: detienen a excabo del Ejército ac...,La Libertad: detienen a excabo del Ejército ac...,https://rpp.pe/peru/actualidad/la-libertad-det...,"Thu, 23 Oct 2025 21:55:58 -0500"
2,Congreso elimina sesiones virtuales a partir d...,Congreso elimina sesiones virtuales a partir d...,https://rpp.pe/politica/congreso/congreso-elim...,"Thu, 23 Oct 2025 20:26:11 -0500"
3,"Tras cambio en el reglamento del Congreso, arc...","Tras cambio en el reglamento del Congreso, arc...",https://rpp.pe/politica/elecciones/tras-cambio...,"Thu, 23 Oct 2025 19:49:41 -0500"
4,Fiscalía insiste en su pedido para que se orde...,Fiscalía insiste en su pedido para que se orde...,https://rpp.pe/politica/judiciales/dina-boluar...,"Thu, 23 Oct 2025 18:41:31 -0500"


In [None]:
# 5️⃣ LANGCHAIN ORCHESTRATION
# Rebuilds the fetch → embed → store → retrieve flow using LangChain components.
print("🔗 Building end-to-end LangChain pipeline...")

# Used below to build the embedding object handed to the Chroma vector store.
# NOTE(review): newer LangChain releases reportedly deprecate this class in
# favour of the separate `langchain-huggingface` package — confirm before
# upgrading dependencies.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Step 1: Load RSS function
def load_rss_feed(url, max_items=50):
    """Parse an RSS feed and wrap its leading entries as LangChain documents.

    Args:
        url: Feed location handed to ``feedparser.parse``.
        max_items: Maximum number of entries, taken from the start of the
            feed, to convert.

    Returns:
        A list of ``Document`` objects, one per entry. ``page_content`` is the
        entry title and summary joined by a single space; ``metadata`` carries
        ``title``, ``link`` and ``published``. Any field missing from an entry
        becomes an empty string.
    """
    def _to_document(item):
        # Convert one feed entry into a Document.
        headline = item.get('title', '')
        return Document(
            page_content=f"{headline} {item.get('summary', '')}",
            metadata={
                'title': headline,
                'link': item.get('link', ''),
                'published': item.get('published', ''),
            },
        )

    parsed = feedparser.parse(url)
    return [_to_document(item) for item in parsed.entries[:max_items]]

# Step 2: Initialize embedding function
# NOTE: this rebinds `embeddings`, which held the array returned by
# `embedding_model.encode(...)` in the embedding cell.
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Step 3: Create vector store
print("\n1️⃣ Loading RSS feed...")
# NOTE: this rebinds `documents` from the list of strings used for the
# ChromaDB collection to a list of LangChain Document objects.
documents = load_rss_feed(rss_url, max_items=50)
print(f"   ✅ Loaded {len(documents)} documents")

print("\n2️⃣ Creating embeddings...")
# Position-based ids follow the same "doc_<i>" scheme as the ChromaDB cell.
# NOTE(review): they are meant to make re-running this cell overwrite the
# entries in "langchain_rpp_news" instead of appending duplicates (assumes the
# in-memory collection outlives the cell and that inserts upsert by id) —
# confirm against the installed langchain-community / chromadb versions.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    ids=[f"doc_{i}" for i in range(len(documents))],
    collection_name="langchain_rpp_news"
)
print(f"   ✅ Vector store created")

# Step 4: Create retriever returning the 5 nearest documents per query
print("\n3️⃣ Creating retriever...")
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
print(f"   ✅ Retriever ready")

# Step 5: Query
print("\n4️⃣ Querying...")
query = "noticias sobre tecnología e inteligencia artificial"
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated in current LangChain releases.
retrieved_docs = retriever.invoke(query)

print(f"\n🔍 Query: '{query}'")
print(f"✅ Retrieved {len(retrieved_docs)} documents\n")
print("="*80)

for position, doc in enumerate(retrieved_docs, start=1):
    print(f"\n🔹 Document {position}:")
    print(f"Title: {doc.metadata['title']}")
    print(f"Published: {doc.metadata['published']}")
    print(f"Link: {doc.metadata['link']}")
    print("-"*80)

print("\n✅ LangChain pipeline completed successfully!")

🔗 Building end-to-end LangChain pipeline...

1️⃣ Loading RSS feed...
   ✅ Loaded 50 documents

2️⃣ Creating embeddings...
   ✅ Vector store created

3️⃣ Creating retriever...
   ✅ Retriever ready

4️⃣ Querying...

🔍 Query: 'noticias sobre tecnología e inteligencia artificial'
✅ Retrieved 5 documents


🔹 Document 1:
Title: Roberto Burneo: JNE definirá si se utilizará o no el voto digital en las Elecciones 2026 tras proceso de auditoría
Published: Thu, 23 Oct 2025 21:03:17 -0500
Link: https://rpp.pe/politica/elecciones/roberto-burneo-jne-definira-si-se-utilizara-o-no-el-voto-digital-en-las-elecciones-2026-tras-proceso-de-auditoria-noticia-1660669
--------------------------------------------------------------------------------

🔹 Document 2:
Title: MEF y el presidente Jerí revisan el Presupuesto Público 2026: “Cada sol invertido debe traducirse en bienestar”
Published: Thu, 23 Oct 2025 19:18:25 -0500
Link: https://rpp.pe/economia/economia/mef-y-el-presidente-jeri-revisan-el-presupuesto-pu