In [None]:
!pip install langchain langchain-groq langchain-community langchain-chroma
!pip install sentence-transformers chromadb beautifulsoup4 requests
!pip install groq python-dateutil

In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import datetime
import hashlib
import re



In [9]:
# Colab-only setup: read the Groq API key from the Colab secrets store and
# expose it through the GROQ_API_KEY environment variable, so the key itself
# never appears in the notebook.
# NOTE(review): presumably ChatGroq picks the key up from this variable — confirm.
from google.colab import userdata
import os
os.environ['GROQ_API_KEY'] = userdata.get('GROQ_API_KEY')

In [2]:
# Load the Wikipedia "Artificial intelligence" page; `docs` is the raw corpus
# that the following cells split, embed and search.
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Artificial_intelligence")
docs = loader.load()

In [3]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) before embedding them.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

In [4]:
# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformers model and
# index the result in a Chroma vector store (no persist directory is passed).
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(chunks, embeddings)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [5]:
# Retrieve the 20 best-matching chunks together with their scores; the cells
# below narrow this candidate set down step by step.
query = "What are the main applications of artificial intelligence?"
initial_results = vectorstore.similarity_search_with_score(query, k=20)

print(f"Initial results: {len(initial_results)}")

Initial results: 20


In [6]:
def remove_duplicates(results):
    """Drop results whose text was already seen, keeping the first occurrence.

    Two results count as duplicates when the MD5 digest of their
    ``page_content`` is identical, i.e. the text matches exactly.

    Args:
        results: Iterable of ``(doc, score)`` pairs; ``doc`` must expose a
            ``page_content`` string.

    Returns:
        List of ``(doc, score)`` pairs in their original order, one per
        distinct text.
    """
    first_by_digest = {}

    for document, distance in results:
        digest = hashlib.md5(document.page_content.encode()).hexdigest()
        # setdefault only stores the pair the first time a digest shows up,
        # and dicts preserve insertion order, so input order is kept.
        first_by_digest.setdefault(digest, (document, distance))

    return list(first_by_digest.values())

# Step 1 of the walkthrough: drop exact-text duplicates from the initial hits.
deduplicated = remove_duplicates(initial_results)
print(f"After deduplication: {len(deduplicated)}")

After deduplication: 20


In [7]:
def filter_by_relevance(results, threshold=0.5):
    """Keep the pairs whose score is at most ``threshold``.

    Args:
        results: Iterable of ``(doc, score)`` pairs.
        threshold: Largest score that is still accepted (inclusive).

    Returns:
        List of the accepted ``(doc, score)`` pairs, in input order.
    """
    accepted = []
    for document, distance in results:
        if distance <= threshold:
            accepted.append((document, distance))
    return accepted

# Step 2 of the walkthrough: keep only hits scoring at most 0.8
# (this overrides the function's default threshold of 0.5).
relevance_filtered = filter_by_relevance(deduplicated, threshold=0.8)
print(f"After relevance filtering: {len(relevance_filtered)}")

# 3. CONTENT QUALITY SCORING
def quality_score(text):
    """Rate a piece of text from 0 to 4 using three cheap heuristics.

    Points are awarded as follows:
      * length: 2 points for 50-500 words, otherwise 1 point for more than
        20 words;
      * structure: 1 point when the text contains at least one ``.``;
      * density: 1 point when more than half of the words are distinct
        (compared case-insensitively).

    Args:
        text: The text to rate.

    Returns:
        Integer score between 0 and 4.
    """
    word_count = len(text.split())

    # Length: moderate-sized passages get full credit.
    if 50 <= word_count <= 500:
        length_points = 2
    elif word_count > 20:
        length_points = 1
    else:
        length_points = 0

    # Structure: a single '.' is enough to split the text into two pieces.
    structure_points = 1 if '.' in text else 0

    # Density: penalise repetitive text; empty text earns nothing here.
    distinct_count = len(set(text.lower().split()))
    is_dense = word_count > 0 and distinct_count / word_count > 0.5
    density_points = 1 if is_dense else 0

    return length_points + structure_points + density_points

After relevance filtering: 4


In [8]:
# Step 3 of the walkthrough: score each relevance-filtered chunk with the
# heuristic quality_score() and keep those scoring at least 2.
# Kept entries become (doc, score, quality) triples.
quality_filtered = []
for doc, score in relevance_filtered:
    quality = quality_score(doc.page_content)
    if quality >= 2:  # Minimum quality threshold
        quality_filtered.append((doc, score, quality))

print(f"After quality filtering: {len(quality_filtered)}")

After quality filtering: 4


In [10]:
# NOTE(review): this import sits mid-notebook instead of in the imports cell
# at the top of the notebook.
from langchain_groq import ChatGroq

# Chat model used by llm_quality_filter() to rate chunks.
# NOTE(review): temperature=0 is presumably chosen to keep the ratings as
# repeatable as possible — confirm.
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0)

def llm_quality_filter(doc_score_pairs, max_docs=10, user_query=None, llm_client=None):
    """Use LLM to assess content quality and relevance.

    Each of the first ``max_docs`` pairs is sent to the chat model with a
    1-5 rating prompt. Pairs rated 3 or higher are kept. When the call fails
    or the reply does not start with a rating in the 1-5 range, the pair is
    kept with a neutral rating of 3 (best-effort behaviour) and a warning is
    printed so the fallback is visible.

    Args:
        doc_score_pairs: Sequence of ``(doc, score)`` pairs; ``doc`` must
            expose ``page_content``. Only its first 500 characters are sent.
        max_docs: Maximum number of pairs to rate (one LLM call per pair).
        user_query: Query the text is rated against. When omitted, the
            notebook-level ``query`` variable is used.
        llm_client: Object with an ``invoke(prompt)`` method whose result has
            a ``content`` attribute. When omitted, the notebook-level ``llm``
            is used.

    Returns:
        List of ``(doc, score, rating)`` triples for the pairs that passed.
    """
    min_rating = 3       # lowest rating that is still accepted
    fallback_rating = 3  # rating assumed when the model gives no usable answer

    # Explicit arguments win; otherwise fall back to the notebook globals so
    # existing calls keep working unchanged.
    rated_query = query if user_query is None else user_query
    model = llm if llm_client is None else llm_client

    filtered_results = []

    for doc, score in doc_score_pairs[:max_docs]:  # Limit LLM calls
        # Create quality assessment prompt
        prompt = f"""
        Assess this text for quality and relevance to the query: "{rated_query}"

        Text: {doc.page_content[:500]}

        Rate from 1-5 where:
        5 = Highly relevant and informative
        4 = Relevant with good information
        3 = Somewhat relevant
        2 = Low relevance
        1 = Not relevant

        Respond with only the number (1-5):
        """

        try:
            response = model.invoke(prompt)
            # Accept a leading single digit 1-5 even when followed by
            # punctuation or text ("4.", "4 - relevant"); reject "10", "45".
            found = re.match(r"\s*([1-5])(?!\d)", str(response.content))
            if found is None:
                raise ValueError(f"no 1-5 rating in reply: {response.content!r}")
            rating = int(found.group(1))
        except Exception as exc:  # KeyboardInterrupt/SystemExit still propagate
            print(f"LLM rating unavailable ({exc}); keeping document with rating {fallback_rating}")
            filtered_results.append((doc, score, fallback_rating))
            continue

        if rating >= min_rating:
            filtered_results.append((doc, score, rating))

    return filtered_results

# Rate the relevance-filtered chunks with the LLM and keep the ones it accepts.
# The [:10] slice repeats the cap the function already applies via max_docs=10.
llm_filtered = llm_quality_filter(relevance_filtered[:10])
print(f"After LLM quality filtering: {len(llm_filtered)}")

After LLM quality filtering: 3


In [11]:
def complete_filtering_pipeline(query, vectorstore, top_k=20, relevance_threshold=0.8,
                                max_results=5, min_words=20, max_words=1000,
                                min_unique_ratio=0.4):
    """Complete filtering pipeline in one function.

    Runs retrieval, exact-duplicate removal, score thresholding and content
    heuristics, then returns the best-scoring survivors. All tuning knobs are
    keyword parameters whose defaults reproduce the previously hard-coded
    values.

    Args:
        query: Search text passed to the vector store.
        vectorstore: Object exposing ``similarity_search_with_score(query, k=...)``
            that returns ``(doc, score)`` pairs; ``doc`` must expose
            ``page_content``.
        top_k: Number of candidates to retrieve.
        relevance_threshold: Largest score that is still accepted (inclusive).
        max_results: Maximum number of results returned.
        min_words: Smallest accepted word count (inclusive).
        max_words: Largest accepted word count (inclusive).
        min_unique_ratio: The share of distinct (lower-cased) words must be
            strictly greater than this value.

    Returns:
        Up to ``max_results`` ``(doc, score)`` pairs sorted by ascending score.
        NOTE(review): this treats lower scores as better matches, which
        assumes the store returns a distance — confirm for the store in use.
    """
    # Step 1: Initial retrieval
    results = vectorstore.similarity_search_with_score(query, k=top_k)

    # Step 2: Remove duplicates (exact text match, first occurrence wins)
    seen_hashes = set()
    deduped = []
    for doc, score in results:
        content_hash = hashlib.md5(doc.page_content.encode()).hexdigest()
        if content_hash not in seen_hashes:
            seen_hashes.add(content_hash)
            deduped.append((doc, score))

    # Step 3: Relevance threshold
    relevance_filtered = [(doc, score) for doc, score in deduped if score <= relevance_threshold]

    # Step 4: Content quality heuristics
    quality_filtered = []
    for doc, score in relevance_filtered:
        length = len(doc.page_content.split())  # tokenised once, reused below
        distinct = len(set(doc.page_content.lower().split()))
        unique_ratio = distinct / max(length, 1)  # max() guards empty text

        if min_words <= length <= max_words and unique_ratio > min_unique_ratio:
            quality_filtered.append((doc, score))

    # Step 5: Sort by relevance score (lower is better for similarity)
    final_results = sorted(quality_filtered, key=lambda x: x[1])[:max_results]

    return final_results

# Run the complete pipeline on the notebook's query and vector store,
# using the function's default top_k=20.
filtered_results = complete_filtering_pipeline(query, vectorstore)

print("=== FINAL FILTERED RESULTS ===")
# Show each surviving chunk's score and the first 200 characters of its text.
# The value labelled "Relevance" is the raw score from the vector store, which
# the pipeline sorts in ascending order (lowest first).
for i, (doc, score) in enumerate(filtered_results):
    print(f"\n--- Result {i+1} (Relevance: {score:.3f}) ---")
    print(doc.page_content[:200] + "...")

=== FINAL FILTERED RESULTS ===

--- Result 1 (Relevance: 0.688) ---
High-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Si...

--- Result 2 (Relevance: 0.704) ---
Applications
Main article: Applications of artificial intelligenceAI and machine learning technology is used in most of the essential applications of the 2020s, including: search engines (such as Goog...

--- Result 3 (Relevance: 0.732) ---
General intelligence
A machine with artificial general intelligence would be able to solve a wide variety of problems with breadth and versatility similar to human intelligence.[68]

Techniques
AI res...

--- Result 4 (Relevance: 0.751) ---
Various subfields of AI research are centered around particular goals and the use of particular tools. The traditional goals of AI research include learning, reasoning, knowledge representation, plann...


In [12]:
def analyze_filtering_impact(original_results, filtered_results):
    """Print a before/after summary of a filtering run.

    Reports both result counts, the percentage reduction and the mean score of
    each set. Empty inputs are handled: a figure that cannot be computed
    (reduction relative to zero originals, mean of zero scores) is printed as
    ``n/a`` instead of raising ``ZeroDivisionError``.

    Args:
        original_results: Sized iterable of ``(doc, score)`` pairs before
            filtering.
        filtered_results: Sized iterable of ``(doc, score)`` pairs after
            filtering.

    Returns:
        None. The summary is written to stdout.
    """
    def _mean_label(scores):
        # Format the average to three decimals, or "n/a" when there is nothing to average.
        return f"{sum(scores) / len(scores):.3f}" if scores else "n/a"

    original_count = len(original_results)
    filtered_count = len(filtered_results)

    print("📊 FILTERING ANALYSIS")
    print(f"Original results: {original_count}")
    print(f"Final results: {filtered_count}")
    if original_count:
        reduction = (original_count - filtered_count) / original_count * 100
        print(f"Reduction: {reduction:.1f}%")
    else:
        # Nothing was retrieved, so a relative reduction is undefined.
        print("Reduction: n/a")

    # Score distribution analysis
    original_scores = [score for _, score in original_results]
    filtered_scores = [score for _, score in filtered_results]

    print(f"Original avg score: {_mean_label(original_scores)}")
    print(f"Filtered avg score: {_mean_label(filtered_scores)}")

# Compare the raw retrieval (initial_results) with the pipeline output (filtered_results).
analyze_filtering_impact(initial_results, filtered_results)

📊 FILTERING ANALYSIS
Original results: 20
Final results: 4
Reduction: 80.0%
Original avg score: 0.836
Filtered avg score: 0.719
