In [1]:
# Install the notebook's dependencies into the current environment.
# NOTE(review): none of these installs pins a version, so a fresh run may resolve
# different releases than the ones that produced the saved output below.
!pip install google-generativeai
!pip install youtube-transcript-api
!pip install langchain
!pip install langchain-google-genai
!pip install transformers
# NOTE(review): serpapi / google-search-results are not imported in the visible cells — confirm they are still needed.
!pip install serpapi
!pip install dateparser
!pip install spacy
# Downloads the spaCy model that the main cell loads with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm
!pip install -U langchain-community
!pip install duckduckgo-search
!pip install google-search-results
!pip install gradio
!pip install wikipedia
!pip install tavily-python


Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.0.3
Collecting packaging<25,>=23.2 (from langchain-core<1.0.0,>=0.3.49->langchain)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading packaging-24.2-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging
  Attempting uninstall: packaging
    Found existing installation: packaging 25.0
    Uninstalling packaging-25.0:
      Successfully uninstalled packaging-25.0
[31mERROR: pip's dependency resolver does not current

# DuckDuckGo and Wikipedia search

In [2]:
import os

In [7]:
# Tavily API key; process_video reads it back with os.getenv("TAVILY_API_KEY").
# The value is blank here, and this assignment unconditionally overwrites whatever
# the environment already holds under that name. Supply the real key at run time
# rather than saving it in the notebook.
os.environ["TAVILY_API_KEY"] = ""

In [8]:
# Final version: YouTube fact-checking pipeline and Gradio app

import os
import dateparser
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.utilities import WikipediaAPIWrapper
from langchain_community.tools.tavily_search import TavilySearchResults
import spacy
import time
import json
import gradio as gr
import re

# Load models
# spaCy English pipeline; extract_factual_claims uses it to split the transcript
# into sentences (doc.sents).
nlp = spacy.load("en_core_web_sm")
# Hugging Face NER pipeline. extract_factual_claims reads the 'word', 'entity_group'
# and 'score' keys of each entity it returns.
# NOTE(review): the model name suggests CoNLL-03 labels — confirm whether a "DATE"
# entity group can ever be produced, since extract_factual_claims has a branch for it.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

def get_youtube_transcript(video_url):
    """Return the transcript of a YouTube video as a single space-joined string.

    Args:
        video_url: A YouTube URL (``watch?v=``, ``youtu.be/``, ``/shorts/``,
            ``/embed/`` or ``/live/`` form). Any other string is handled as
            before: the text after the last ``v=`` up to the next ``&`` is
            used as the video ID, so a bare video ID still works.

    Returns:
        The text of every transcript segment joined with single spaces.

    Raises:
        ValueError: If ``video_url`` is empty or only whitespace.
        Exception: Whatever ``YouTubeTranscriptApi`` raises (e.g. no transcript
            available); these are propagated unchanged.
    """
    if not video_url or not video_url.strip():
        raise ValueError("No YouTube URL provided.")
    video_url = video_url.strip()

    # Recognise the common URL shapes; the negative lookahead prevents
    # truncating a longer token down to 11 characters.
    id_match = re.search(
        r'(?:[?&]v=|youtu\.be/|/shorts/|/embed/|/live/)([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])',
        video_url,
    )
    if id_match:
        video_id = id_match.group(1)
    else:
        # Original behaviour, kept as the fallback for unrecognised input.
        video_id = video_url.split("v=")[-1].split("&")[0]

    # Prefer the static helper when the installed youtube-transcript-api still
    # has it; otherwise use the instance-based fetch() API, whose snippets
    # expose the text as an attribute instead of a dict key.
    legacy_fetch = getattr(YouTubeTranscriptApi, "get_transcript", None)
    if legacy_fetch is not None:
        return " ".join(item['text'] for item in legacy_fetch(video_id))
    return " ".join(snippet.text for snippet in YouTubeTranscriptApi().fetch(video_id))

def extract_factual_claims(text, window_size=1):
    """Collect sentences that mention named entities, with surrounding context.

    The text is split into sentences with the module-level ``nlp`` pipeline;
    sentences shorter than 5 characters after stripping are dropped. Each
    remaining sentence is passed to ``ner_pipeline``; a sentence with at least
    one entity becomes a claim.

    Args:
        text: The text to analyse.
        window_size: How many neighbouring sentences on each side are joined
            into the claim's ``context``.

    Returns:
        A list of dicts with keys ``sentence``, ``context`` and ``entities``.
        Each entity is a dict with ``text``, ``type`` and ``score``; entities
        whose group is ``"DATE"`` also carry ``parsed_date`` (the string form
        of the ``dateparser`` result, or ``None`` if parsing failed).
    """
    parsed = nlp(text)

    # Keep only sentences with some substance once whitespace is trimmed.
    kept = []
    for span in parsed.sents:
        if len(span.text.strip()) >= 5:
            kept.append(span.text.strip())

    total = len(kept)
    claims = []
    for position, candidate in enumerate(kept):
        found = ner_pipeline(candidate)
        if not found:
            continue

        # Context window is clamped to the bounds of the sentence list.
        lo = max(0, position - window_size)
        hi = min(total, position + window_size + 1)

        annotated = []
        for hit in found:
            record = {
                "text": hit['word'],
                "type": hit['entity_group'],
                "score": hit['score']
            }
            if hit['entity_group'] == "DATE":
                when = dateparser.parse(hit['word'])
                record["parsed_date"] = str(when) if when else None
            annotated.append(record)

        claims.append({
            "sentence": candidate,
            "context": " ".join(kept[lo:hi]),
            "entities": annotated
        })

    return claims

def extract_json_from_output(result):
    """Pull the first JSON object out of an agent result.

    Args:
        result: Either a dict with an ``'output'`` entry (that entry is
            searched) or any other object (its ``str()`` form is searched).

    Returns:
        The decoded JSON object as a dict. If the text contains no ``{`` the
        result is ``{"verdict": "Error", "explanation": "No JSON found in
        output.", "sources": []}``; if braces are present but nothing decodes,
        the explanation is ``"Failed to parse JSON: <first decode error>"``.
    """
    raw = result['output'] if isinstance(result, dict) and 'output' in result else result
    # The output entry is not guaranteed to be a string (it may be None).
    output_text = raw if isinstance(raw, str) else str(raw)

    # A ```json fenced block is tried first, then the whole text.
    candidates = []
    fenced = re.search(r'```json\s*(\{.*?\})\s*```', output_text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    candidates.append(output_text)

    decoder = json.JSONDecoder()
    first_error = None
    for candidate in candidates:
        start = candidate.find("{")
        while start != -1:
            try:
                # raw_decode parses one complete JSON value and ignores any
                # trailing text, so nested objects and '}' inside strings are
                # handled — a non-greedy regex would cut them at the first '}'.
                data, _ = decoder.raw_decode(candidate[start:])
                return data
            except json.JSONDecodeError as e:
                if first_error is None:
                    first_error = e
            # This '{' did not start valid JSON (e.g. "{tool}"); try the next.
            start = candidate.find("{", start + 1)

    if first_error is not None:
        return {"verdict": "Error", "explanation": f"Failed to parse JSON: {first_error}", "sources": []}
    return {"verdict": "Error", "explanation": "No JSON found in output.", "sources": []}

def process_video(video_url, api_key):
    """Fact-check a YouTube video, streaming progress as a generator.

    Every yielded item is a 3-tuple ``(embed_html, transcript, status_markdown)``
    matching the three Gradio outputs wired to this function.

    Args:
        video_url: URL of the YouTube video (``watch?v=``, ``youtu.be/``,
            ``/shorts/``, ``/embed/`` or ``/live/`` form).
        api_key: Gemini API key entered by the user. It is handed straight to
            the LLM client and is not written to ``os.environ``, so one user's
            key does not linger in the process for later requests.

    Yields:
        * a single "please provide a key" tuple when ``api_key`` is empty;
        * a single ``<p>Error: ...</p>`` tuple if setup or the transcript
          fetch fails (the message is HTML-escaped);
        * otherwise the embed/transcript tuple, then a progress tuple and a
          result tuple for each detected claim.
    """
    # Stdlib helper imported locally so the block does not rely on the import cell.
    from html import escape as html_escape

    if not api_key:
        yield "<p>Please provide your Gemini API key.</p>", "", "⚠️ No API key provided."
        return

    try:
        llm = ChatGoogleGenerativeAI(
            model="models/gemini-2.0-flash-lite-preview",
            temperature=0.2,
            google_api_key=api_key
        )

        tavily_tool = TavilySearchResults(api_key=os.getenv("TAVILY_API_KEY"))
        ddg_tool = DuckDuckGoSearchRun()
        wiki_tool = WikipediaAPIWrapper()

        # Providers are tried in this order; the first one that answers wins.
        search_providers = [
            ("DuckDuckGo", ddg_tool),
            ("Wikipedia", wiki_tool),
            ("Tavily", tavily_tool),
        ]

        # Resilient fallback search function
        def search_with_fallback(query):
            last_error = None
            for position, (label, tool) in enumerate(search_providers):
                try:
                    return tool.run(query)
                except Exception as exc:
                    last_error = exc
                    if position + 1 < len(search_providers):
                        next_label = search_providers[position + 1][0]
                        print(f"{label} failed: {exc}. Trying {next_label}...")
            return f"All search tools failed: {last_error}"

        # Resilient search tool wrapper
        ResilientSearchTool = Tool(
            name="Resilient Search",
            func=search_with_fallback,
            description="Searches using DuckDuckGo, falls back to Wikipedia and Tavily if needed."
        )

        # Use only the resilient search tool
        tools = [ResilientSearchTool]

        agent = initialize_agent(
            tools,
            llm,
            agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
            verbose=False,
            handle_parsing_errors=True
        )

        # Recognise the common YouTube URL shapes. When one matches, the
        # transcript helper is given the canonical watch URL so both the embed
        # and the transcript use the same ID.
        id_match = re.search(
            r'(?:[?&]v=|youtu\.be/|/shorts/|/embed/|/live/)([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])',
            video_url,
        )
        if id_match:
            video_id = id_match.group(1)
            transcript_url = f"https://www.youtube.com/watch?v={video_id}"
        else:
            # Original extraction, kept as the fallback for unrecognised input.
            video_id = video_url.split("v=")[-1].split("&")[0]
            transcript_url = video_url
        transcript = get_youtube_transcript(transcript_url)

        # The ID comes from user input and lands inside an HTML attribute.
        safe_video_id = html_escape(video_id, quote=True)

        # Create embed HTML
        embed_html = f"""
        <iframe width="560" height="315"
        src="https://www.youtube.com/embed/{safe_video_id}"
        title="YouTube video player" frameborder="0"
        allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
        allowfullscreen></iframe>
        """

        yield embed_html, transcript, "🔍 Extracting factual claims..."
    except Exception as e:
        # Exception text may echo user input or remote content; escape it.
        yield f"<p>Error: {html_escape(str(e))}</p>", "", ""
        return

    # Extract claims
    facts = extract_factual_claims(transcript)
    if not facts:
        yield embed_html, transcript, "✅ No factual claims detected."
        return

    # Define function that uses the already initialized agent
    def fact_check_with_agent(claim, context):
        prompt = f"""
        You are a fact-checker.
        You will also use a "Resilient Search" tool to find credible URLs via Duck Duck Go, Wikipedia, and Tavily API.
        
        Given the following context:
        "{context}"
        
        Fact-check this claim: "{claim}"
        
        Important rules:
        1. Ignore spelling mistakes.
        2. Always use the "Resilient Search" tool to find three relevant, credible URLs (even if the claim is unverifiable).
        3. If the claim is unverifiable, still attempt to find URLs related to the topic for reference.

        Final Answer: <respond in JSON like below>
        
        {{
          "verdict": "<Correct | Incorrect | Unverifiable>",
          "explanation": "<explanation>","sources": ["<url1>", "<url2>", "<url3>"]}}
        
        """

        try:
            result = agent.invoke(prompt)
            return extract_json_from_output(result)
        except Exception as e:
            print(f"Error in fact-checking: {e}")
            return {
                "verdict": "Error",
                "explanation": f"An error occurred: {str(e)}",
                "sources": []
            }

    # Pause between claims — presumably to stay under provider rate limits.
    throttle_seconds = 4
    verdict_colors = {"correct": "green", "incorrect": "red"}

    # Process each fact
    running_output = ""
    total = len(facts)
    for i, fact in enumerate(facts):
        # Update with progress indicator
        yield embed_html, transcript, f"{running_output}\n\n🔍 Checking claim {i+1} of {total}..."

        # Fact check using the pre-initialized agent
        result = fact_check_with_agent(fact['sentence'], fact['context'])
        if not isinstance(result, dict):
            result = {}

        # The model may return null or a non-string verdict; normalise it
        # before calling string methods on it.
        verdict = str(result.get("verdict") or "Unverifiable")
        color = verdict_colors.get(verdict.strip().lower(), "orange")

        # A lone string is treated as one source rather than iterated
        # character by character; other non-list values are ignored.
        sources = result.get("sources") or []
        if isinstance(sources, str):
            sources = [sources]
        elif not isinstance(sources, (list, tuple)):
            sources = []
        sources_text = "".join(f"- [{url}]({url})\n" for url in sources if url)
        if not sources_text:
            sources_text = "- No sources provided"

        explanation = result.get("explanation") or "No explanation provided."

        result_md = f"""
### 📌 **Claim:** {fact['sentence']}

**Verdict:** <span style='color:{color}; font-weight:bold'>{html_escape(verdict)}</span>  
**Explanation:** {explanation}  

**Sources:**  
{sources_text}

---
"""
        running_output += result_md
        yield embed_html, transcript, running_output

        # No need to wait after the final claim.
        if i + 1 < total:
            time.sleep(throttle_seconds)

# Gradio UI
# Builds the Blocks app: two text inputs, a run button, and three outputs
# (video embed, transcript, fact-check results) fed by process_video.
with gr.Blocks() as demo:
    # Inline stylesheet: makes the results area (elem_id "claims-container")
    # scrollable and centres any iframe on the page.
    gr.HTML("""
    <style>
        #claims-container {
            max-height: 400px;
            overflow-y: auto;
            padding: 10px;
            border: 1px solid #ccc;
            border-radius: 8px;
        }
        iframe {
            display: block;
            margin-left: auto;
            margin-right: auto;
        }
    </style>
    """)

    gr.Markdown("# 📺🔍 YouTube Video Fact-Checker")
    gr.Markdown(
        "👉 [Get your Gemini API key here](https://aistudio.google.com/apikey) and paste it below."
    )

    # Inputs side by side; the API key box is created with type="password".
    with gr.Row():
        api_key_input = gr.Textbox(label="🔑 Gemini API Key", type="password", placeholder="Enter your Gemini API key...")
        video_url_input = gr.Textbox(label="🎥 YouTube Video URL", placeholder="Paste a YouTube URL...")

    run_btn = gr.Button("🚀 Run Fact-Check")

    # NOTE(review): "center" does not look like one of the documented Row variants
    # ("default", "panel", "compact") — confirm it has the intended effect.
    with gr.Row(variant="center"):
        video_display = gr.HTML()

    transcript_out = gr.Textbox(label="📜 Video Transcript", lines=8)
    # elem_id ties this component to the #claims-container CSS rule above.
    claims_out = gr.Markdown(elem_id="claims-container")

    # process_video is a generator; each tuple it yields maps positionally onto
    # the three outputs below. Note the input order: URL first, then API key.
    run_btn.click(
        fn=process_video,
        inputs=[video_url_input, api_key_input],
        outputs=[video_display, transcript_out, claims_out]
    )

# Start the app when this cell/script is executed directly.
if __name__ == "__main__":
    demo.launch()

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


* Running on local URL:  http://127.0.0.1:7863
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://0702c47af42272af92.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  agent = initialize_agent(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


DuckDuckGo failed: https://lite.duckduckgo.com/lite/ 202 Ratelimit. Trying Wikipedia...
DuckDuckGo failed: https://lite.duckduckgo.com/lite/ 202 Ratelimit. Trying Wikipedia...
DuckDuckGo failed: https://html.duckduckgo.com/html 202 Ratelimit. Trying Wikipedia...
