In [None]:
##showcasing only gate prev papers
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the websites to scrape
websites = {
    "GFG_PastPapers": "https://www.geeksforgeeks.org/gate-cse-previous-year-papers/",
    "TutorialsPoint": "https://www.tutorialspoint.com/gate_exam/gate_computer_science.htm"
}

# Function to scrape data from a given URL
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Extract headings and lists
    topics = []
    for h2 in soup.find_all("h2"):
        subtopics = [li.text for li in h2.find_next_sibling("ul").find_all("li")] if h2.find_next_sibling("ul") else []
        topics.append({"Topic": h2.text, "Subtopics": subtopics, "Source": url})
    
    return topics

# Scrape data from all websites
data = []
for site_name, url in websites.items():
    scraped_data = scrape_website(url)
    data.extend(scraped_data)

# Convert to DataFrame
df = pd.DataFrame(data)
print(df.head())  # Check extracted data

# Save as JSON for further processing
df.to_json("gate_knowledge_base.json", orient="records", indent=4)
print("✅ Web scraping completed! Data saved in JSON format.")


In [2]:
pip install requests beautifulsoup4 pandas pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl (16.5 MB)
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
    --------------------------------------- 0.3/16.5 MB ? eta -:--:--
   - -------------------------------------- 0.8/16.5 MB 2.1 MB/s eta 0:00:08
   ---- ----------------------------------- 1.8/16.5 MB 3.0 MB/s eta 0:00:05
   ------ --------------------------------- 2.6/16.5 MB 3.3 MB/s eta 0:00:05
   -------- ------------------------------- 3.4/16.5 MB 3.5 MB/s eta 0:00:04
   ---------- ----------------------------- 4.5/16.5 MB 3.6 MB/s eta 0:00:04
   ------------ --------------------------- 5.2/16.5 MB 3.7 MB/s eta 0:00:04
   ------------- -------------------------- 5.8/16.5 MB 3.7 MB/s eta 0:00:03
   --------------- ------------------------ 6.6/16.5 MB 3.6 MB/s eta 0:00:03
   --------------- ------------------------ 6.6/16.5 MB 3.6 MB/s eta 0:00:03
   -------------

In [3]:
##pdf extraction
import requests
import os
import json
import fitz  # PyMuPDF for PDF text extraction
from bs4 import BeautifulSoup

# URL of the webpage to scrape
URL = "https://www.geeksforgeeks.org/original-gate-previous-year-question-papers-cse-and-it-gq/"

# Directory to store downloaded PDFs
PDF_DIR = "gate_papers"
os.makedirs(PDF_DIR, exist_ok=True)

# Function to scrape PDF links from GFG
def scrape_pdf_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    pdf_links = []
    for link in soup.find_all("a", href=True):  # Find all links
        href = link["href"]
        if "gate" in href.lower() and href.endswith(".pdf"):  # Check if it's a GATE PDF
            pdf_links.append({
                "Year": link.text.strip(),  # Extract year or text
                "PDF_Link": href
            })
    
    return pdf_links

# Function to download PDFs
def download_pdf(pdf_url, year):
    response = requests.get(pdf_url, stream=True)
    pdf_path = os.path.join(PDF_DIR, f"GATE_{year}.pdf")

    with open(pdf_path, "wb") as pdf_file:
        for chunk in response.iter_content(1024):
            pdf_file.write(chunk)
    
    print(f"✅ Downloaded: {pdf_path}")
    return pdf_path

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    
    for page in doc:
        text += page.get_text("text") + "\n"  # Extract text from each page
    
    return text.strip()

# Scrape PDF links
pdf_links = scrape_pdf_links(URL)

# Download PDFs and extract text
gate_questions = []
for pdf in pdf_links:
    year = pdf["Year"]
    pdf_url = pdf["PDF_Link"]

    # Download the PDF
    pdf_path = download_pdf(pdf_url, year)

    # Extract text (questions)
    extracted_text = extract_text_from_pdf(pdf_path)

    # Store in JSON format
    gate_questions.append({
        "Year": year,
        "PDF_Link": pdf_url,
        "Extracted_Text": extracted_text
    })

# Save extracted questions in JSON
json_file = "gate_questions.json"
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(gate_questions, f, indent=4)

print(f"\n✅ GATE questions saved in {json_file}!")


✅ Downloaded: gate_papers\GATE_2023 Question Paper.pdf
✅ Downloaded: gate_papers\GATE_2023 Keys.pdf
✅ Downloaded: gate_papers\GATE_2022 Question Paper.pdf
✅ Downloaded: gate_papers\GATE_2022 Keys.pdf
✅ Downloaded: gate_papers\GATE_2021 Set 1 Paper.pdf
✅ Downloaded: gate_papers\GATE_2021 Set 2 Paper.pdf
✅ Downloaded: gate_papers\GATE_2020 Paper.pdf
✅ Downloaded: gate_papers\GATE_2020 Keys.pdf
✅ Downloaded: gate_papers\GATE_2018 Paper.pdf
✅ Downloaded: gate_papers\GATE_2017 Set 1 Paper.pdf
✅ Downloaded: gate_papers\GATE_2017 Set 1 Keys.pdf
✅ Downloaded: gate_papers\GATE_2017 Set 2 Paper.pdf
✅ Downloaded: gate_papers\GATE_2017 Set 2 Keys.pdf
✅ Downloaded: gate_papers\GATE_2016 Set 1 Paper.pdf
✅ Downloaded: gate_papers\GATE_2016 Set 2 Paper.pdf
✅ Downloaded: gate_papers\GATE_2015 Set 1 Paper.pdf
✅ Downloaded: gate_papers\GATE_2015 Set 2 Paper.pdf
✅ Downloaded: gate_papers\GATE_2015 Set 3 Paper.pdf
✅ Downloaded: gate_papers\GATE_2014 Set 1 Paper.pdf
✅ Downloaded: gate_papers\GATE_2014 Set 2

In [4]:
import json

# Load extracted GATE questions JSON
with open("gate_questions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Display first few entries
data[:3]  # Show first 3 records


[{'Year': '2023 Question Paper',
  'PDF_Link': 'https://media.geeksforgeeks.org/wp-content/cdn-uploads/20230804115257/GATE-20231.pdf',
  'Extracted_Text': 'Computer Science and Information Technology (CS)  \nPage 1 of 7 \nOrganizing Institute: IIT Kanpur \nGeneral Aptitude (GA) \n – Q.5 Carry ONE mark Each \nQ.1 \nWe reached the station late, and _______ missed the train. \n \n \n(A) \nnear \n(B) \nnearly \n(C) \nutterly  \n(D) \nmostly \n \n \n \nQ.2 \nKind : _______ : : Often : Frequently \n(By word meaning) \n \n \n(A) \nMean \n \n(B) \nType \n(C) \nCruel \n(D) \nKindly \n \n \n \n \nQ.1\n\n             Computer Science and Information Technology (CS)  \nPage 2 of 7 \nOrganizing Institute: IIT Kanpur \nQ.3 \nA series of natural numbers 𝐹1, 𝐹2, 𝐹3, 𝐹4, 𝐹5, 𝐹6, 𝐹7, … obeys 𝐹𝑛+1 = 𝐹𝑛+ 𝐹𝑛−1 \nfor all integers 𝑛≥2 .  \nIf 𝐹6 = 37, and 𝐹7 = 60, then what is 𝐹1 ? \n \n \n(A) \n4 \n(B) \n5 \n(C) \n8 \n(D) \n9 \n \n \n \n \n \n\n             Computer Science and Information Technology (CS)  

In [5]:
import requests
import os
import json
import pandas as pd
import fitz  # PyMuPDF for PDF extraction
from bs4 import BeautifulSoup

# List of websites to scrape
SITES = {
    "GeeksforGeeks": "https://www.geeksforgeeks.org/original-gate-previous-year-question-papers-cse-and-it-gq/",
    "GateOverflow": "https://gateoverflow.in/",
    "NPTEL": "https://nptel.ac.in/courses",
    "GATE Official": "https://gate.iitk.ac.in/"
}

# Directory to store downloaded PDFs
PDF_DIR = "gate_pdfs"
os.makedirs(PDF_DIR, exist_ok=True)

# Function to scrape data from a website
def scrape_site(url, site_name):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    scraped_data = []

    # Extract links and content (customize this for each site)
    for link in soup.find_all("a", href=True):
        href = link["href"]
        text = link.get_text(strip=True)

        if "pdf" in href.lower():  # Check for PDFs
            scraped_data.append({"Site": site_name, "Title": text, "Link": href, "Type": "PDF"})
        else:
            scraped_data.append({"Site": site_name, "Title": text, "Link": href, "Type": "Article"})

    return scraped_data

# Function to download PDFs
def download_pdf(pdf_url, filename):
    pdf_path = os.path.join(PDF_DIR, filename)
    response = requests.get(pdf_url, stream=True)

    with open(pdf_path, "wb") as file:
        for chunk in response.iter_content(1024):
            file.write(chunk)

    print(f"✅ Downloaded PDF: {filename}")
    return pdf_path

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text.strip()

# Scrape all sites
all_data = []
for site, url in SITES.items():
    print(f"🔍 Scraping {site}...")
    site_data = scrape_site(url, site)
    all_data.extend(site_data)

# Store data in a DataFrame
df = pd.DataFrame(all_data)

# Download & extract text from PDFs
for index, row in df[df["Type"] == "PDF"].iterrows():
    filename = row["Title"].replace(" ", "_") + ".pdf"
    pdf_path = download_pdf(row["Link"], filename)
    extracted_text = extract_text_from_pdf(pdf_path)

    # Store extracted text in DataFrame
    df.at[index, "Extracted_Text"] = extracted_text

# Save as JSON
df.to_json("gate_knowledge_base.json", orient="records", indent=4, force_ascii=False)

print("\n✅ Data scraping completed! JSON saved as gate_knowledge_base.json")

🔍 Scraping GeeksforGeeks...
🔍 Scraping GateOverflow...
🔍 Scraping NPTEL...
🔍 Scraping GATE Official...
✅ Downloaded PDF: 2024_Set_1_Paper.pdf
✅ Downloaded PDF: 2024_Set_1_Keys.pdf
✅ Downloaded PDF: 2024_Set_2_Paper.pdf
✅ Downloaded PDF: 2024_Set_2_Keys.pdf
✅ Downloaded PDF: 2024_Question_Paper.pdf
✅ Downloaded PDF: 2024_Keys.pdf
✅ Downloaded PDF: 2023_Question_Paper.pdf
✅ Downloaded PDF: 2023_Keys.pdf
✅ Downloaded PDF: 2022_Question_Paper.pdf
✅ Downloaded PDF: 2022_Keys.pdf
✅ Downloaded PDF: 2021_Set_1_Paper.pdf
✅ Downloaded PDF: 2021_Set_1_Keys.pdf
✅ Downloaded PDF: 2021_Set_2_Paper.pdf
✅ Downloaded PDF: 2021_Set_2_Keys.pdf
✅ Downloaded PDF: 2020_Paper.pdf
✅ Downloaded PDF: 2020_Keys.pdf
✅ Downloaded PDF: 2019_Paper.pdf
✅ Downloaded PDF: 2019_Keys.pdf
✅ Downloaded PDF: 2018_Paper.pdf
✅ Downloaded PDF: 2018_Keys.pdf
✅ Downloaded PDF: 2017_Set_1_Paper.pdf
✅ Downloaded PDF: 2017_Set_1_Keys.pdf
✅ Downloaded PDF: 2017_Set_2_Paper.pdf
✅ Downloaded PDF: 2017_Set_2_Keys.pdf
✅ Downloaded PDF: 

FileDataError: Failed to open file 'gate_pdfs\\Brochure.pdf'.

In [6]:
import os

pdf_path = "gate_pdfs/Brochure.pdf"  # Path to the PDF file
if not os.path.exists(pdf_path):
    print("❌ PDF file not found. Check the file path and download process.")
else:
    print("✅ PDF file exists. Proceeding to extract text...")


✅ PDF file exists. Proceeding to extract text...


In [27]:
import json

# Load extracted GATE questions JSON
with open("gate_questions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Display first few entries
#data[:3]  # Show first 3 records


In [8]:
from sentence_transformers import SentenceTransformer

# Load a local NLP model (No API required)
model = SentenceTransformer("all-MiniLM-L6-v2")  # Small & efficient

# Generate embeddings for each question
for item in data:
    item["embedding"] = model.encode(item["Extracted_Text"]).tolist()  # Store as list

# Save updated JSON with embeddings
with open("gate_questions_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

print("✅ Embeddings generated and saved!")


✅ Embeddings generated and saved!


In [9]:
import faiss
import numpy as np

# Load the JSON with embeddings
with open("gate_questions_embeddings.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert embeddings to NumPy array
embeddings = np.array([item["embedding"] for item in data]).astype("float32")

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance search
index.add(embeddings)  # Add all embeddings

# Save FAISS index
faiss.write_index(index, "gate_faiss.index")

print("✅ FAISS index created and saved!")


✅ FAISS index created and saved!


In [26]:
def search_gate_gpt(query, top_k=3):
    query_embedding = model.encode(query).reshape(1, -1).astype("float32")  # Encode query
    D, I = index.search(query_embedding, top_k)  # Search FAISS index

    results = []
    for idx in I[0]:
        if idx < len(data):
            results.append(data[idx]["Extracted_Text"])  # Retrieve related questions

    return results

# Example: Searching for "Linear Algebra questions from 2024"
query = "Linear Algebra questions from 2024"
retrieved_questions = search_gate_gpt(query)

# Display results
#for i, q in enumerate(retrieved_questions, 1):
#    print(f"🔹 {i}. {q}\n")


In [25]:
def generate_explanation(questions):
    explanation = "Here are the most relevant GATE questions:\n\n"
    for i, q in enumerate(questions, 1):
        explanation += f"{i}. {q}\n\n"
    
    explanation += "To answer these, revise key concepts related to them!"
    return explanation

# Example: Asking GATE GPT
query = "Explain linear algebra questions from 2024"
questions = search_gate_gpt(query)



In [12]:
pip install grobid-client sentence-transformers faiss-cpu jsonlines


Collecting grobid-client
  Downloading grobid_client-0.8.8-py3-none-any.whl.metadata (3.3 kB)
Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting attrs<22.0.0,>=20.1.0 (from grobid-client)
  Downloading attrs-21.4.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting httpx==0.23.0 (from grobid-client)
  Downloading httpx-0.23.0-py3-none-any.whl.metadata (52 kB)
Collecting lxml<5.0.0,>=4.7.1 (from grobid-client)
  Downloading lxml-4.9.4-cp312-cp312-win_amd64.whl.metadata (3.8 kB)
Collecting rfc3986<2,>=1.3 (from rfc3986[idna2008]<2,>=1.3->httpx==0.23.0->grobid-client)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore<0.16.0,>=0.15.0 (from httpx==0.23.0->grobid-client)
  Downloading httpcore-0.15.0-py3-none-any.whl.metadata (15 kB)
Collecting h11>=0.12.0 (from grobid-client)
  Downloading h11-0.12.0-py3-none-any.whl.metadata (8.1 kB)
Collecting anyio==3.* (from httpcore<0.16.0,>=0.15.0->httpx==0.23.0->grobid-

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jsonschema 4.23.0 requires attrs>=22.2.0, but you have attrs 21.4.0 which is incompatible.
jupyterlab 4.2.5 requires httpx>=0.25.0, but you have httpx 0.23.0 which is incompatible.
referencing 0.30.2 requires attrs>=22.2.0, but you have attrs 21.4.0 which is incompatible.


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [15]:
pip install selenium beautifulsoup4 sentence-transformers faiss-cpu jsonlines


Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=19.2.0 (from jsonlines)
  Downloading attrs-25.1.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.29.0-py3-none-any.whl (9.5 MB)
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   -----------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grobid-client 0.8.8 requires attrs<22.0.0,>=20.1.0, but you have attrs 25.1.0 which is incompatible.


In [24]:
##web scraping using selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
import time

# Configure Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode
chrome_driver_path = "C:\\Users\\ASUS\\.wdm\\drivers\\chromedriver\\win64\\133.0.6943.141\\chromedriver-win32/chromedriver.exe"  # Update path to your ChromeDriver

# Initialize WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Website URLs to scrape
urls = [
    "https://www.geeksforgeeks.org/gate-cse-syllabus/",
    "https://www.geeksforgeeks.org/original-gate-previous-year-question-papers-cse-and-it-gq/"
]

scraped_data = []

for url in urls:
    driver.get(url)
    time.sleep(2)  # Give time for page to load

    # Parse HTML using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Extract main content (Modify this based on website structure)
    content_div = soup.find("div", class_="entry-content")

    if content_div:
        text = content_div.get_text(separator=" ", strip=True)
        scraped_data.append({"url": url, "content": text})

# Close WebDriver
driver.quit()

# Save scraped data to JSON
with open("gate_web_data.json", "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, indent=4, ensure_ascii=False)

print("✅ Web scraping completed. Data saved as JSON.")


✅ Web scraping completed. Data saved as JSON.


In [18]:
pip install webdriver-manager


Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [19]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.add_argument("--headless")  # Optional: Run in headless mode

# ✅ Automatically downloads & manages ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

driver.get("https://www.google.com")
print(driver.title)

driver.quit()


Google


In [21]:
import shutil

chrome_driver_path = shutil.which("chromedriver")
if chrome_driver_path:
    print(f"ChromeDriver found at: {chrome_driver_path}")
else:
    print("ChromeDriver not found in PATH")


ChromeDriver not found in PATH


In [22]:
from webdriver_manager.chrome import ChromeDriverManager

chrome_driver_path = ChromeDriverManager().install()
print(f"ChromeDriver path: {chrome_driver_path}")


ChromeDriver path: C:\Users\ASUS\.wdm\drivers\chromedriver\win64\133.0.6943.141\chromedriver-win32/chromedriver.exe


In [29]:
##converting json to df
import pandas as pd
import json

# Load JSON file
with open("gate_web_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Convert JSON to DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

# Save as CSV (optional)
df.to_csv("output.csv", index=False)


Empty DataFrame
Columns: []
Index: []


In [30]:
##displaying json file in jupyter notebook
import json

# Load JSON file
with open("gate_web_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Pretty print JSON
print(json.dumps(data, indent=4))


[]


In [31]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import json

# Configure Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run Chrome in headless mode
chrome_driver_path = "C:\\Users\\ASUS\\.wdm\\drivers\\chromedriver\\win64\\133.0.6943.141\\chromedriver-win32/chromedriver.exe"

# Initialize WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Website URLs to scrape
urls = [
    "https://www.geeksforgeeks.org/gate-cse-syllabus/",
    "https://www.geeksforgeeks.org/original-gate-previous-year-question-papers-cse-and-it-gq/"
]

scraped_data = []

for url in urls:
    driver.get(url)

    # Wait until page loads
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "article"))
    )

    # Parse HTML using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Extract content from article
    article = soup.find("article")
    if article:
        text = article.get_text(separator=" ", strip=True)
        scraped_data.append({"url": url, "content": text})
    else:
        print(f"❌ No content found for {url}")

# Close WebDriver
driver.quit()

# Save scraped data to JSON
with open("gate_web_data.json", "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, indent=4, ensure_ascii=False)

print("✅ Web scraping completed. Data saved as JSON.")


✅ Web scraping completed. Data saved as JSON.


In [35]:
import json

# Load JSON file
with open("gate_web_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Pretty print JSON
#print(json.dumps(data, indent=4))


In [33]:
##converting json to df
import pandas as pd
import json

# Load JSON file
with open("gate_web_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Convert JSON to DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

# Save as CSV (optional)
df.to_csv("output.csv", index=False)


                                                 url  \
0   https://www.geeksforgeeks.org/gate-cse-syllabus/   
1  https://www.geeksforgeeks.org/original-gate-pr...   

                                             content  
0  GATE 2025 Syllabus For CSE (Computer Science &...  
1  GATE CSE and IT Previous Years Papers PDF Down...  


In [34]:
##scraping web inside pdf also
import os
import requests
import json
import fitz  # PyMuPDF for PDF extraction
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Configure Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_driver_path = "C:\\Users\\ASUS\\.wdm\\drivers\\chromedriver\\win64\\133.0.6943.141\\chromedriver-win32/chromedriver.exe"

# Initialize WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.geeksforgeeks.org/gate-cse-syllabus/",
    "https://www.geeksforgeeks.org/original-gate-previous-year-question-papers-cse-and-it-gq/"
]

scraped_data = []

# Create a directory to store PDFs
os.makedirs("gate_pdfs", exist_ok=True)

# Function to download PDFs
def download_pdf(pdf_url, filename):
    pdf_path = os.path.join("gate_pdfs", filename)
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(pdf_path, "wb") as file:
            file.write(response.content)
        return pdf_path
    return None

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join([page.get_text("text") for page in doc])
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

for url in urls:
    driver.get(url)

    # Wait for page to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "article")))

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Extract main content
    article = soup.find("article")
    content_text = article.get_text(separator=" ", strip=True) if article else "No content found"

    # Find all PDF links
    pdf_links = []
    for a_tag in soup.find_all("a", href=True):
        if a_tag["href"].endswith(".pdf"):  # Check if link ends with .pdf
            pdf_links.append(a_tag["href"])

    pdf_texts = []
    for pdf_url in pdf_links:
        filename = pdf_url.split("/")[-1]  # Extract filename from URL
        pdf_path = download_pdf(pdf_url, filename)
        if pdf_path:
            pdf_text = extract_text_from_pdf(pdf_path)
            if pdf_text:
                pdf_texts.append({"pdf_url": pdf_url, "text": pdf_text})

    # Store data
    scraped_data.append({
        "url": url,
        "content": content_text,
        "pdfs": pdf_texts
    })

# Close WebDriver
driver.quit()

# Save data to JSON
with open("gate_web_data_with_pdfs.json", "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, indent=4, ensure_ascii=False)

print("✅ Web scraping & PDF extraction completed. Data saved as JSON.")


MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource

In [37]:
#displating data in json formate
import json

# Load JSON file
with open("gate_web_data_with_pdfs.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Pretty print JSON
print(json.dumps(data, indent=4))

[
    {
        "url": "https://www.geeksforgeeks.org/gate-cse-syllabus/",
        "content": "GATE 2025 Syllabus For CSE (Computer Science & Engineering) Last Updated : 28 Dec, 2024 Comments Improve Suggest changes 127 Likes Like Share Report Follow GATE Exam 2025 Syllabus for CSE \u2013 GATE stands for Graduate Aptitude Test in Engineering , an entrance exam conducted each year for getting admission into the most prestigious institutes across the country including IISc Banglore, IITs, NITs, IIITs, and many others. The GATE authority (IIT Roorkee for this year) has released the official notification regarding the GATE 2025 exam. Candidates who are willingly interested in pursuing their higher studies from one of the most esteemed institutes in the country or have the ambition to join PSUs and want to dig deep into their core sectors can take the GATE examination in 2025. GATE CSE Syllabus 2025 The GATE Syllabus comprises two sections. The first one is the General Aptitude Section whic

In [38]:
##removes image containing image
import os
import json
import fitz  # PyMuPDF
import re
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Configure Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_driver_path = "C:\\Users\\ASUS\\.wdm\\drivers\\chromedriver\\win64\\133.0.6943.141\\chromedriver-win32/chromedriver.exe"

# Initialize WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# List of URLs to scrape
urls = [
    "https://www.geeksforgeeks.org/gate-cse-syllabus/",
    "https://www.geeksforgeeks.org/original-gate-previous-year-question-papers-cse-and-it-gq/"
]

scraped_data = []
os.makedirs("gate_pdfs", exist_ok=True)  # Create directory for PDFs

def download_pdf(pdf_url, filename):
    """Download PDF from URL."""
    pdf_path = os.path.join("gate_pdfs", filename)
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(pdf_path, "wb") as file:
            file.write(response.content)
        return pdf_path
    return None

def contains_unreadable_symbols(text):
    """Check if the extracted text contains unreadable mathematical symbols."""
    math_symbols = r"[∑√π∞∫∆∇⊗⊕≠≤≥±≈≡θλφψΩ∂∃∅⋅]"
    return bool(re.search(math_symbols, text))

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF and remove questions with scanned images or unreadable math symbols."""
    try:
        doc = fitz.open(pdf_path)
        extracted_text = []
        
        for page in doc:
            text = page.get_text("text")
            if text.strip() and not contains_unreadable_symbols(text):
                extracted_text.append(text.strip())
        
        return "\n".join(extracted_text) if extracted_text else None
    
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

def is_image_based_pdf(pdf_path):
    """Check if a PDF contains actual text or is an image-based (scanned) PDF."""
    doc = fitz.open(pdf_path)
    for page in doc:
        text = page.get_text("text")
        if text.strip():  # If readable text exists
            return False  # Not a scanned PDF
    return True  # If no text, it's image-based

for url in urls:
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "article")))
    soup = BeautifulSoup(driver.page_source, "html.parser")

    article = soup.find("article")
    content_text = article.get_text(separator=" ", strip=True) if article else "No content found"

    pdf_links = [a["href"] for a in soup.find_all("a", href=True) if a["href"].endswith(".pdf")]

    pdf_texts = []
    for pdf_url in pdf_links:
        filename = pdf_url.split("/")[-1]
        pdf_path = download_pdf(pdf_url, filename)
        if pdf_path:
            pdf_text = extract_text_from_pdf(pdf_path)
            if pdf_text:  # Only add if it's not a scanned image and has no unreadable symbols
                pdf_texts.append({"pdf_url": pdf_url, "text": pdf_text})
            else:
                print(f"🚨 Skipped unreadable/scanned question: {pdf_url}")

    scraped_data.append({
        "url": url,
        "content": content_text,
        "pdfs": pdf_texts
    })

driver.quit()

# Save data to JSON
with open("gate_web_data_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, indent=4, ensure_ascii=False)

print("✅ Scraping completed. Unreadable questions removed.")


🚨 Skipped unreadable/scanned question: https://media.geeksforgeeks.org/wp-content/cdn-uploads/20211005202724/GATE2011.pdf
🚨 Skipped unreadable/scanned question: https://media.geeksforgeeks.org/wp-content/cdn-uploads/20211005202752/GATE2010.pdf
🚨 Skipped unreadable/scanned question: https://media.geeksforgeeks.org/wp-content/cdn-uploads/20211005202818/GATE2009.pdf
🚨 Skipped unreadable/scanned question: https://media.geeksforgeeks.org/wp-content/cdn-uploads/20211005202849/GATE2008.pdf
MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF error: syntax error: cannot find XObject resource 'Fm0'

MuPDF er

In [39]:
##displaying json
import json

# Load JSON file
with open("gate_web_data_cleaned.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Pretty print JSON
print(json.dumps(data, indent=4))

[
    {
        "url": "https://www.geeksforgeeks.org/gate-cse-syllabus/",
        "content": "GATE 2025 Syllabus For CSE (Computer Science & Engineering) Last Updated : 28 Dec, 2024 Comments Improve Suggest changes 127 Likes Like Share Report Follow GATE Exam 2025 Syllabus for CSE \u2013 GATE stands for Graduate Aptitude Test in Engineering , an entrance exam conducted each year for getting admission into the most prestigious institutes across the country including IISc Banglore, IITs, NITs, IIITs, and many others. The GATE authority (IIT Roorkee for this year) has released the official notification regarding the GATE 2025 exam. Candidates who are willingly interested in pursuing their higher studies from one of the most esteemed institutes in the country or have the ambition to join PSUs and want to dig deep into their core sectors can take the GATE examination in 2025. GATE CSE Syllabus 2025 The GATE Syllabus comprises two sections. The first one is the General Aptitude Section whic

In [40]:
##pdf to json formate
import pdfplumber
import json

def pdf_to_json(pdf_path, json_path):
    data = {"pages": []}
    
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            data["pages"].append({"page": i + 1, "text": text if text else "No text found"})
    
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)

    print(f"✅ PDF converted to JSON: {json_path}")

# Convert PDF to JSON
pdf_to_json("CS224S6.pdf", "output.json")


ModuleNotFoundError: No module named 'pdfplumber'

In [41]:
pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   --- ------------------------------------ 0.5/5.6 MB 4.2 MB/s eta 0:00:02
   ----------- ---------------------------- 1.6/5.6 MB 3.5 MB/s eta 0:00:02
   -------------- ------------------------- 2.1/5.6 MB 3.1 MB/s eta 0:00:02
   ------------------ --------------------- 2.6/5.6 MB 3.3 MB/s eta 0:00:01
   ------------------ --------------------- 2.6/5.6 MB 3.3 MB/s eta 0:00:01
   ------------------ --------------------- 2.6/5.6 MB 3.3 MB/s eta 0:00:01
   ------------------ ---------

In [42]:
##pdf to json formate
import pdfplumber
import json

def pdf_to_json(pdf_path, json_path):
    data = {"pages": []}
    
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            data["pages"].append({"page": i + 1, "text": text if text else "No text found"})
    
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)

    print(f"✅ PDF converted to JSON: {json_path}")

# Convert PDF to JSON
pdf_to_json("CS224S6.pdf", "output.json")


✅ PDF converted to JSON: output.json


In [44]:
#displating data in json formate
import json

# Load JSON file
with open("output.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Pretty print JSON
print(json.dumps(data, indent=4))

{
    "pages": [
        {
            "page": 1,
            "text": "Computer Science and Information Technology Set 2 (CS2)\nGeneral Aptitude (GA)\nQ.1 \u2013 Q.5 Carry ONE mark Each\nQ.1 If \u2018\u2192\u2019 denotes increasing order of intensity, then the meaning of the words\n[walk \u2192 jog \u2192 sprint] is analogous to [bothered \u2192 ________ \u2192 daunted].\nWhich one of the given options is appropriate to fill the blank?\n(A) phased\n(B) phrased\n(C) fazed\n(D) fused\nQ.2 Two wizards try to create a spell using all the four elements, water, air, fire, and\nearth. For this, they decide to mix all these elements in all possible orders. They\nalso decide to work independently. After trying all possible combination of\nelements, they conclude that the spell does not work.\nHow many attempts does each wizard make before coming to this conclusion,\nindependently?\n(A) 24\n(B) 48\n(C) 16\n(D) 12\nPage 1 of 40\nOrganizing Institute: IISc, Bengaluru"
        },
        {
        

In [46]:
##converting into json formate from pdf as per question bifurcation(1 set of year paer)
import pdfplumber
import json
import re

def extract_questions_from_pdf(pdf_path):
    """
    Extracts questions from a PDF and organizes them by question number.
    """
    questions = {}
    current_question = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                lines = text.split("\n")

                for line in lines:
                    # Check if line starts with a question number (e.g., "1.", "2)", "(3)")
                    match = re.match(r"^\(?(\d+)[.)]", line.strip())  
                    if match:
                        current_question = match.group(1)  # Extract question number
                        questions[current_question] = line.strip()  # Start new question entry
                    elif current_question:
                        questions[current_question] += " " + line.strip()  # Append to current question

    return questions

# Convert extracted questions to JSON and save
def save_to_json(data, output_file):
    """
    Saves extracted question data to a JSON file.
    """
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

# Specify the PDF file path and output JSON file
pdf_path = "CS224S6.pdf"  # Change to your actual PDF file path
output_json = "gate_questions.json"

# Extract and save questions
questions_data = extract_questions_from_pdf(pdf_path)
save_to_json(questions_data, output_json)

print(f"✅ PDF converted to JSON successfully! Saved as {output_json}")


✅ PDF converted to JSON successfully! Saved as gate_questions.json


In [48]:
#displating data in json formate
import json

# Load JSON file
with open("gate_questions.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Pretty print JSON
print(json.dumps(data, indent=4))

{
    "1": "(1)",
    "0": "(0) + \ud835\udc5d \ud835\udc5f",
    "2": "(2) uminus (1)",
    "3": "(3)",
    "4": "(4) / \ud835\udc62 \ud835\udc64",
    "5": "(5) + (3) (4)",
    "6": "(6)",
    "7": "(7) = (6) (5) Which one of the following options fills in the missing entries CORRECTLY? (A) (1) =[]\u2061\u2061\ud835\udc60\u2061\u2061\ud835\udc56 (3) * (0) (2) (6) []=\u2061\u2061\ud835\udc65\u2061\u2061\ud835\udc56 (B) (1) []=\u2061\u2061\ud835\udc60\u2061\u2061\ud835\udc56 (3) \u2013\u2061(0)\u2061(2) (6) =[] \u2061\ud835\udc65\u2061\u2061(5) (C) (1) =[] \u2061\ud835\udc60\u2061\ud835\udc56 (3) * (0)\u2061\u2061(2) (6) []= \u2061\ud835\udc65\u2061\u2061(5) (D) (1) []= \u2061\ud835\udc60\u2061\u2061\ud835\udc56 (3) \u2013\u2061(0)\u2061(2) (6) =[] \ud835\udc65\u2061\u2061\ud835\udc56 Page 30 of 40 Organizing Institute: IISc, Bengaluru Computer Science and Information Technology Set 2 (CS2) Q.44 Let \ud835\udc65 and \ud835\udc66 be random variables, not necessarily independent, that ta