In [1]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the website to scrape
url = "https://www.geeksforgeeks.org/gate-cse-syllabus/?utm_source=chatgpt.com"

# Send a request to fetch the webpage
headers = {"User-Agent": "Mozilla/5.0"}  # Prevent blocking
response = requests.get(url, headers=headers)
html_content = response.text

# Parse HTML using BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Extract syllabus topics
syllabus_sections = soup.find_all("h2")  # Assuming major topics are in <h2> tags
syllabus_data = []

for section in syllabus_sections:
    topic = section.text.strip()
    subtopics = []
    
    # Find the next <ul> (unordered list) after <h2> for subtopics
    ul = section.find_next("ul")
    if ul:
        subtopics = [li.text.strip() for li in ul.find_all("li")]
    
    syllabus_data.append({"Topic": topic, "Subtopics": ", ".join(subtopics)})

# Convert to DataFrame
df = pd.DataFrame(syllabus_data)

# Display the DataFrame
print(df)

# Save to CSV file
df.to_csv("gate_cse_syllabus.csv", index=False)

print("✅ Syllabus scraped and saved to 'gate_cse_syllabus.csv'!")


                                           Topic  \
0                    More previous year Question   
1                         GATE CSE Syllabus 2025   
2      GATE 2025 Expected Subject-Wise Weightage   
3                           GATE 2025 Exam Dates   
4                         GATE 2025 Exam Pattern   
5                  How to Prepare for GATE 2025?   
6                                     Conclusion   
7                  GATE 2025 CSE Syllabus – FAQs   
8  What kind of Experience do you want to share?   

                                           Subtopics  
0  GATE-CS-2015 (Set 1), GATE-CS-2015 (Set 2), GA...  
1  Propositional and First-Order Logic, Sets, Rel...  
2  Section A: General Aptitude – This section com...  
3  Section A: General Aptitude – This section com...  
4  Section A: General Aptitude – This section com...  
5  Make Proper Planning: It’s always good to have...  
6                                     GATE CS, GBlog  
7                                     G

In [3]:
import requests
from bs4 import BeautifulSoup
import json

# Function to scrape syllabus from GeeksforGeeks
def scrape_gate_syllabus():
    url = "https://www.geeksforgeeks.org/gate-cse-syllabus/?utm_source=chatgpt.com"
    
    # Send request to fetch webpage
    headers = {"User-Agent": "Mozilla/5.0"}  # Prevents blocking by the server
    response = requests.get(url, headers=headers)
    
    # Parse HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    
    syllabus_data = []  # List to store structured syllabus data

    # Extract syllabus topics
    syllabus_sections = soup.find_all("h2")  # Assuming main topics are in <h2> tags
    
    for section in syllabus_sections:
        topic = section.text.strip()  # Extract topic name
        subtopics = []
        
        # Find the next <ul> (unordered list) after <h2> for subtopics
        ul = section.find_next("ul")
        if ul:
            subtopics = [li.text.strip() for li in ul.find_all("li")]
        
        syllabus_data.append({
            "Topic": topic,
            "Subtopics": subtopics
        })

    return syllabus_data  # Returns the structured syllabus data

# Function to save data in JSON format
def save_to_json(data, filename="gate_cse_syllabus.json"):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)  # Pretty-print JSON
    
    print(f"✅ Syllabus saved as '{filename}'!")

# Main execution
syllabus = scrape_gate_syllabus()  # Scrape syllabus data
save_to_json(syllabus)  # Save to JSON


✅ Syllabus saved as 'gate_cse_syllabus.json'!


In [6]:
import json

# Load JSON file
with open("gate_cse_syllabus.json", "r", encoding="utf-8") as f:
    data = json.load(f)  # Load as Python dictionary

# Pretty-print JSON
print(json.dumps(data, indent=4))


[
    {
        "Topic": "More previous year Question",
        "Subtopics": [
            "GATE-CS-2015 (Set 1)",
            "GATE-CS-2015 (Set 2)",
            "GATE-CS-2015 (Set 3)",
            "GATE-CS-2014-(Set-1)",
            "GATE-CS-2014-(Set-2)",
            "GATE-CS-2014-(Set-3)",
            "GATE CS 2013",
            "GATE CS 2012",
            "GATE CS 2011",
            "GATE CS 2010",
            "GATE-CS-2009",
            "GATE CS 2008",
            "GATE-CS-2007",
            "GATE-CS-2006",
            "GATE-CS-2004",
            "Gate IT 2005",
            "GATE-CS-2003",
            "GATE-CS-2002",
            "GATE-CS-2001",
            "GATE-CS-2000"
        ]
    },
    {
        "Topic": "GATE CSE Syllabus 2025",
        "Subtopics": [
            "Propositional and First-Order Logic",
            "Sets, Relations, Functions,Partial Orders, and Lattices",
            "Monoids, Groups",
            "Graphs: Connectivity, Matching, Coloring",
            "Com

In [1]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.2-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading sentence_transformers-3.4.1

In [3]:
import json
from sentence_transformers import SentenceTransformer

# Load the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight and efficient model

# Function to generate embeddings
def generate_embedding(text):
    return model.encode(text).tolist()  # Convert NumPy array to list for JSON storage

# Load JSON data with error handling
try:
    with open("gate_cse_syllabus.json", "r", encoding="utf-8") as f:
        data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError) as e:
    print(f"❌ Error loading JSON file: {e}")
    exit(1)  # Stop execution if file is missing or invalid

# Convert each topic + subtopics to embeddings
for item in data:
    # Ensure the required keys exist
    topic = item.get("Topic", "Unknown Topic")
    subtopics = item.get("Subtopics", [])
    
    # Combine topic and subtopics (handle empty lists)
    text = topic if not subtopics else f"{topic} - {', '.join(subtopics)}"
    
    # Generate embedding
    item["embedding"] = generate_embedding(text)

# Save updated JSON with embeddings
output_file = "gate_syllabus_embeddings.json"
try:
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    print(f"✅ JSON successfully converted to embeddings and saved as {output_file}!")
except Exception as e:
    print(f"❌ Error saving JSON file: {e}")


✅ JSON successfully converted to embeddings and saved as gate_syllabus_embeddings.json!


In [9]:
import json

# Load the JSON file
with open("gate_syllabus_embeddings.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Pretty-print JSON

from pprint import pprint
pprint(data)


In [5]:
pip install faiss-cpu sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl (13.7 MB)
   ---------------------------------------- 0.0/13.7 MB ? eta -:--:--
    --------------------------------------- 0.3/13.7 MB ? eta -:--:--
   --- ------------------------------------ 1.0/13.7 MB 3.1 MB/s eta 0:00:05
   ---- ----------------------------------- 1.6/13.7 MB 2.8 MB/s eta 0:00:05
   ------- -------------------------------- 2.6/13.7 MB 3.6 MB/s eta 0:00:04
   -------- ------------------------------- 2.9/13.7 MB 3.4 MB/s eta 0:00:04
   ----------- ---------------------------- 3.9/13.7 MB 3.6 MB/s eta 0:00:03
   ------------- -------------------------- 4.7/13.7 MB 3.6 MB/s eta 0:00:03
   ---------------- ----------------------- 5.5/13.7 MB 3.5 MB/s eta 0:00:03
   ----------------- ---------------------- 6.0/13.7 MB 3.6 MB/s eta 0:00:03
   -------------------- ------------------- 7.1/13.7 MB 3.6 MB/s eta 0:00:02
   ---

In [7]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load JSON file
with open("gate_syllabus_embeddings.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract stored embeddings and topics
embeddings = []
topics = []

for item in data:
    topics.append(item["Topic"])  # Store topics
    embeddings.append(np.array(item["embedding"], dtype="float32"))  # Convert to NumPy array

# Convert embeddings list to a FAISS index
embedding_dim = len(embeddings[0])  # Get embedding size (e.g., 384)
faiss_index = faiss.IndexFlatL2(embedding_dim)  # Create FAISS index
faiss_index.add(np.array(embeddings))  # Add embeddings to FAISS

# 🔹 Step 3: Convert Query to Embedding
query = "Linear Algebra GATE 2024 questions"
query_embedding = model.encode(query).astype("float32")

# 🔹 Step 4: Search for the Closest Match
k = 3  # Number of closest matches to retrieve
distances, indices = faiss_index.search(np.array([query_embedding]), k)

# 🔹 Step 5: Display Matching Topics
print(f"🔍 Closest matches for '{query}':\n")
for i, idx in enumerate(indices[0]):
    print(f"{i+1}. Topic: {topics[idx]} (Distance: {distances[0][i]:.4f})")


🔍 Closest matches for 'Linear Algebra GATE 2024 questions':

1. Topic: Conclusion (Distance: 1.1313)
2. Topic: GATE 2025 Exam Dates (Distance: 1.1783)
3. Topic: GATE 2025 Exam Pattern (Distance: 1.2214)


In [8]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load JSON file
with open("gate_syllabus_embeddings.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract stored embeddings and topics
embeddings = []
topics = []

for item in data:
    topics.append(item["Topic"])  # Store topics
    embeddings.append(np.array(item["embedding"], dtype="float32"))  # Convert to NumPy array

# Convert embeddings list to a FAISS index
embedding_dim = len(embeddings[0])  # Get embedding size (e.g., 384)
faiss_index = faiss.IndexFlatL2(embedding_dim)  # Create FAISS index
faiss_index.add(np.array(embeddings))  # Add embeddings to FAISS

# 🔹 Step 3: Convert Query to Embedding
query = "Linear Algebra GATE 2024 questions"
query_embedding = model.encode(query).astype("float32")

# 🔹 Step 4: Search for the Closest Match
k = 3  # Number of closest matches to retrieve
distances, indices = faiss_index.search(np.array([query_embedding]), k)

# 🔹 Step 5: Display Matching Topics
print(f"\n🔍 Closest matches for '{query}':\n")
retrieved_topics = []
retrieved_distances = []

for i, idx in enumerate(indices[0]):
    topic_name = topics[idx]
    topic_distance = distances[0][i]
    
    retrieved_topics.append(topic_name)
    retrieved_distances.append(topic_distance)
    
    print(f"{i+1}. Topic: {topic_name} (Distance: {topic_distance:.4f})")

# 🔹 Step 6: Display Conclusions
print("\n📌 **Conclusion:**")
best_match = retrieved_topics[0]
best_distance = retrieved_distances[0]

if best_distance < 0.5:
    print(f"✅ The best match for your query is **'{best_match}'**, which is highly relevant.")
else:
    print(f"⚠️ The best match **'{best_match}'** is somewhat relevant but may not be a perfect fit.")

# Check if multiple topics are closely related
if len(set(retrieved_topics)) > 1:
    print("🔹 Other related topics that might be helpful:")
    for i in range(1, len(retrieved_topics)):
        print(f"   - {retrieved_topics[i]} (Distance: {retrieved_distances[i]:.4f})")

print("\n🎯 Next Step: You can explore past GATE 2024 questions related to these topics!")



🔍 Closest matches for 'Linear Algebra GATE 2024 questions':

1. Topic: Conclusion (Distance: 1.1313)
2. Topic: GATE 2025 Exam Dates (Distance: 1.1783)
3. Topic: GATE 2025 Exam Pattern (Distance: 1.2214)

📌 **Conclusion:**
⚠️ The best match **'Conclusion'** is somewhat relevant but may not be a perfect fit.
🔹 Other related topics that might be helpful:
   - GATE 2025 Exam Dates (Distance: 1.1783)
   - GATE 2025 Exam Pattern (Distance: 1.2214)

🎯 Next Step: You can explore past GATE 2024 questions related to these topics!
