In [324]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import logging
import dspy
import os
from dotenv import load_dotenv

In [325]:
# Lift every pandas display limit so frames are rendered in full:
#   display.max_rows     -> show all rows
#   display.max_columns  -> show all columns
#   display.max_colwidth -> never truncate cell contents (older pandas used -1 instead of None)
#   display.width        -> no overall frame-width limit
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.max_colwidth',
    'display.width',
):
    pd.set_option(option_name, None)

In [326]:
# Send ERROR-and-above records to example.log (UTF-8), then create this notebook's logger.
logging.basicConfig(
    level=logging.ERROR,
    filename='example.log',
    encoding='utf-8',
)
logger = logging.getLogger(__name__)

In [327]:
# Load the raw export and keep only the rows whose 'Content' mentions a URL.
# Rows with a missing 'Content' are treated as "no URL" (na=False).
df = pd.read_csv('ch.csv')
has_url = df['Content'].str.contains('http', na=False)
links = df.loc[has_url]

In [328]:
# Drop links that cannot be scraped as HTML text: YouTube URLs and PDF files.
# `str.contains` treats the pattern as a regex, so the dot must be escaped;
# the unescaped '.pdf' also matched any character followed by "pdf".
is_video_or_pdf = links['Content'].str.contains(r'youtu|\.pdf', na=False)

# reset_index() (not in place) returns a fresh frame instead of mutating a slice of
# `links`; the previous index is kept as an 'index' column, exactly as before.
filtered = links[~is_video_or_pdf].reset_index()

In [329]:
def scrape_data(index, url, timeout=30):
    """Download `url` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    index : int
        Position of the URL in the list being scraped; used only in the log message.
    url : str
        Address to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a single stalled
        host cannot hang the whole scraping loop.

    Returns
    -------
    BeautifulSoup or str
        The parsed document on success, or the empty string ``''`` when the
        request or the parsing fails (the failure is written to the log).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures rather than parsing the error page as content.
        response.raise_for_status()
        # Name the parser explicitly so results do not depend on which optional
        # parsers happen to be installed in the environment.
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        # Best-effort scraping: log and return the sentinel instead of aborting the run.
        logger.error(f"Could not parse URL index {index} : {url}\nError_message: {e} \n")
        return ''
    return soup

In [330]:
# Fetch every remaining link; each entry is what scrape_data returned for that URL
# (a parsed page, or '' when the fetch failed).
data = [
    scrape_data(position, link)
    for position, link in enumerate(filtered.Content)
]


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(data)


In [331]:
# Turn each successfully scraped page into one line of plain text.
# NOTE: failed pages are skipped, so `text` is NOT index-aligned with `data` / `filtered`.
text = []
for page in data:
    # scrape_data returns '' for failed fetches and has already logged the reason;
    # skip the sentinel explicitly instead of provoking an AttributeError on `.text`.
    if isinstance(page, str):
        continue
    try:
        # Collapse all whitespace (including newlines) to single spaces so that words
        # separated only by a line break are not fused together.
        text.append(' '.join(page.get_text(separator=' ').split()))
    except Exception as e:
        logger.error(f"Couldnt parse data from the website:\n Error_message:{e} ")

In [332]:
# Load variables from a local .env file into the process environment so that
# os.getenv("GROQ_API_KEY") can find the API key without hardcoding it in the notebook.
load_dotenv()


True

In [333]:
# Build the language-model client (API key taken from the environment) and make it
# the default model for every dspy module created afterwards.
lm = dspy.LM(
    "groq/qwen/qwen3-32b",
    api_key=os.environ.get("GROQ_API_KEY"),
)
dspy.configure(lm=lm)

In [334]:
from typing import Literal


# Declarative dspy signature: one input (the scraped page text) and a fixed set of
# output fields describing the resource.
# NOTE(review): dspy presumably feeds the class docstring and each `desc` string to the
# model as instructions, so treat them as runtime text, not as ordinary documentation.
class ResourceDetails(dspy.Signature):
    """Extract detailed information from a technology-related content resource (e.g., blog, video, tutorial, research paper)."""
    
    # --- input ---
    resource_text: str = dspy.InputField()

    # --- identification ---
    title: str = dspy.OutputField(desc="Title of the content resource")
    author_or_creator: str = dspy.OutputField(desc="Name of the author, creator, or publisher")
    # Closed set of categories. In the saved results, books, courses and cheat sheets
    # all end up as 'Other' because no closer option exists.
    content_type: Literal['Video', 'Blog', 'Research Paper', 'Tutorial','Roadmap', 'Other'] = dspy.OutputField(desc="Type of the content")
    
    # --- topic ---
    tech_field: str = dspy.OutputField(desc="Main field of technology: e.g., AI, Data Science, Web Dev, Cloud, etc.")
    subfield: str = dspy.OutputField(desc="Subfield of the main tech domain: e.g., NLP, LLM, Vision, Backend, Frontend, MLOps, etc.")
    
    # --- technologies (free-form strings, not lists) ---
    programming_languages: str = dspy.OutputField(desc="Programming languages mentioned or used in the content (e.g., Python, JavaScript, Rust, etc.)")
    technical_tools_and_libraries: str = dspy.OutputField(desc="Tools, frameworks, and libraries referenced (e.g., TensorFlow, PyTorch, React, Docker)")
    
    summary: str = dspy.OutputField(desc="3-line concise summary of the resource’s purpose and content")
    
    # --- audience and logistics ---
    difficulty_level: Literal['Beginner', 'Intermediate', 'Advanced'] = dspy.OutputField(desc="Target skill level required for understanding the content")
    publication_date: str = dspy.OutputField(desc="When the resource was published or created")
    estimated_time_to_complete: str = dspy.OutputField(desc="Estimated time to watch/read/complete the resource")
    
    tags: str = dspy.OutputField(desc="Relevant tags or keywords for the content")
    # NOTE(review): the URL is an *output* here, yet the only input is the page text, so
    # the model has to infer it (the saved results contain values such as
    # "https://javascript.info (assumed based on context)"). Consider carrying the
    # scraped URL through the pipeline instead.
    resource_link: str = dspy.OutputField(desc="URL or location to access the resource")

    notable_insights: str = dspy.OutputField(desc="Unique ideas, breakthroughs, or standout techniques introduced in the resource")


In [335]:
# Wrap the signature in a chain-of-thought module. The saved results carry an extra
# `reasoning` column in addition to the output fields declared on ResourceDetails.
resource_parser = dspy.ChainOfThought(ResourceDetails)

In [336]:
from transformers import pipeline
import numpy as np
# Zero-shot text classifier; classify_accessibility() uses it to decide whether a scraped
# page holds real content. The cell output shows it running on CPU in this environment.
classifier = pipeline("zero-shot-classification", "r-f/ModernBERT-large-zeroshot-v1")


Device set to use cpu


In [337]:
def classify_accessibility(text):
    """Label scraped page text as ``'accessible'`` or ``'not_accessible'``.

    Parameters
    ----------
    text : str
        Plain text extracted from a scraped page.

    Returns
    -------
    str
        The candidate label with the highest classifier score. Empty,
        whitespace-only or non-string input is reported as ``'not_accessible'``
        without calling the classifier, since there is nothing to classify.
    """
    # Guard first: the caller invokes this outside any try/except, so a classifier
    # error on empty input would abort the whole extraction loop.
    if not isinstance(text, str) or not text.strip():
        return "not_accessible"

    candidate_labels = ["accessible", "not_accessible"]
    output = classifier(text, candidate_labels, multi_label=False)
    # Pick by score rather than by position so the result does not depend on the
    # order in which the pipeline returns the labels.
    best = int(np.argmax(output['scores']))
    return output['labels'][best]


In [338]:
import pandas as pd
import time

# Pause after every LLM call, in seconds (presumably to respect the provider's rate limit).
REQUEST_DELAY_SECONDS = 15

# Run the extractor on every page the classifier considers real content and collect
# one record (dict) per successful extraction.
records = []
for t in text:
    if classify_accessibility(t) != 'accessible':
        continue
    try:
        prediction = resource_parser(resource_text=t)
    except Exception as e:
        # Best-effort: log the failure and move on to the next page.
        logger.error(f"failed to parse data {e}")
    else:
        records.append(prediction.toDict())
    finally:
        # Pause after failures as well, so a throttled request is not
        # immediately followed by another one.
        time.sleep(REQUEST_DELAY_SECONDS)

if records:
    # Build the frame once from all records instead of concatenating inside the loop.
    # NOTE: this rebinds `df`, which until now held the raw input read from ch.csv.
    df = pd.DataFrame(records)
    df.to_csv('resources.csv', index=False)
else:
    # Without this guard `df` would still be the raw input frame and would be
    # written to resources.csv as if it were the extraction result.
    logger.error("No resources were extracted; resources.csv was not written")




In [339]:
# Read the extraction results back from disk to check what was actually saved.
dt = pd.read_csv('resources.csv')

In [340]:
# Show the whole table; the display limits were lifted with pd.set_option earlier in the notebook.
dt

Unnamed: 0,reasoning,title,author_or_creator,content_type,tech_field,subfield,programming_languages,technical_tools_and_libraries,summary,difficulty_level,publication_date,estimated_time_to_complete,tags,resource_link,notable_insights
0,"The resource is a Java and Object-Oriented Programming (OOP) cheat sheet created by son9912. It covers core Java syntax, OOP concepts, data types, operators, control structures, string handling, input methods, access modifiers, abstract classes, interfaces, encapsulation, inheritance, and collections. The content is structured for quick reference, with practical examples and explanations tailored for learners. The publication date is explicitly stated as September 25th, 2017, and the resource is categorized as a cheat sheet, which falls under the ""Other"" content type. The technical depth and examples suggest an intermediate difficulty level.",Java + OOP Concept Cheat Sheet,son9912,Other,Java,Object-Oriented Programming (OOP),Java,"java.util.Scanner, java.lang.Math, java.util.Collection (HashSet, LinkedList, HashMap)","This cheat sheet provides a concise reference for Java programming fundamentals and OOP concepts. It includes syntax examples for variables, loops, conditionals, string manipulation, input handling, access modifiers, abstract classes, interfaces, and collection frameworks. Designed for quick lookup, it emphasizes practical code snippets and memory optimization techniques like the String Pool. Targeted at learners transitioning from basics to intermediate Java development.",Intermediate,"25th September, 2017",Not specified (likely 15-30 minutes for review),"Java, OOP, programming, cheat sheet, object-oriented, collections, access modifiers, inheritance, encapsulation",https://cheatography.com/son9912/cheat-sheets/java-oop-concept/,"- Practical examples for Java syntax and OOP principles.\n- Explanation of memory optimization via String Pool.\n- Clear differentiation between abstract classes and interfaces.\n- Coverage of collection frameworks (HashSet, LinkedList, HashMap) with usage examples.\n- Emphasis on naming conventions and access modifiers for code readability."
1,"The resource is a structured learning roadmap for mastering Data Structures and Algorithms (DSA) provided by GeeksforGeeks. It outlines a 5-step process, including language prerequisites, core concepts, libraries, and problem-solving strategies. The content is organized into sequential phases, targeting beginners and emphasizing practical implementation. Key details like programming languages, tools, and subfields are explicitly mentioned in the text.",Complete Roadmap To Learn DSA,GeeksforGeeks,Roadmap,Data Structures and Algorithms,"Data Structures, Algorithms","Python, Java, C++, JavaScript","Standard Template Library (STL) for C++, Java's java.util package, Python's collections and heapq modules","This roadmap provides a structured 5-step guide to learn DSA, starting with programming language fundamentals, progressing to core DSA concepts, and culminating in advanced problem-solving. It emphasizes practical implementation, complexity analysis, and leveraging built-in libraries for efficiency. Designed for beginners, it prepares learners for technical interviews and real-world applications.",Beginner,"27 Jul, 2025",Not explicitly specified (self-paced),"DSA, Data Structures, Algorithms, Learning Roadmap, Technical Interview Prep, Programming Fundamentals",https://www.geeksforgeeks.org/complete-roadmap-to-learn-dsa/,"- Structured 5-step approach: language basics → logic building → DSA fundamentals → libraries → advanced problem-solving.\n- Emphasis on complexity analysis (Big O, Omega, Theta notations).\n- Language-specific prerequisites for C, C++, Java, Python, and JavaScript.\n- Integration of practical problem-solving via platforms like GeeksforGeeks and the SDE Sheet for interview preparation."
2,"The resource is a comprehensive JavaScript tutorial titled ""The Modern JavaScript Tutorial,"" covering both foundational and advanced topics. It is structured into three main parts: the JavaScript language, browser interactions, and additional thematic articles. The content is designed for learners starting from scratch and progressing to advanced concepts like OOP, closures, and async programming. The author is Ilya Kantor, as indicated by the copyright notice. The tutorial is open-source and multilingual, emphasizing accessibility. No specific tools or libraries are highlighted, but the focus is on core JavaScript and browser APIs. The publication date is listed as August 5, 2025, though this may be a typo. Difficulty is ""Beginner"" due to its step-by-step approach.",The Modern JavaScript Tutorial,Ilya Kantor,Tutorial,Web Development,JavaScript,JavaScript,None explicitly mentioned,"A structured tutorial covering JavaScript fundamentals to advanced topics like OOP, closures, and async programming. Includes browser DOM manipulation, events, and additional thematic articles. Designed for learners from scratch to advanced levels.",Beginner,"August 5, 2025",Not specified,"JavaScript, Web Development, DOM, OOP, Async Programming, Browser APIs",https://javascript.info (assumed based on context),"Structured three-part approach (language, browser, additional topics), emphasis on core JavaScript principles, and browser-specific interfaces. Highlights accessibility through multilingual support and open-source availability."
3,"The resource describes a free online Java programming course offered by the University of Helsinki. It is structured into two main parts (Java Programming I and II), each with multiple sections and exercises. The course emphasizes foundational programming concepts, object-oriented programming, and uses industry-standard tools like an IDE. The content is categorized as a course, but since ""Course"" is not an explicit option in the content_type field, ""Other"" is selected. The technical tools include an IDE and the Test My Code platform. The difficulty level is Beginner due to the lack of prerequisites. Publication date is unspecified, but the course is marked as legacy. Tags and notable insights are derived from the course description and structure.",Java Programming,University of Helsinki (Agile Education Research Group),Other,Programming,Object-Oriented Programming (OOP),Java,"Test My Code, Integrated Development Environment (IDE)","A free online course teaching Java programming fundamentals, algorithms, and object-oriented programming. It includes structured lessons, exercises, and industry-grade tools. Designed for beginners with no prior programming knowledge required.",Beginner,Legacy (exact date unspecified),"Approximately 5-20 hours per part (14 parts total), with a recommendation of 10 hours per part.","Java, Programming, OOP, Object-Oriented Programming, MOOC, Test My Code, IDE",https://www.mooc.fi/en/profile/completions,"- Uses industry-grade IDE from day one, avoiding browser-based or educational IDEs. \n- Comprehensive materials with automatic testing via Test My Code. \n- Structured into two courses (I and II) with 14 parts total, mirroring university-level programming courses. \n- No prerequisites required, making it accessible to absolute beginners."
4,"The resource is a blog post titled ""Best Free Resources to Learn Golang – Go Coding Courses"" by Avdhoot Fulsundar. It lists five free resources for learning Golang, including courses, interactive tutorials, and project-based learning. The content focuses on Golang programming, covering fundamentals, projects, and tools. The target audience is beginners, with no prior Go knowledge required. The publication date is October 11, 2022, and the content type is a blog. Technical tools mentioned include Go (Golang) itself, but no specific frameworks or libraries are highlighted beyond the language. The summary emphasizes free learning paths and hands-on projects.",Best Free Resources to Learn Golang – Go Coding Courses,Avdhoot Fulsundar,Blog,Programming,Golang (Go),Go (Golang),None explicitly mentioned beyond Go language itself,"This blog post highlights five free resources to learn Golang, including project-based courses (FreeCodeCamp, Gophercises), interactive tutorials (StudyTonight), and official guides (Effective Go). It emphasizes hands-on learning through 20+ projects, environment setup, and core concepts like concurrency and standard libraries. Ideal for beginners with no prior Go experience.",Beginner,"October 11, 2022",Variable (depends on learner's pace),"Golang, Go, freeCodeCamp, StudyTonight, Effective Go, Traversy Media, Gophercises, programming tutorials, free resources",https://www.freecodecamp.org/news/best-free-resources-to-learn-golang/,- Project-based learning with 11+ projects in FreeCodeCamp's course\n- Interactive browser-based learning via StudyTonight\n- Official Go guide (Effective Go) for syntax and best practices\n- Hands-on practice with Gophercises (20+ projects)\n- Crash course for absolute beginners (Traversy Media)
5,"The resource text describes ""The Markdown Guide,"" a free and open-source reference for learning Markdown syntax. It is structured as a guide with sections for beginners and advanced users, focusing on teaching Markdown formatting. The content type is categorized as ""Other"" since it is a reference guide rather than a video, blog, etc. The tech field is ""Documentation"" as Markdown is primarily used for documentation purposes. No specific programming languages are required, but Markdown itself is the tool being taught. The summary highlights its purpose as a comprehensive resource for Markdown, suitable for all skill levels.",Markdown Guide,Matt Cone,Other,Documentation,Markdown Syntax,,Markdown,"The Markdown Guide is a free, open-source resource teaching Markdown syntax for formatting documents. It includes beginner tutorials, advanced syntax, and practical tools. Designed for all skill levels, it emphasizes simplicity and web writing efficiency.",Beginner,2025,Not specified,"Markdown, Documentation, Syntax, Writing Tools, Open Source",,"The guide emphasizes structured learning through a progressive approach, combining basic syntax with extended features like tables and code blocks. Its open-source nature and CC BY-SA 4.0 license encourage community contributions and adaptability for diverse use cases."
6,"The resource is the official book for the Rust programming language, authored by key contributors and maintained by the Rust community. It provides comprehensive guidance on Rust programming, including installation, syntax, and best practices. The content type is categorized as ""Other"" since it's a book, not a video, blog, etc. The technical field is Rust programming, with subfields in systems programming and safe concurrency. The publication date is inferred from the Rust version mentioned (1.85.0, released 2025-02-17).",The Rust Programming Language,"Steve Klabnik, Carol Nichols, Chris Krycho, and the Rust Community",Other,Systems Programming,"Rust, Safe Concurrency, Memory Safety",Rust,"Cargo, Rust Compiler, Rustup","Official guide to the Rust programming language, covering installation, syntax, and idiomatic practices. Assumes Rust 1.85.0+ and the 2024 edition. Available online, in print, and as an interactive version with quizzes and visualizations.",Intermediate,2025-02-17,Varies (comprehensive reference),"Rust, Systems Programming, Memory Safety, Concurrency, Cargo",https://doc.rust-lang.org/stable/book/,- Emphasis on Rust 2024 edition idioms\n- Interactive learning option via https://rust-book.cs.brown.edu\n- Community-driven translations and contributions
7,"The resource is a tutorial on learning wgpu, a Rust implementation of the WebGPU API. It explains the purpose of wgpu, its relation to WebGPU and low-level graphics APIs, and outlines tutorial sections covering dependencies, rendering concepts, and advanced topics. The content is structured as a step-by-step guide with prerequisites and acknowledgments.",Learn Wgpu,Author not explicitly stated in the text; contribution is accepted via the GitHub repository (linked in the text).,Tutorial,Graphics Programming,WebGPU,Rust,"wgpu, WebGPU, Vulkan, DirectX, Metal","This tutorial introduces wgpu, a Rust implementation of the WebGPU API, explaining its purpose and relationship to low-level graphics APIs. It covers setup, rendering concepts like pipelines and textures, and advanced topics such as lighting and HDR rendering. Assumes prior Rust knowledge and familiarity with Cargo.",Intermediate,7/3/2025,Not specified,"Rust, WebGPU, Graphics Programming, wgpu, Vulkan, DirectX, Metal","GitHub repo (link not explicitly provided in text, but referenced in contribution section)",- wgpu's cross-platform support via translation to DirectX/Metal/Vulkan\n- Emphasis on Rust's safety and convenience for GPU programming\n- Structured tutorial progression from beginner to intermediate topics
8,The provided resource_text is an error message from Google Docs indicating that access requires enabling cookies or signing in. There is no actual content or technical information to extract. The fields are filled based on the context of the error message and the lack of substantive content.,Google Docs Access Error: Cookies or Sign-In Required,Google,Other,Web Services,Authentication,,,"This is an access restriction notice from Google Docs, requiring users to enable cookies or sign in to their Google Account to view content. No technical content or educational material is provided.",Beginner,Unknown,,"Google Docs, Authentication, Error Message, Access Control",https://docs.google.com (example placeholder),The message highlights common web authentication practices and cookie policies but does not provide technical insights or educational value.
9,"The resource is a comprehensive guide on Spigot plugin development from the SpigotMC community wiki. It covers creating plugins using Java and Kotlin, setup with IDEs like IntelliJ and Eclipse, build tools (Gradle/Maven), database integration, event handling, and debugging. The content is structured as tutorials and references for developers of all skill levels, with a focus on practical implementation.",Spigot Plugin Development,SpigotMC,Tutorial,Game Development,Plugin Development,"Java, Kotlin","Gradle, Maven, Eclipse, IntelliJ IDEA, MongoDB, MySQL, WorldEdit API, Javadoc, WatchWolf, Morphia","A step-by-step tutorial series for creating Minecraft Spigot/Bukkit plugins. Covers plugin setup, command creation, event handling, database integration, debugging, and advanced topics like NMS version compatibility. Includes IDE-specific guides and external libraries integration.",Beginner,"Feb 3, 2024",Varies (multi-section tutorial),"Spigot, Bukkit, Java, Kotlin, Plugin Development, Minecraft, Gradle, Maven, Game Development",https://www.spigotmc.org/wiki/spigot-plugin-development/,- Detailed Gradle/Maven build configuration guides\n- Cross-IDE support (Eclipse/IntelliJ/VS Code)\n- Database integration tutorials (MySQL/MongoDB)\n- Version-specific NMS compatibility notes\n- Debugging techniques for different IDEs\n- Kotlin plugin development integration
