# Structured DataFrame from AI/ML Full-Stack Syllabus
Parse `Complete_AI_ML_FullStack_Syllabus_Final.docx` and create a hierarchical DataFrame with a **Main Topic → Unit → Topic → Sub-topic** structure (one row per sub-topic; the standalone "PART N" lines are skipped).

In [None]:
import pandas as pd
from docx import Document
import re

# Load the document
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine without editing it.
doc = Document(r"D:\Personal_App\AI_APP_V1\Complete_AI_ML_FullStack_Syllabus_Final.docx")

# Known main topic titles (ALL CAPS section headers)
# The parsing loop compares each stripped paragraph text against this list with
# an exact, case-sensitive `in` test, so entries must match the document's
# header text character for character.
main_topics = [
    "STATISTICS FOR AI & MACHINE LEARNING",
    "MACHINE LEARNING",
    "PYTHON FOR BACKEND DEVELOPMENT",
    "REACT FOR FRONTEND DEVELOPMENT",
    "CLOUD DEPLOYMENT: AWS, GCP & AZURE",
    "ADDITIONAL ESSENTIAL TOPICS",
    "PROJECT MANAGEMENT",
    "PRODUCT ENGINEERING",
    "ADVANCED AI & DATA TECHNOLOGIES",
    "COMPUTER SCIENCE FUNDAMENTALS",
    "DATA PLATFORMS & VISUALIZATION",
    "EXTENDED FULL-STACK & PRACTICAL SKILLS",
    "EMERGING TECHNOLOGIES & FRONTIER SKILLS",
    "GENERATIVE AI & AGENTIC SYSTEMS",
]

# Walk the paragraphs once, tracking the enclosing main topic / unit / topic,
# and emit one record per sub-topic bullet found under a numbered topic.
records = []
current_main_topic = ""
current_unit = ""
current_topic_title = ""  # full heading text, number included (e.g. "1.1 Meaning, Scope & Role of Statistics")

# Numbered topic headings: "<digits>.<digits> <title>" (e.g. "1.1 ...", "100.3 ...")
topic_pattern = re.compile(r'^(\d+\.\d+)\s+(.*)')

for para in doc.paragraphs:
    line_text = para.text.strip()
    if not line_text:
        # Blank paragraphs carry no structure.
        continue

    # Paragraphs without a style object are labelled with the string "None".
    if para.style:
        para_style = para.style.name
    else:
        para_style = "None"
    is_list_item = para_style == "List Paragraph"

    if line_text in main_topics:
        # New main topic: everything nested below it starts over.
        current_main_topic = line_text
        current_unit = ""
        current_topic_title = ""
    elif re.match(r'^PART\s+\d+$', line_text, re.IGNORECASE):
        # Standalone "PART X" lines are ignored.
        pass
    elif para_style == "Heading 2":
        # Unit/Module heading: resets the topic beneath it.
        current_unit = line_text
        current_topic_title = ""
    elif not is_list_item:
        # Non-bullet text only matters when it is a numbered topic heading.
        if topic_pattern.match(line_text):
            current_topic_title = line_text
    elif current_topic_title:
        # Bullet under a known topic -> one sub-topic row.
        # Bullets seen before any topic heading are dropped.
        records.append({
            "Main_Topic": current_main_topic,
            "Unit": current_unit,
            "Topic_Title": current_topic_title,
            "Sub_Topic": line_text,
        })

# Build the frame with an explicit column list. Without it, an empty `records`
# list yields a DataFrame with no columns at all, and the column lookups below
# (as well as any later groupby on "Main_Topic") would raise KeyError. For
# non-empty input the result is unchanged: the columns are the record keys in
# the same order.
topic_df = pd.DataFrame(
    records,
    columns=["Main_Topic", "Unit", "Topic_Title", "Sub_Topic"],
)

# Sanity report: distinct values per hierarchy level; every row is one sub-topic.
print(f"DataFrame Shape: {topic_df.shape}")
print(f"Main Topics: {topic_df['Main_Topic'].nunique()}")
print(f"Units/Modules: {topic_df['Unit'].nunique()}")
print(f"Topics: {topic_df['Topic_Title'].nunique()}")
print(f"Sub-Topics: {len(topic_df)}")

# Preview the first rows as the cell's rich output.
topic_df.head(10)

DataFrame Shape: (2022, 4)
Main Topics: 14
Units/Modules: 132
Topics: 321
Sub-Topics: 2022


Unnamed: 0,Main_Topic,Unit,Topic_Title,Sub_Topic
0,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,"1.1 Meaning, Scope & Role of Statistics",Definition and scope of statistics as a discip...
1,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,"1.1 Meaning, Scope & Role of Statistics",Descriptive vs. inferential statistics
2,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,"1.1 Meaning, Scope & Role of Statistics",Role of statistics in scientific research and ...
3,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,"1.1 Meaning, Scope & Role of Statistics",Historical evolution from census data to moder...
4,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,1.2 Importance of Statistics in AI & Machine L...,Statistical thinking as the backbone of ML alg...
5,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,1.2 Importance of Statistics in AI & Machine L...,"How statistical concepts drive model training,..."
6,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,1.2 Importance of Statistics in AI & Machine L...,"Statistics in feature engineering, hypothesis ..."
7,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,1.2 Importance of Statistics in AI & Machine L...,Bridging domain knowledge with data through st...
8,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,1.3 Types of Data,Qualitative (categorical) vs. Quantitative (nu...
9,STATISTICS FOR AI & MACHINE LEARNING,Unit 1: Statistical Foundations,1.3 Types of Data,Discrete vs. Continuous variables


In [None]:
# Per-main-topic rollup: number of distinct units, number of distinct topic
# titles, and number of non-null sub-topic rows.
summary = (
    topic_df
    .groupby("Main_Topic")
    .agg(
        Units=pd.NamedAgg(column="Unit", aggfunc="nunique"),
        Topics=pd.NamedAgg(column="Topic_Title", aggfunc="nunique"),
        Sub_Topics=pd.NamedAgg(column="Sub_Topic", aggfunc="count"),
    )
    .reset_index()  # bring Main_Topic back as an ordinary column
)
summary

Unnamed: 0,Main_Topic,Units,Topics,Sub_Topics
0,ADDITIONAL ESSENTIAL TOPICS,5,10,70
1,ADVANCED AI & DATA TECHNOLOGIES,7,34,270
2,"CLOUD DEPLOYMENT: AWS, GCP & AZURE",8,22,157
3,COMPUTER SCIENCE FUNDAMENTALS,5,20,139
4,DATA PLATFORMS & VISUALIZATION,3,11,72
5,EMERGING TECHNOLOGIES & FRONTIER SKILLS,4,13,86
6,EXTENDED FULL-STACK & PRACTICAL SKILLS,4,12,75
7,GENERATIVE AI & AGENTIC SYSTEMS,16,21,84
8,MACHINE LEARNING,30,60,289
9,PRODUCT ENGINEERING,8,19,147


In [None]:
# Export the structured syllabus in two formats.
# NOTE(review): both destinations are absolute, machine-specific Windows paths.
output_excel = r"D:\Personal_App\AI_APP_V1\AI_ML_Syllabus_Structured.xlsx"
output_csv = r"D:\Personal_App\AI_APP_V1\AI_ML_Syllabus_Structured.csv"

# Excel workbook: a single sheet named "Syllabus", without the index column.
topic_df.to_excel(output_excel, sheet_name="Syllabus", index=False)

# CSV: "utf-8-sig" prefixes the file with a UTF-8 BOM; index column omitted.
topic_df.to_csv(output_csv, encoding="utf-8-sig", index=False)

# Confirm both destinations once the writes have completed.
print(f"✅ Excel saved: {output_excel}")
print(f"✅ CSV saved: {output_csv}")

✅ Excel saved: D:\Personal_App\AI_APP_V1\AI_ML_Syllabus_Structured.xlsx
✅ CSV saved: D:\Personal_App\AI_APP_V1\AI_ML_Syllabus_Structured.csv
