In [13]:
import os
import re
from collections import Counter, defaultdict

In [14]:
ROOT_PATH = r"C:\Users\BalasubramanianPG\Videos\Obsidian Vault\3. Interview Prep\Stratascratch"

# Canonical family normalization

In [15]:
FAMILY_CANONICAL_MAP = {
    "Aggregation Methods": "Aggregations",
    "Aggregate Functions": "Aggregations",
    "Aggregations": "Aggregations",

    "Boolean Indexing": "Filtering",
    "Filtering Data": "Filtering",

    "Column Manipulation": "Column Operations",
    "DataFrame Operations": "Column Operations",

    "Data Cleaning": "Data Cleaning",
    "Conditional Logic": "Conditional Logic",
}

DIFFICULTY_MAP = {"Easy", "Medium", "Hard", "Advanced"}

# HELPERS

In [16]:
def clean_yaml_value(value: str) -> str:
    """
    Normalize YAML scalar values:
    - Strip quotes
    - Remove escaped quotes
    - Trim whitespace
    """
    value = value.strip()
    value = value.strip('"').strip("'")
    value = value.replace('\\"', '').replace("\\'", '')
    return value

In [17]:
def extract_yaml_front_matter(content: str) -> str | None:
    match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
    return match.group(1) if match else None

In [18]:
def normalize_family(family: str) -> str:
    return FAMILY_CANONICAL_MAP.get(family, family)

In [19]:
def infer_difficulty_from_path(path: str) -> str:
    for part in path.split(os.sep):
        if part in DIFFICULTY_MAP:
            return part
    return "Unknown"

# Data Collection

In [20]:
topic_data = []

for current_root, _, files in os.walk(ROOT_PATH):
    for filename in files:
        if not filename.endswith(".md"):
            continue

        file_path = os.path.join(current_root, filename)

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        yaml_content = extract_yaml_front_matter(content)
        if not yaml_content:
            continue

        family_match = re.search(
            r'^topic_family:\s*(.+)$',
            yaml_content,
            re.MULTILINE
        )

        function_match = re.search(
            r'^topic_functions?:\s*(.+)$',   # supports topic_function & topic_functions
            yaml_content,
            re.MULTILINE
        )

        raw_family = clean_yaml_value(family_match.group(1)) if family_match else "UNKNOWN_FAMILY"
        raw_function = clean_yaml_value(function_match.group(1)) if function_match else "UNKNOWN_FUNCTION"

        family = normalize_family(raw_family)
        function = raw_function

        difficulty = infer_difficulty_from_path(current_root)

        topic_data.append({
            "filename": filename,
            "path": current_root,
            "difficulty": difficulty,
            "family": family,
            "function": function
        })

## Analysis

In [21]:
family_counter = Counter(item["family"] for item in topic_data)
function_counter = Counter(item["function"] for item in topic_data)
difficulty_counter = Counter(item["difficulty"] for item in topic_data)
combo_counter = Counter(
    (item["family"], item["function"], item["difficulty"])
    for item in topic_data
)

## Output

In [22]:
print("\nTopic Family Counts:")
print("-" * 30)
for family, count in family_counter.most_common():
    print(f'{family}: {count}')

print("\nTopic Function Counts:")
print("-" * 30)
for func, count in function_counter.most_common():
    print(f'{func}: {count}')

print("\nDifficulty Distribution:")
print("-" * 30)
for diff, count in difficulty_counter.items():
    print(f'{diff}: {count}')

print("\nTotal Files Processed:", len(topic_data))
print("Unique Topic Families:", len(family_counter))
print("Unique Topic Functions:", len(function_counter))

print("\nFamily → Function → Difficulty Breakdown:")
print("-" * 45)
for (family, func, diff), count in combo_counter.most_common():
    print(f'{family} → {func} → {diff}: {count}')



Topic Family Counts:
------------------------------
- "Aggregate Functions: 268
- "Aggregations: 64
- "Aggregation Methods: 62
- "Boolean Indexing: 35
- "Data Cleaning: 15
- "Column Manipulation: 13
- "Conditional Logic: 4
UNKNOWN_FAMILY: 3
- "Data Retrieval Basics: 2
- "Filtering Data: 2
- "DataFrame Operations: 2
- "Combining: 1
- "Distinct Counts: 1
- "Array Operation: 1
- "DataFrame Functions: 1
- "DataFrame Creation: 1

Topic Function Counts:
------------------------------
- "agg(): 114
- "alias(): 74
- "arrange(): 57
- "apply(): 31
- "col(): 26
- "avg(): 18
- "case when: 17
- "count(): 16
- "cast(): 9
- "as.Date(): 8
- "append(): 8
- "abs(): 7
- "anti_join(): 7
- "astype(): 6
- "as.integer(): 5
- "between(): 5
- "coalesce(): 4
- "and: 4
- "assign(): 4
- "distinct(): 4
- "bind_rows(): 3
- "ave(): 3
- "as.character(): 3
UNKNOWN_FUNCTION: 3
- "acos(): 2
- "array_length(): 2
- "group_by(): 2
- "as.POSIXct(): 2
- "contains(): 2
- "aggregate(): 2
- "add_prefix(): 2
- "filter(): 2
- "a