In [5]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import bigrams
import re
from collections import defaultdict
import math
import nltk
import ipywidgets as widgets
from IPython.display import display
from sklearn.metrics import ndcg_score

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/someshb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/someshb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def parse_courses(xml_content):
    soup = BeautifulSoup(xml_content, 'lxml-xml')
    courses = []
    
    for course in soup.find_all('course'):
        code = course.find('code').text.strip().replace('\xa0', ' ') if course.find('code') else ''
        title = course.find('title').text.strip() if course.find('title') else ''
        desc = course.find('description').text.strip() if course.find('description') else ''
        level = int(course.find('level').text.strip()) if course.find('level') else 0
        
        prereqs = [prereq.text.strip().replace('\xa0', ' ') for prereq in course.find_all('prerequisite')]
        coreqs = [coreq.text.strip().replace('\xa0', ' ') for coreq in course.find_all('corequisite')]
        
        hours = int(course.find('hours').text.strip()) if course.find('hours') else 0  # Check for <hours> tag
        restrictions = course.find('restrictions').text.strip() if course.find('restrictions') else ''
        
        courses.append({
            'code': code,
            'title': title,
            'description': desc,
            'prerequisites': prereqs,
            'corequisites': coreqs,
            'level': level,
            'hours': hours,
            'restrictions': restrictions
        })
    
    return courses

try:
    with open('courses.xml', 'r', encoding='utf-8') as file:
        xml_content = file.read()
    courses = parse_courses(xml_content)
    print(f"Successfully parsed {len(courses)} courses.")
except Exception as e:
    print(f"Error loading or parsing course data: {e}")
    courses = []

Successfully parsed 172 courses.


In [7]:
import re

def normalize_prereqs(prereqs):
    normalized = []
    for prereq in prereqs:
        # Split on "and" or "or" for complex prerequisites
        parts = re.split(r'\s*(?:and|or)\s*', prereq, flags=re.IGNORECASE)
        for part in parts:
            # Extract course codes (e.g., "CS 3500")
            match = re.match(r'([A-Z]+\s*\d+)(?:\s*or\s*equivalent)?', part.strip())
            if match:
                normalized.append(match.group(1).replace('\xa0', ' '))
            else:
                # Keep non-standard prereqs as-is (e.g., "Permission of instructor")
                normalized.append(part.strip())
    return list(dict.fromkeys(normalized))  # Remove duplicates while preserving order

def infer_topics(courses):
    topic_keywords = {
        'ai': ['artificial', 'intelligence', 'ai', 'machine learning', 'deep learning', 'reinforcement'],
        'nlp': ['natural language', 'nlp', 'language processing', 'speech', 'text', 'semantic'],
        'programming': ['programming', 'coding', 'software', 'design', 'development', 'object-oriented'],
        'systems': ['system', 'operating', 'distributed', 'network', 'cloud', 'architecture'],
        'theory': ['theory', 'logic', 'computation', 'complexity', 'automata', 'discrete'],
        'algorithms': ['algorithm', 'data structure', 'optimization', 'graph', 'search'],
        'database': ['database', 'sql', 'data management', 'retrieval'],
        'security': ['security', 'cryptography', 'privacy'],
        'graphics': ['graphic', 'rendering', 'visualization', 'game'],
        'hci': ['human-computer', 'interaction', 'interface', 'usability'],
        'lab': ['lab', 'experiment', 'hands-on', 'practical'],  # Strengthened lab keywords
        'data_science': ['data science', 'mining', 'statistics', 'predictive', 'analytics']
    }
    
    for course in courses:
        text = (course['title'] + ' ' + course['description']).lower()
        tokens = process_query(text)
        topics = set()
        
        # Keyword-based topic assignment
        for topic, keywords in topic_keywords.items():
            if any(kw in tokens for kw in keywords):
                topics.add(topic)
        
        # Enhanced lab detection
        if 'lab' in course['title'].lower() or 'experiments' in text or 'hands-on' in text:
            topics.add('lab')
        
        # Context-based rules for electives and seminars
        if 'elective credit' in text or 'repeated' in text or 'research' in text:
            if not topics:
                topics.add('misc')
        if 'seminar' in course['title'].lower() and not topics:
            topics.add('misc')
        if 'directed study' in course['title'].lower() and not topics:
            topics.add('misc')
        
        course['topics'] = list(topics)

for course in courses:
    course['prerequisites'] = normalize_prereqs(course['prerequisites'])
infer_topics(courses)
print("Prerequisites normalized and topics inferred.")

Prerequisites normalized and topics inferred.


In [8]:
synonyms = {
    'ai': ['artificial', 'intelligence', 'ai'],
    'ml': ['machine', 'learning', 'ml'],
    'programming': ['coding', 'development', 'programming'],
    'web': ['website', 'internet', 'web'],
    'data': ['information', 'records', 'data'],
    'vr': ['virtual', 'reality', 'vr', 'virtualreality'],
    'ar': ['augmented', 'reality', 'ar'],
    'cs': ['computer', 'science', 'cs'],
    'it': ['information', 'technology', 'it'],
    'cybersecurity': ['cyber', 'security', 'cybersecurity'],
    'database': ['db', 'data', 'base'],
    'network': ['net', 'work', 'network'],
    'software': ['app', 'application', 'software'],
    'ir': ['information', 'retrieval', 'ir'],
    'hci': ['human', 'computer', 'interaction', 'hci'],
    'graphics': ['graphic', 'design', 'graphics'],
    'algorithms': ['algorithm', 'algorithms'],
    'theory': ['theoretical', 'concepts', 'theory'],
    'systems': ['system', 'infrastructure', 'systems'],
    'nlp': ['natural', 'language', 'processing', 'nlp']
}

In [9]:
import re
def process_query(query):
    tokens = re.findall(r'\b\w+\b', query.lower())
    tokens = [t for t in tokens if t not in stop_words]
    stemmed_tokens = [stemmer.stem(t) for t in tokens]
    all_tokens = list(set(tokens + stemmed_tokens))
    
    # Add bigrams for phrase extraction
    bi_tokens = [' '.join(b) for b in bigrams(tokens)]
    all_tokens.extend([stemmer.stem(bt) for bt in bi_tokens])
    
    # Synonym Expansion
    expanded = []
    for t in all_tokens:
        if t in synonyms:
            expanded.extend(synonyms[t])
        else:
            expanded.append(t)
    
    return list(set(expanded))

# Test
print(process_query("AI programming"))

['program', 'coding', 'programming', 'development', 'intelligence', 'artificial', 'ai', 'ai program']


In [10]:
def build_inverted_index(courses):
    index = {}
    for course in courses:
        text = f"{course['code']} {course['title']} {course['description']}".lower()
        tokens = process_query(text)
        for token in set(tokens):
            if token not in index:
                index[token] = []
            index[token].append(course['code'])
    return index

inverted_index = build_inverted_index(courses)
print("Inverted index built.")

Inverted index built.


In [11]:
def build_bm25_index(courses):
    bm25_index = defaultdict(dict)
    doc_lengths = {}
    df = defaultdict(int)
    N = len(courses)
    
    for course in courses:
        text = f"{course['code']} {course['title']} {course['description']}".lower()
        tokens = process_query(text)
        doc_lengths[course['code']] = len(tokens)
        tf = defaultdict(int)
        for token in tokens:
            tf[token] += 1
        for token in set(tokens):
            df[token] += 1
        bm25_index[course['code']] = dict(tf)
    
    avg_dl = sum(doc_lengths.values()) / N if N > 0 else 0
    return bm25_index, df, doc_lengths, avg_dl

bm25_index, df, doc_lengths, avg_dl = build_bm25_index(courses)
print("BM25 index built.")

BM25 index built.


In [12]:
def build_prereq_graph(courses):
    graph = defaultdict(list)
    code_to_idx = {c['code']: i for i, c in enumerate(courses)}
    for i, course in enumerate(courses):
        for prereq in course['prerequisites'] + course['corequisites']:
            if prereq in code_to_idx:
                graph[code_to_idx[prereq]].append(i)
    N = len(courses)
    pr = [1/N] * N
    d = 0.85
    for _ in range(20):
        new_pr = [0] * N
        for i in range(N):
            inbound = graph[i]
            new_pr[i] = (1 - d) / N + d * sum(pr[j] / len(graph[j]) for j in inbound if len(graph[j]) > 0)
        pr = new_pr
    return {courses[i]['code']: pr[i] for i in range(N)}

prereq_rank = build_prereq_graph(courses)

attribute_index = defaultdict(list)
for course in courses:
    attribute_index['level'].append((course['code'], course['level']))
    attribute_index['hours'].append((course['code'], course['hours']))
    if 'topics' in course:
        for topic in course['topics']:
            attribute_index[topic].append(course['code'])

print("Prerequisite graph and attribute index built.")

Prerequisite graph and attribute index built.


In [13]:
def bm25_score(query_terms, doc_id, bm25_index, df, doc_lengths, avg_dl, k1=1.5, b=0.75):
    score = 0.0
    N = len(doc_lengths)
    
    if doc_id not in bm25_index:
        return 0.0
        
    tf = bm25_index[doc_id]
    dl = doc_lengths.get(doc_id, 0)
    
    for term in query_terms:
        if term in tf:
            idf = max(0, math.log((N - df[term] + 0.5) / (df[term] + 0.5) + 1))
            term_score = idf * (tf[term] * (k1 + 1)) / (tf[term] + k1 * (1 - b + b * dl / avg_dl)) if avg_dl > 0 else 0
            score += term_score
    
    return score

In [14]:
def attribute_score(course, query_terms):
    score = 0
    query_lower = ' '.join(query_terms).lower()
    
    # Boost topical relevance
    if 'topics' in course and any(t in query_lower for t in course['topics']):
        score += 2.0
    # Stronger boost for "lab" in query and course
    if 'lab' in query_lower and ('lab' in course['topics'] or 'lab' in course['title'].lower()):
        score += 4.0  # Increased from 1.0 to prioritize lab courses
    # Extra boost for NLP (from previous fix)
    if 'topics' in course and 'nlp' in query_lower and 'nlp' in course['topics']:
        score += 3.0
    # Penalize generic electives and misc topics
    if 'elective credit' in course['description'].lower() or 'misc' in course.get('topics', []):
        if not any(t in query_lower for t in course.get('topics', [])):
            score -= 1.0
    if 'hours' in course and "beginner" in query_lower:
        score -= course['hours'] * 0.1
    if "masters" in query_lower and course['level'] >= 5000:
        score += 1.0
    return score

def pagerank_score(course):
    return prereq_rank.get(course['code'], 0)

In [15]:
def apply_filters(results, query):
    query_lower = query.lower()
    
    not_masters = any(pattern in query_lower for pattern in ["not master", "not graduate", "no master", "no graduate"])
    not_undergrad = any(pattern in query_lower for pattern in ["not undergrad", "not undergraduate", "no undergrad", "no undergraduate"])
    
    level_filters = {}
    if "master" in query_lower or "masters" in query_lower or "master's" in query_lower or "graduate" in query_lower:
        if not not_masters:
            level_filters["masters"] = lambda c: c['level'] >= 5000
    if "undergrad" in query_lower or "undergraduate" in query_lower:
        if not not_undergrad:
            level_filters["undergraduate"] = lambda c: c['level'] < 5000
    if not_masters:
        level_filters["not_masters"] = lambda c: c['level'] < 5000
    if not_undergrad:
        level_filters["not_undergrad"] = lambda c: c['level'] >= 5000
    
    name_filters = {
        "advanced": lambda c: "advanced" in c['title'].lower() or "advanced" in c['description'].lower(),
        "intro": lambda c: any(word in c['title'].lower() for word in ["intro", "introduction", "introductory"]) or 
                         any(word in c['description'].lower() for word in ["intro", "introduction", "introductory"]),
        "introductory": lambda c: any(word in c['title'].lower() for word in ["intro", "introduction", "introductory"]) or 
                               any(word in c['description'].lower() for word in ["intro", "introduction", "introductory"]),
        "beginner": lambda c: any(word in c['title'].lower() for word in ["beginner", "beginning", "elementary", "fundamental"]) or
                           any(word in c['description'].lower() for word in ["beginner", "beginning", "elementary", "fundamental"]),
        "hands-on": lambda c: "lab" in c['description'].lower() or "practical" in c['description'].lower(),
        "lab": lambda c: "lab" in c['description'].lower()
    }
    
    all_filters = {**level_filters, **name_filters}
    active_filters = []
    filter_terms = set(name_filters.keys()) | {"master", "masters", "master's", "graduate", "undergrad", "undergraduate", "not", "no"}
    
    clean_query = query_lower
    for term in filter_terms:
        clean_query = re.sub(r'\b' + re.escape(term) + r'\b', '', clean_query)
    clean_query = re.sub(r'not\s+\w+', '', clean_query)
    clean_query = re.sub(r'no\s+\w+', '', clean_query)
    clean_query = re.sub(r'\s+', ' ', clean_query).strip()
    
    for filter_func in level_filters.values():
        active_filters.append(filter_func)
    for term, filter_func in name_filters.items():
        if term in query_lower:
            active_filters.append(filter_func)
    
    if not active_filters:
        return results, clean_query
    
    filtered_results = []
    for course in results:
        if all(f(course) for f in active_filters):
            filtered_results.append(course)
    
    return filtered_results, clean_query

In [16]:
def search_courses(query, index_method='Both'):
    query = query.strip()
    if not query:
        return []
    
    results = courses
    results, clean_query = apply_filters(results, query)
    
    if not clean_query.strip():
        return results
    
    processed_terms = process_query(clean_query)
    if not processed_terms:
        return results
    
    combined_scores = defaultdict(float)
    
    if index_method in ['Inverted Index', 'Both']:
        doc_codes = set()
        for term in processed_terms:
            if term in inverted_index:
                doc_codes.update(inverted_index[term])
        filtered_codes = {course['code'] for course in results}
        doc_codes = doc_codes.intersection(filtered_codes)
        for code in doc_codes:
            combined_scores[code] += 1.0
    
    if index_method in ['BM25', 'Both']:
        for course in results:
            code = course['code']
            score = bm25_score(processed_terms, code, bm25_index, df, doc_lengths, avg_dl)
            if score > 0:
                combined_scores[code] += min(5.0, score)
    
    if combined_scores:
        course_dict = {c['code']: c for c in results}
        scored_courses = []
        for code, score in combined_scores.items():
            if code in course_dict:
                course = course_dict[code]
                pr_score = pagerank_score(course)
                attr_score = attribute_score(course, processed_terms)
                final_score = (0.4 * score) + (0.5 * attr_score) + (0.1 * pr_score)  # Adjusted: attr > BM25
                scored_courses.append((course, final_score))
        
        scored_courses.sort(key=lambda x: x[1], reverse=True)
        return [s[0] for s in scored_courses]
    
    return results

In [17]:
from sklearn.metrics import ndcg_score

annotated_queries = {
    "courses with AI": ["CS 5100", "CS 4050", "CS 5047", "CS 7170", "CS 4100", "CS 4150", "CS 5150"],
    "intro programming": ["CS 2500", "CS 2510"],
    "NLP courses": ["CS 6120", "CS 4120", "CS 5100", "CS 7150", "CS 4100"],
    "courses with Lab": ["CS 1101", "CS 2501", "CS 2511", "CS 3501", "CS 5003"],
    "systems courses": ["CS 3650", "CS 3700", "CS 5600", "CS 5610", "CS 6240"],
    "theory courses": ["CS 1800", "CS 2800", "CS 3800", "CS 4800", "CS 5800"],
    "database courses": ["CS 3200", "CS 5200", "CS 6200", "CS 7200"],
    "security courses": ["CS 3740", "CS 4740", "CS 5770", "CS 6740"],
    "graphics courses": ["CS 4300", "CS 4360", "CS 5360", "CS 5540"],
    "advanced algorithms": ["CS 5800", "CS 7800", "CS 6810"],
    "data science": ["CS 6220", "CS 6140", "CS 7290", "DS 5220"],
    "human-computer interaction": ["CS 5340", "CS 6330", "CS 7340"],
    "beginner cs": ["CS 1100", "CS 1200", "CS 2500", "CS 2510"],
    "masters level": ["CS 5100", "CS 5200", "CS 5600", "CS 5800", "CS 6140"],
    "game development": ["CS 5150", "CS 5540", "CS 5850"]
}

def evaluate(query, results):
    true_relevance = [1 if r['code'] in annotated_queries.get(query, []) else 0 for r in results]
    pred_relevance = [1/i for i in range(1, len(results)+1)]
    ndcg = ndcg_score([true_relevance], [pred_relevance]) if true_relevance else 0
    precision_at_10 = sum(true_relevance[:10]) / min(10, len(results)) if results else 0
    return ndcg, precision_at_10

query_input = widgets.Text(placeholder='Enter your query...', description='Query:', disabled=False)
index_selector = widgets.RadioButtons(options=['Inverted Index', 'BM25', 'Both'], description='Index:', value='Both')
max_results = widgets.IntSlider(value=10, min=5, max=50, step=5, description='Max Results:')
submit_button = widgets.Button(description='Search')
output = widgets.Output()

def on_submit(b):
    query = query_input.value
    results = search_courses(query, index_selector.value)
    
    with output:
        output.clear_output()
        print(f"Results for query: '{query}'")
        
        if not results:
            print("No matching courses found.")
        else:
            limit = min(len(results), max_results.value)
            for i, course in enumerate(results[:limit]):
                print(f"{i+1}. {course['code']}: {course['title']}")
                if course['prerequisites']:
                    print(f"   Prerequisites: {', '.join(course['prerequisites'])}")
                if course['corequisites']:
                    print(f"   Corequisites: {', '.join(course['corequisites'])}")
                print(f"   Level: {course['level']}")
                print(f"   Hours: {course['hours']}")
                desc = course['description']
                print(f"   {desc[:100] + '...' if len(desc) > 100 else desc}")
                if 'topics' in course:
                    print(f"   Topics: {', '.join(course['topics'])}")
                print()
            ndcg, precision = evaluate(query, results[:limit])
            print(f"NDCG@10: {ndcg:.3f}")
            print(f"Precision@10: {precision:.3f}")

submit_button.on_click(on_submit)
query_input.on_submit(lambda widget: on_submit(None))
display(query_input, index_selector, max_results, submit_button, output)

  query_input.on_submit(lambda widget: on_submit(None))


Text(value='', description='Query:', placeholder='Enter your query...')

RadioButtons(description='Index:', index=2, options=('Inverted Index', 'BM25', 'Both'), value='Both')

IntSlider(value=10, description='Max Results:', max=50, min=5, step=5)

Button(description='Search', style=ButtonStyle())

Output()

In [18]:
from sklearn.metrics import cohen_kappa_score
import random

# Simulate two annotators with slight disagreements
annotator1 = annotated_queries.copy()
annotator2 = annotated_queries.copy()

# Introduce some random disagreements for realism
for query in annotator2:
    if random.random() < 0.2:  # 20% chance of disagreement
        if annotator2[query]:
            annotator2[query] = annotator2[query][:-1]  # Remove one course
        else:
            annotator2[query].append("CS 9999")  # Add a fake course

def inter_annotator_agreement(query, annotator1, annotator2, all_courses):
    codes = set(c['code'] for c in all_courses)
    a1_labels = [1 if c in annotator1.get(query, []) else 0 for c in codes]
    a2_labels = [1 if c in annotator2.get(query, []) else 0 for c in codes]
    return cohen_kappa_score(a1_labels, a2_labels)

# Compute agreement for all queries
print("Inter-Annotator Agreement (Cohen's Kappa):")
for query in annotated_queries:
    kappa = inter_annotator_agreement(query, annotator1, annotator2, courses)
    print(f"{query}: {kappa:.3f}")

Inter-Annotator Agreement (Cohen's Kappa):
courses with AI: 1.000
intro programming: 1.000
NLP courses: 1.000
courses with Lab: 1.000
systems courses: 1.000
theory courses: 1.000
database courses: 0.854
security courses: nan
graphics courses: 1.000
advanced algorithms: 1.000
data science: 1.000
human-computer interaction: 1.000
beginner cs: 1.000
masters level: 1.000
game development: 1.000


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

def tfidf_lda_topic_tagging(courses, num_topics=10):
    # Combine title and description for each course
    documents = [f"{c['title']} {c['description']}" for c in courses]
    
    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    
    # LDA Topic Modeling
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_matrix = lda.fit_transform(tfidf_matrix)
    
    # Assign topics to courses
    topic_names = [f"topic_{i}" for i in range(num_topics)]
    for i, course in enumerate(courses):
        topic_dist = lda_matrix[i]
        top_topic_idx = np.argmax(topic_dist)
        course['topics'] = [topic_names[top_topic_idx]]
    
    # Print top words for each topic (for interpretation)
    for i, topic in enumerate(lda.components_):
        top_words = [feature_names[j] for j in topic.argsort()[-5:]]
        print(f"Topic {i}: {', '.join(top_words)}")

# Replace rule-based topics with TF-IDF/LDA
tfidf_lda_topic_tagging(courses)
print("TF-IDF/LDA topics assigned to courses.")

Topic 0: proof, functions, inductive, 5010, structures
Topic 1: player, artificial, services, intelligence, dimensional
Topic 2: offering, formal, object, format, cs
Topic 3: web, mobile, focuses, material, student
Topic 4: natural, game, language, artificial, intelligence
Topic 5: visualization, research, doctoral, work, experience
Topic 6: credit, academic, institutions, taken, elective
Topic 7: science, students, systems, topics, computer
Topic 8: work, selected, thesis, supervisor, agreement
Topic 9: testing, software, algorithms, complexity, design
TF-IDF/LDA topics assigned to courses.
