
### Core steps:
#### 1) text = title + gen ed categories + tags
#### 2) TF–IDF (1–2 grams) + cosine similarity to a user profile query
#### 3) Linear adjustments: +0.2*(GPA−threshold) + 0.15*gened_match
#### 4) Diversify with MMR
#### 5) Return top-N

In [5]:
from __future__ import annotations

import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


DATA_FILE = "gened_courses_with_avg_gpa.csv"  


def load_gened_courses(path: str) -> pd.DataFrame:
    """Load and prepare gen ed courses data."""
    try:
        df = pd.read_csv(path)
    except Exception:
        df = pd.read_csv(path, encoding="latin-1")


    df = df.rename(columns={
        'COURSE': 'code_key',
        'TITLE': 'title',
        'Average_GPA': 'gpa'
    })

  
    if 'subject' not in df.columns:
        df[['subject', 'number']] = df['code_key'].str.split(r'\s+', n=1, expand=True)

  
    gened_cols = ['ACP', 'CS', 'COMP1', 'HUM', 'NAT', 'QR', 'SBS']
    def make_gened(row):
        categories = []
        for col in gened_cols:
            if col in row and pd.notna(row[col]) and row[col] != '':
                categories.append(col)
        # Add cultural studies tags
        if 'US' in str(row.get('CS', '')):
            categories.append('US')
        if 'WCC' in str(row.get('CS', '')):
            categories.append('WCC')
        if 'NW' in str(row.get('CS', '')):
            categories.append('NW')
        return ' '.join(categories)

    df['gened'] = df.apply(make_gened, axis=1)

    # Create tags from keywords
    vocab = [
        "humanities", "social", "natural", "arts", "composition", "advanced", "lab",
        "cs", "computer", "programming", "math", "statistics", "finance", "economics",
        "history", "biology", "chemistry", "psychology", "design", "business",
        "communication", "media", "society", "writing", "data", "engineering",
        "science", "literature", "culture", "language", "philosophy", "ethics",
        "environment", "health", "education", "political", "gender", "race",
        "architecture", "music", "dance", "theater", "film", "art"
    ]

    def make_tags(row):
        txt = f"{row.get('title', '')} {row.get('gened', '')}".lower()
        return ", ".join(sorted({t for t in vocab if t in txt}))

    df['tags'] = df.apply(make_tags, axis=1)


    df['title'] = df['title'].fillna('')
    df['gened'] = df['gened'].fillna('')
    df['tags'] = df['tags'].fillna('')
    df['gpa'] = df['gpa'].fillna(3.0)  

    return df


courses = load_gened_courses(DATA_FILE)
print(f"\nLoaded {len(courses)} courses")
print(f"Courses with GPA data: {courses['gpa'].notna().sum()}")
print(f"\nSample courses:")
print(courses[['code_key', 'title', 'gpa', 'gened']].head(10))


Loaded 1420 courses
Courses with GPA data: 1420

Sample courses:
   code_key                                        title   gpa          gened
0   AAS 300      Theories of Race, Gender, and Sexuality  3.00            ACP
1   AAS 310                  Race and Cultural Diversity  3.00      ACP CS US
2   AAS 370                 Immigration, Law, and Rights  3.03            ACP
3   AAS 402                     Asian American Education  3.00      ACP CS US
4   ABE 469                   Capstone Design Experience  3.63            ACP
5    AE 443                  Aerospace Systems Design II  3.56            ACP
6  AFRO 310                  Race and Cultural Diversity  3.00      ACP CS US
7  AFRO 340                Dancing Black Popular Culture  3.00  ACP CS HUM US
8  AFRO 460                 Slavery in the United States  3.00            ACP
9  AFRO 474  The Black Liberation Movement, 1955-Present  3.00            ACP


# Build Text → TF–IDF
Create a text representation of each course combining title, gen ed categories, and tags.

In [6]:
def build_text(df: pd.DataFrame) -> pd.Series:
    """Build text corpus from course data."""
    return (
        df["title"].fillna("").astype(str) + " " +
        df["gened"].fillna("").astype(str) + " " +
        df["tags"].fillna("").astype(str)
    )

def fit_vectorizer(corpus: pd.Series) -> TfidfVectorizer:
    """Fit TF-IDF vectorizer on course corpus."""
    vec = TfidfVectorizer(
        max_df=0.7,
        min_df=2,
        ngram_range=(1, 2),
        stop_words="english"
    )
    vec.fit(corpus)
    return vec

corpus = build_text(courses)
vectorizer = fit_vectorizer(corpus)
X = vectorizer.transform(corpus)
print(f"TF–IDF matrix shape: {X.shape}")
print(f"Number of features (unique terms): {X.shape[1]}")

TF–IDF matrix shape: (1420, 1599)
Number of features (unique terms): 1599


# Recommender Function
Uses cosine similarity, GPA boosting, gen ed matching, and MMR diversification.

In [7]:
def recommend_courses(
    profile: Dict[str, any],
    df: pd.DataFrame,
    X_tfidf,
    vectorizer: TfidfVectorizer,
    topk: int = 20,
    mmr_lambda: float = 0.7
) -> pd.DataFrame:

    # Extract profile parameters
    interests = profile.get("interests", "")
    gened_prefs = profile.get("gened_preferences", [])
    min_gpa = profile.get("min_gpa", 3.0)
    avoid_subjects = profile.get("avoid_subjects", [])

  
    query_vec = vectorizer.transform([interests])
    sims = cosine_similarity(query_vec, X_tfidf).flatten()

   
    gpa_boost = np.where(
        df["gpa"] >= min_gpa,
        0.2 * (df["gpa"] - min_gpa),
        0.0
    )

 
    gened_boost = np.zeros(len(df))
    if gened_prefs:
        for i, row in df.iterrows():
            gened_str = str(row.get("gened", "")).upper()
            matches = sum(1 for pref in gened_prefs if pref.upper() in gened_str)
            gened_boost[i] = 0.15 * matches

    # avoided subjects
    subject_penalty = np.zeros(len(df))
    if avoid_subjects:
        for i, row in df.iterrows():
            subj = str(row.get("subject", "")).upper()
            if subj in [s.upper() for s in avoid_subjects]:
                subject_penalty[i] = -0.5

    # Combined score
    scores = sims + gpa_boost + gened_boost + subject_penalty

    # MMR diversification
    def mmr_diversify(scores, X, topk, lambda_param=0.7):
        """Maximal Marginal Relevance for diversity."""
        selected = []
        candidates = np.arange(len(scores))

     
        first = np.argmax(scores)
        selected.append(first)
        candidates = np.delete(candidates, np.where(candidates == first))

        while len(selected) < topk and len(candidates) > 0:
            # Calculate MMR score for each candidate
            mmr_scores = []
            for c in candidates:
                relevance = scores[c]
              
                if len(selected) > 0:
                    sims_to_selected = cosine_similarity(
                        X[c:c+1], X[selected]
                    ).flatten()
                    max_sim = np.max(sims_to_selected)
                else:
                    max_sim = 0

                mmr = lambda_param * relevance - (1 - lambda_param) * max_sim
                mmr_scores.append(mmr)

            # Select best MMR
            best_idx = np.argmax(mmr_scores)
            best_candidate = candidates[best_idx]
            selected.append(best_candidate)
            candidates = np.delete(candidates, best_idx)

        return selected

    # Get diverse top-k
    top_indices = mmr_diversify(scores, X_tfidf, topk, mmr_lambda)

   
    result = df.iloc[top_indices].copy()
    result["score"] = scores[top_indices]
    result = result.sort_values("score", ascending=False)

    
    display_cols = ['code_key', 'title', 'gened', 'gpa', 'tags', 'score']
    available_cols = [c for c in display_cols if c in result.columns]

    return result[available_cols]



In [4]:

example_profile = {
    "interests": "psychology society culture politics economics behavior",
    "gened_preferences": ["HUM", "CS"],
    "min_gpa": 3.3,
    "avoid_subjects": ["BTW", "CEE"]
}


recs = recommend_courses(
    example_profile,
    courses,
    X,
    vectorizer,
    topk=20,
    mmr_lambda=0.7
)

print("\n" + "="*80)
print("TOP COURSE RECOMMENDATIONS")
print("="*80)
print(f"\nBased on interests: {example_profile['interests']}")
print(f"Preferred gen eds: {', '.join(example_profile['gened_preferences'])}")
print(f"Minimum GPA: {example_profile['min_gpa']}")
print(f"Avoiding subjects: {', '.join(example_profile['avoid_subjects'])}")
print("\n")


for idx, row in recs.head(20).iterrows():
    print(f"{row['code_key']:15} | GPA: {row['gpa']:.2f} | Score: {row['score']:.3f}")
    print(f"  {row['title']}")
    if row['gened']:
        print(f"  Gen Ed: {row['gened']}")
    print()

print("\n" + "="*80)
print("FULL RECOMMENDATIONS TABLE")
print("="*80)
recs


TOP COURSE RECOMMENDATIONS

Based on interests: psychology society culture politics economics behavior
Preferred gen eds: HUM, CS
Minimum GPA: 3.3
Avoiding subjects: BTW, CEE


EPSY 201        | GPA: 3.78 | Score: 0.567
  Educational Psychology
  Gen Ed: SBS

REL 108         | GPA: 3.78 | Score: 0.561
  Religion & Society in West I
  Gen Ed: CS HUM WCC

CLCV 250        | GPA: 3.72 | Score: 0.515
  Sports and Society in Greece and Rome
  Gen Ed: CS HUM WCC

UKR 113         | GPA: 3.79 | Score: 0.513
  Ukrainian Culture
  Gen Ed: CS HUM WCC

REL 208         | GPA: 3.93 | Score: 0.471
  Cultures & Literatures of South Asia
  Gen Ed: CS HUM NW

ACE 255         | GPA: 3.62 | Score: 0.452
  Economics of Food and Environmental Justice
  Gen Ed: CS SBS US

AFRO 228        | GPA: 3.72 | Score: 0.448
  Hip Hop Music: History and Culture
  Gen Ed: CS HUM US

HIST 268        | GPA: 3.50 | Score: 0.441
  Biology and Society from Darwin to the Human Genome
  Gen Ed: CS HUM WCC

CLCV 224        | GP

Unnamed: 0,code_key,title,gened,gpa,tags,score
1400,EPSY 201,Educational Psychology,SBS,3.78,"education, psychology",0.567151
631,REL 108,Religion & Society in West I,CS HUM WCC,3.78,"cs, society",0.560755
502,CLCV 250,Sports and Society in Greece and Rome,CS HUM WCC,3.72,"cs, society",0.514911
655,UKR 113,Ukrainian Culture,CS HUM WCC,3.79,"cs, culture",0.512823
453,REL 208,Cultures & Literatures of South Asia,CS HUM NW,3.93,"cs, culture, literature",0.470885
158,ACE 255,Economics of Food and Environmental Justice,CS SBS US,3.62,"cs, economics, environment",0.452145
174,AFRO 228,Hip Hop Music: History and Culture,CS HUM US,3.72,"cs, culture, history, music",0.448327
586,HIST 268,Biology and Society from Darwin to the Human G...,CS HUM WCC,3.5,"biology, cs, society",0.440865
199,CLCV 224,Greco-Roman Antiquity and US Minority Cultures,CS HUM US,3.8,"cs, culture",0.439937
1048,PS 170,"Power, Politics, and Protest",HUM,3.68,cs,0.429565
