In [None]:
from __future__ import annotations

import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Set

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Machine-specific absolute directory that holds the two input CSV files.
_RAW_DATA_DIR = "/Users/melinanguyen/Documents/CS124-HONOR/FA25-Group11/Project/data_scraping/raw_data"

# Club listing, read by load_clubs().
CLUBS_FILE = f"{_RAW_DATA_DIR}/clubs.csv"
# Tag/category listing, read by load_tag_categories().
TAGS_FILE = f"{_RAW_DATA_DIR}/clubs_by_tags.csv"


def load_tag_categories(tags_path: str) -> Dict[str, List[str]]:
    """
    Read the tag CSV at ``tags_path`` and index its rows by tag name.

    Each key is a value taken from the file's 'tag' column; each value is the
    complete CSV row (a pandas Series) for that tag. When the same tag occurs
    on more than one row, the last row replaces the earlier ones.
    """
    table = pd.read_csv(tags_path)

    # Later rows overwrite earlier ones that share the same tag.
    return {record['tag']: record for _, record in table.iterrows()}


def merge_related_tags(user_interests: List[str], tag_categories: Dict) -> Set[str]:
    """
    Expand free-text interests into a set of official category names.

    Two sources contribute to the result:

    1. Keyword rules: every word in ``user_interests`` is compared with the
       keywords below. A keyword fires when some word *starts with* it, so
       'sports' and 'athletics' both select 'Athletic & Recreation' and
       'Club Sports', while unrelated words that merely contain a keyword
       ('party' contains 'art', 'system' contains 'stem') do not.
    2. Direct matches: an interest that names a key of ``tag_categories`` is
       included as-is. The comparison ignores case and surrounding
       whitespace, and the key's own spelling is what gets returned.

    Args:
        user_interests: Words and/or tag names supplied by the user. Entries
            are converted with ``str``; ``None`` is treated as empty.
        tag_categories: Container whose keys are the official tag names
            (only its keys are used here).

    Returns:
        The set of category names selected by either mechanism.
    """
    tag_rules = {
        # Sports-related
        'sports': ['Athletic & Recreation', 'Club Sports'],
        'sport': ['Athletic & Recreation', 'Club Sports'],
        'athletic': ['Athletic & Recreation', 'Club Sports'],
        'fitness': ['Athletic & Recreation', 'Club Sports'],
        'exercise': ['Athletic & Recreation', 'Club Sports'],
        'gym': ['Athletic & Recreation', 'Club Sports'],

        # Academic/Professional
        'business': ['Business'],
        'professional': ['Business'],
        'career': ['Business'],
        'engineering': ['Engineering & Mathematics'],
        'math': ['Engineering & Mathematics'],

        # Arts/Performance
        'art': ['Media Arts', 'Performance Arts'],
        'music': ['Performance Arts'],
        'dance': ['Performance Arts'],
        'theater': ['Performance Arts'],
        'theatre': ['Performance Arts'],
        'performance': ['Performance Arts'],
        'creative': ['Media Arts', 'Performance Arts'],

        # Community/Service
        'volunteer': ['Community Service & Philanthropy'],
        'service': ['Community Service & Philanthropy'],
        'community': ['Community Service & Philanthropy'],
        'philanthropy': ['Community Service & Philanthropy'],
        'social': ['Social & Leisure', 'Community Service & Philanthropy'],

        # STEM/Tech
        'technology': ['Technology'],
        'tech': ['Technology'],
        'computer': ['Technology', 'Information & Data Sciences'],
        'programming': ['Technology'],
        'coding': ['Technology'],
        'data': ['Information & Data Sciences'],
        'science': ['Life & Physical Sciences', 'Social & Behavioral Sciences'],
        'stem': ['Technology', 'Engineering & Mathematics', 'Life & Physical Sciences'],

        # Culture/Identity
        'culture': ['Identity & Culture'],
        'cultural': ['Identity & Culture'],
        'identity': ['Identity & Culture'],
        'international': ['International'],

        # Health/Medical
        'health': ['Health & Wellness', 'Health & Human Sciences'],
        'medical': ['Health & Wellness'],
        'medicine': ['Health & Wellness'],
        'wellness': ['Health & Wellness'],

        # Activism/Advocacy
        'activism': ['Advocacy & Activism'],
        'advocacy': ['Advocacy & Activism'],
        'justice': ['Advocacy & Activism'],

        # Education
        'education': ['Education', 'Pedagogy & Instruction'],
        'teaching': ['Education', 'Pedagogy & Instruction'],

        # Religion/Faith
        'faith': ['Faith', 'Religion & Spirituality'],
        'religion': ['Religion & Spirituality'],
        'spiritual': ['Religion & Spirituality'],

        # Greek Life
        'greek': ['Social Fraternities & Sororities'],
        'fraternity': ['Social Fraternities & Sororities'],
        'sorority': ['Social Fraternities & Sororities'],

        # Other
        'environment': ['Environmental & Sustainability'],
        'sustainability': ['Environmental & Sustainability'],
        'politics': ['Ideology & Politics'],
        'political': ['Ideology & Politics'],
        'law': ['Law'],
    }

    interest_texts = [str(item) for item in (user_interests or [])]
    categories = tag_categories if tag_categories is not None else {}

    # Break the interests into lowercase alphanumeric words so keywords are
    # compared against whole-word prefixes rather than arbitrary substrings.
    lowered = ' '.join(interest_texts).lower()
    words = ''.join(ch if ch.isalnum() else ' ' for ch in lowered).split()

    merged_tags = set()
    for keyword, tags in tag_rules.items():
        if any(word.startswith(keyword) for word in words):
            merged_tags.update(tags)

    # Case-insensitive lookup from a normalised name to the official spelling.
    official_names = {str(name).strip().casefold(): name for name in categories}
    for interest in interest_texts:
        if interest in categories:
            # Exact hit: keep the caller's string, which equals the key.
            merged_tags.add(interest)
            continue
        official = official_names.get(interest.strip().casefold())
        if official is not None:
            merged_tags.add(official)

    return merged_tags


def load_clubs(path: str) -> pd.DataFrame:
    """
    Load the club CSV and prepare its text columns.

    The file is read with pandas' default encoding first; only if that fails
    with a decoding error is it re-read as latin-1. Other failures (missing
    file, malformed CSV) propagate unchanged instead of triggering a
    pointless second read.

    Missing values in 'title', 'mission', 'tags', 'membership_benefits' and
    'website' are replaced by empty strings, and a 'clean_tags' column is
    derived from 'tags'.

    Args:
        path: Location of the club CSV file.

    Returns:
        The loaded DataFrame with the added 'clean_tags' column.

    Raises:
        KeyError: If one of the five text columns is absent from the file.
    """
    try:
        df = pd.read_csv(path)
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this read cannot fail
        # on decoding.
        df = pd.read_csv(path, encoding="latin-1")

    # Normalise the free-text columns so later string handling never sees NaN.
    for column in ('title', 'mission', 'tags', 'membership_benefits', 'website'):
        df[column] = df[column].fillna('')

    def extract_categories(tag_str):
        """Return the text after ' - ' on each kept line, joined by spaces."""
        if pd.isna(tag_str) or tag_str == '':
            return ''

        categories = []
        for line in str(tag_str).split('\n'):
            # Lines mentioning 'Student Organization' are skipped entirely.
            if 'Student Organization' in line:
                continue
            # Keep only what follows the first ' - '; lines without that
            # separator contribute nothing.
            _, separator, remainder = line.partition(' - ')
            if separator:
                categories.append(remainder.strip())

        return ' '.join(categories)

    df['clean_tags'] = df['tags'].apply(extract_categories)

    return df


# Read the club table and report how many rows it holds.
clubs = load_clubs(CLUBS_FILE)
print(f"Loaded {len(clubs)} clubs\n")

# Read the tag index and show the first ten tag names as a sanity check.
tag_categories = load_tag_categories(TAGS_FILE)
print(f"Loaded {len(tag_categories)} tag categories")
first_category_names = list(tag_categories)[:10]
print(f"\nAvailable categories: {first_category_names}...\n")

# Preview the derived 'clean_tags' column next to each club title.
print("Sample clubs:")
print(clubs.head(10)[['title', 'clean_tags']])

Loaded 1134 clubs

Loaded 37 tag categories

Available categories: ['Advocacy & Activism', 'Agricultural', 'Athletic & Recreation', 'Blue Student Organization', 'Business', 'Club Sports', 'Community Service & Philanthropy', 'Education', 'Engineering & Mathematics', 'Environmental & Sustainability']...

Sample clubs:
                                               title  \
0  4 Paws for Ability at the University of Illino...   
1                     4-H House Cooperative Sorority   
2                                            A Space   
3                                  Acacia Fraternity   
4     Academic Buzzer Team: A Quiz Bowl Organization   
5                                    Accounting Club   
6  ACHA Men's Division II Hockey Club at the Univ...   
7            Acoustics at the University of Illinois   
8  ACS Polymer and Polymer Materials Science and ...   
9                                           ActGreen   

                                          clean_tags  
0  Advocac

# Build Text → TF–IDF

In [7]:
def build_text(df: pd.DataFrame) -> pd.Series:
    """Concatenate each club's title, mission and clean tags into one string.

    The three columns are converted to text (missing values become empty
    strings) and joined in that order with a single space between them.
    """
    pieces = [
        df[column].fillna("").astype(str)
        for column in ("title", "mission", "clean_tags")
    ]

    text = pieces[0]
    for piece in pieces[1:]:
        text = text + " " + piece
    return text

def fit_vectorizer(corpus: pd.Series) -> TfidfVectorizer:
    """Create a TF-IDF vectorizer and fit it on the club corpus.

    The vectorizer is configured with max_df=0.7, min_df=2, unigrams and
    bigrams, and the built-in English stop-word list.
    """
    settings = {
        "max_df": 0.7,
        "min_df": 2,
        "ngram_range": (1, 2),
        "stop_words": "english",
    }
    fitted = TfidfVectorizer(**settings)
    fitted.fit(corpus)
    return fitted

# Build one text document per club, fit the vocabulary on those documents,
# then vectorise them; row i of X is the vector for row i of `clubs`.
corpus = build_text(clubs)
vectorizer = fit_vectorizer(corpus)
X = vectorizer.transform(corpus)
# The label previously contained a mis-decoded en dash.
print(f"TF–IDF matrix shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")

TF–IDF matrix shape: (1134, 6672)
Number of features: 6672


# Club Recommender Function

In [8]:
def _tag_match_counts(tag_texts, tags):
    """Count, for each lowercase tag text, how many of ``tags`` occur in it."""
    needles = [str(tag).lower() for tag in tags]
    return np.array(
        [sum(1 for needle in needles if needle in text) for text in tag_texts],
        dtype=float,
    )


def _mmr_select(scores, X, topk, lambda_param=0.7):
    """Pick up to ``topk`` row positions by Maximal Marginal Relevance.

    The highest-scoring row is taken first. Every later pick maximises
    ``lambda_param * score - (1 - lambda_param) * max_sim``, where ``max_sim``
    is the row's largest cosine similarity to any row already picked.
    Returns the chosen positions in selection order.
    """
    total = len(scores)
    limit = min(int(topk), total)
    if limit <= 0:
        return []

    available = np.ones(total, dtype=bool)
    # Running maximum similarity of every row to the picks so far; updating
    # it once per pick avoids one cosine_similarity call per candidate.
    max_sim = np.full(total, -np.inf)
    selected = []

    pick = int(np.argmax(scores))
    while True:
        selected.append(pick)
        available[pick] = False
        if len(selected) >= limit:
            break

        sim_to_pick = cosine_similarity(X[pick:pick + 1], X).ravel()
        max_sim = np.maximum(max_sim, sim_to_pick)

        mmr = lambda_param * scores - (1 - lambda_param) * max_sim
        # Rows already chosen must never win again.
        mmr[~available] = -np.inf
        pick = int(np.argmax(mmr))

    return selected


def recommend_clubs(
    profile: Dict[str, object],
    df: pd.DataFrame,
    X_tfidf,
    vectorizer: TfidfVectorizer,
    tag_categories: Dict,
    topk: int = 20,
    mmr_lambda: float = 0.7
) -> pd.DataFrame:
    """
    Recommend clubs for a student profile.

    Each club's score is the cosine similarity between the profile's
    interest text and the club's TF-IDF vector, plus 0.2 for every merged
    tag found in the club's 'clean_tags' and minus 0.3 for every avoided tag
    found there. The final list is chosen with MMR so that near-duplicate
    clubs do not crowd the results, then sorted by score.

    Rows are addressed by position throughout, so ``df`` may carry any index
    as long as row i of ``df`` corresponds to row i of ``X_tfidf``.

    Args:
        profile: Mapping with optional keys 'interests' (free text),
            'preferred_tags' and 'avoid_tags' (lists of strings; a single
            string is treated as a one-element list). Missing or ``None``
            values are treated as empty.
        df: Club table; 'clean_tags' is used for tag matching when present.
        X_tfidf: TF-IDF matrix with one row per row of ``df``.
        vectorizer: Fitted vectorizer used to embed the interest text.
        tag_categories: Official tag names, passed to ``merge_related_tags``.
        topk: Maximum number of clubs to return; values <= 0 give an empty
            result.
        mmr_lambda: Weight of relevance versus diversity in MMR.

    Returns:
        A DataFrame of the selected clubs, sorted by descending 'score', with
        whichever of 'title', 'clean_tags', 'mission', 'website' and 'score'
        are available.

    Side effects:
        Prints the interests, preferred tags and merged tags.
    """
    # Extract profile, tolerating missing/None fields.
    interests = profile.get("interests") or ""
    preferred_tags = profile.get("preferred_tags") or []
    avoid_tags = profile.get("avoid_tags") or []
    if isinstance(preferred_tags, str):
        preferred_tags = [preferred_tags]
    if isinstance(avoid_tags, str):
        avoid_tags = [avoid_tags]

    # Apply smart tag merging
    all_interests = interests.split() + list(preferred_tags)
    merged_tags = merge_related_tags(all_interests, tag_categories)

    print(f"\n🎯 User interests: {interests}")
    print(f"📋 User preferred tags: {preferred_tags}")
    print(f"✨ Smart-merged tags: {merged_tags}\n")

    # Vectorize interests
    query_vec = vectorizer.transform([interests])
    sims = cosine_similarity(query_vec, X_tfidf).flatten()

    # Lowercased tag text per club, in row order (not index-label order).
    if "clean_tags" in df.columns:
        tag_texts = df["clean_tags"].fillna("").astype(str).str.lower().tolist()
    else:
        tag_texts = [""] * len(df)

    # Tag matching boost and tag avoidance penalty
    tag_boost = 0.2 * _tag_match_counts(tag_texts, merged_tags)
    tag_penalty = -0.3 * _tag_match_counts(tag_texts, avoid_tags)

    # Combined score
    scores = sims + tag_boost + tag_penalty

    # Get diverse recommendations
    top_indices = np.asarray(
        _mmr_select(scores, X_tfidf, topk, mmr_lambda), dtype=int
    )

    result = df.iloc[top_indices].copy()
    result["score"] = scores[top_indices]
    result = result.sort_values("score", ascending=False)

    display_cols = ['title', 'clean_tags', 'mission', 'website', 'score']
    available_cols = [c for c in display_cols if c in result.columns]

    return result[available_cols]


# Example: Sports Enthusiast Profile

In [10]:

# Example profile: a student looking for competitive team sports.
# NOTE(review): avoid_tags entries are matched as substrings of each club's
# clean_tags; confirm "Greek Life" actually occurs there, otherwise it
# penalises nothing.
sports_profile = dict(
    interests="basketball soccer fitness sports competition team athletics",
    preferred_tags=["sports"],
    avoid_tags=["Greek Life"],
)

recs = recommend_clubs(
    sports_profile, clubs, X, vectorizer, tag_categories,
    topk=15, mmr_lambda=0.7,
)

banner = "=" * 80
print(banner)
print("TOP CLUB RECOMMENDATIONS")
print(banner)

for _, row in recs.head(15).iterrows():
    print(f"\n {row['title']}")
    print(f"   Score: {row['score']:.3f}")
    categories = row['clean_tags']
    if categories:
        # Keep long category lists on one line by truncating at 80 chars.
        if len(categories) > 80:
            tags_display = categories[:80] + "..."
        else:
            tags_display = categories
        print(f"   Categories: {tags_display}")
    print(f"   {row['website']}")

# Last expression of the cell: rendered as a table by the notebook.
recs[['title', 'clean_tags', 'score']]


🎯 User interests: basketball soccer fitness sports competition team athletics
📋 User preferred tags: ['sports']
✨ Smart-merged tags: {'Club Sports', 'Athletic & Recreation'}

TOP CLUB RECOMMENDATIONS

 Illinois Men's Soccer Club
   Score: 0.710
   Categories: Athletic & Recreation, Club Sports
   https://one.illinois.edu/IllinoisMensSoccerClub/

 Illinois Men's Club Basketball
   Score: 0.685
   Categories: Athletic & Recreation, Club Sports
   https://one.illinois.edu/IllinoisMensClubBasketball/

 Illinois Women's Club Basketball
   Score: 0.662
   Categories: Athletic & Recreation, Club Sports
   https://one.illinois.edu/IllinoisWomensClubBasketball/

 Illini Women's Soccer Club
   Score: 0.655
   Categories: Athletic & Recreation, Club Sports
   https://one.illinois.edu/IlliniWomensSoccerClub/

 Menace Ultimate Frisbee
   Score: 0.529
   Categories: Athletic & Recreation, Club Sports
   https://one.illinois.edu/MenaceUltimateFrisbee/

 Illini Ridgebacks Quadball Team
   Sco

Unnamed: 0,title,clean_tags,score
554,Illinois Men's Soccer Club,"Athletic & Recreation, Club Sports",0.709961
550,Illinois Men's Club Basketball,"Athletic & Recreation, Club Sports",0.684754
585,Illinois Women's Club Basketball,"Athletic & Recreation, Club Sports",0.661574
524,Illini Women's Soccer Club,"Athletic & Recreation, Club Sports",0.655445
718,Menace Ultimate Frisbee,"Athletic & Recreation, Club Sports",0.529098
495,Illini Ridgebacks Quadball Team,"Athletic & Recreation, Club Sports, Social & L...",0.519135
523,Illini Women's Hockey Club,"Athletic & Recreation, Club Sports",0.513554
536,Illinois Club Baseball,"Athletic & Recreation, Club Sports",0.496502
555,Illinois Men's Volleyball Club,"Athletic & Recreation, Club Sports",0.492723
447,Illini Badminton Intercollegiate Sports Club,"Athletic & Recreation, Club Sports",0.483499
