In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import os
import pickle

# Read the generated student records from disk
dataset_path = r"D:\data\generated_student_recommendations.csv"
df = pd.read_csv(dataset_path)


def _parse_event_list(cell):
    """Convert a list literal stored as text in the CSV into a list (null -> [])."""
    if pd.isnull(cell):
        return []
    return ast.literal_eval(cell)


# 'Past Events' and 'Recommended Events' are stored as list-like strings
df['Past Events'] = df['Past Events'].apply(_parse_event_list)
df['Recommended Events'] = df['Recommended Events'].apply(_parse_event_list)

# Catalogue of available events, written as (event name, domain) pairs so each
# event sits next to its domain; split into two parallel lists below.
_EVENT_CATALOG = [
    ('Hackathon', 'Software Development'),
    ('AI Summit', 'Artificial Intelligence'),
    ('Data Science Workshop', 'Data Science'),
    ('Blockchain Basics', 'Blockchain'),
    ('Tech Talk', 'General Technology'),
    ('Startup Pitch', 'Entrepreneurship'),
    ('Cybersecurity Conference', 'Cybersecurity'),
    ('Innovation Summit', 'Innovation & Research'),
    ('AI Research', 'AI & Machine Learning'),
    ('Coding Marathon', 'Programming'),
    ('UI/UX Workshop', 'Design'),
    ('Smart Agriculture', 'IoT & Smart Systems'),
    ('Cloud Computing Workshop', 'Cloud Technology'),
    ('Data Analysis Bootcamp', 'Data Analytics'),
    ('Robotics Expo', 'Robotics'),
]

# Index i of 'Event Name' corresponds to index i of 'Domain'
events_data = {
    'Event Name': [event for event, _ in _EVENT_CATALOG],
    'Domain': [domain for _, domain in _EVENT_CATALOG],
}

# Maps a student's interest domain to the event domains considered relevant.
# Every value must be one of the strings in events_data['Domain'], because the
# recommender compares these against event domains (not event names).
# The former 'Smart Agriculture' and 'Cloud Computing Workshop' entries were
# event names, so they could never match and have been removed.
domain_mapping = {
    'AI/ML': ['Artificial Intelligence', 'AI & Machine Learning'],
    'Web Development': ['Software Development', 'Programming', 'Design'],
    'Cybersecurity': ['Cybersecurity'],
    'IoT': ['IoT & Smart Systems'],
    'Blockchain': ['Blockchain'],
    'Embedded Systems': ['Robotics', 'Software Development'],
    'AR/VR': ['Innovation & Research', 'Design'],
    'Cloud Computing': ['Cloud Technology'],
}

# Reverse lookup: event domain -> event name.
# A domain that appeared more than once would keep only its last event.
domain_to_events = {
    domain: event
    for event, domain in zip(events_data['Event Name'], events_data['Domain'])
}

# Feature engineering: Combine Interest Domain and Past Events into a single text feature
def create_student_profile(row):
    """Build the text profile used as TF-IDF input for one student.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'Interest Domain' and 'Past Events'.

    Returns:
        The interest domain followed by each past event, separated by single
        spaces. Missing pieces are skipped, so the result may be ''.
    """
    interest = row['Interest Domain']
    past_events = row['Past Events']

    # A blank CSV cell is read as NaN (a float); only real, non-empty text
    # contributes, so the vectorizer never receives a non-string profile.
    parts = [interest] if isinstance(interest, str) and interest else []

    if isinstance(past_events, (list, tuple)):
        # str() guards against non-string items in a parsed list literal.
        parts.extend(str(event) for event in past_events)

    return ' '.join(parts)

# One text profile per student (temporary column, dropped before export)
df['Profile'] = df.apply(create_student_profile, axis=1)

# The vocabulary is learned from student profiles only; event domains are then
# projected into that same TF-IDF space.
tfidf = TfidfVectorizer()
student_tfidf = tfidf.fit_transform(df['Profile'])
event_tfidf = tfidf.transform(list(events_data['Domain']))

# Rows follow df row order, columns follow events_data['Domain'] order
similarity_matrix = cosine_similarity(student_tfidf, event_tfidf)

# Enhanced recommendation function with weighted scoring
def get_enhanced_recommendations(student_idx, cgpa, performance_score, interest_domain):
    """Return up to two recommended event names for one student.

    Args:
        student_idx: Row position of the student in the module-level
            ``similarity_matrix``.
        cgpa: Student CGPA; eligibility requires a value >= 7.0.
        performance_score: Student performance score; eligibility requires
            a value >= 50.
        interest_domain: Key looked up in ``domain_mapping``. An unmapped
            value is treated as an event domain itself.

    Returns:
        A list of at most two event names, highest score first. The list is
        empty when the student is ineligible (including a missing/NaN CGPA or
        score) or when no event domain matches the interest.
    """
    # Written as a positive check on purpose: every comparison with NaN is
    # False, so `cgpa < 7.0 or score < 50` would let missing values through.
    if not (cgpa >= 7.0 and performance_score >= 50):
        return []

    # Get similarity scores for this student
    sim_scores = similarity_matrix[student_idx]

    # Normalize CGPA (5.0-10.0 to 0-1) and Performance Score (30-100 to 0-1)
    cgpa_weight = (cgpa - 5.0) / 5.0
    perf_weight = (performance_score - 30) / 70

    # Combined weight (average of CGPA and Performance Score contribution)
    student_weight = (cgpa_weight + perf_weight) / 2

    # NOTE: the weight is one positive scalar per student (guaranteed by the
    # eligibility gate above), so it rescales the scores but cannot change
    # the order of events for that student.
    weighted_scores = sim_scores * student_weight

    # Map interest domain to possible event domains
    possible_domains = domain_mapping.get(interest_domain, [interest_domain])

    recommended_events = []
    # Walk events from highest to lowest weighted score
    for idx in weighted_scores.argsort()[::-1]:
        event_domain = events_data['Domain'][idx]
        if event_domain not in possible_domains:
            continue
        event_name = domain_to_events[event_domain]
        if event_name not in recommended_events:
            recommended_events.append(event_name)
            if len(recommended_events) >= 2:  # Limit to 2 recommendations
                break

    return recommended_events

def _recommend_for_row(row):
    """Run the recommender for one DataFrame row.

    row.name is the row's index label and is passed as the position in the
    similarity matrix (assumes the default 0..n-1 index from read_csv).
    """
    return get_enhanced_recommendations(
        row.name, row['CGPA'], row['Performance Score'], row['Interest Domain']
    )


def _message_for(events):
    """Pick the advisory text depending on whether any event was recommended."""
    if events:
        return "We recommend attending high-quality events in your preferred domain. Keep up the good work!"
    return "Focus on academics first. If you wish to attend an event, consult your class in-charge."


# Overwrite the recommendations loaded from the CSV with freshly computed ones
df['Recommended Events'] = df.apply(_recommend_for_row, axis=1)

# Keep the Message column in sync with the new recommendations
df['Message'] = df['Recommended Events'].apply(_message_for)

# Output locations; the directory is created if it does not exist yet
output_dir = r"D:\data"
os.makedirs(output_dir, exist_ok=True)
output_csv_path = os.path.join(output_dir, "enhanced_student_recommendations.csv")
output_pkl_path = os.path.join(output_dir, "recommendation_model.pkl")

# 'Profile' was only a temporary TF-IDF input, so it is left out of the export
df.drop(columns=['Profile'], inplace=True)
df.to_csv(output_csv_path, index=False)

# Bundle the fitted vectorizer with the lookup tables and pickle them together
model_data = dict(
    tfidf_vectorizer=tfidf,
    domain_to_events=domain_to_events,
    domain_mapping=domain_mapping,
    events_data=events_data,
)
with open(output_pkl_path, 'wb') as pkl_file:
    pickle.dump(model_data, pkl_file)

# Report where everything was written, then preview the result
print(
    f"Updated dataset with enhanced recommendations saved to: {output_csv_path}",
    f"Recommendation model saved to: {output_pkl_path}",
    "\nSample of the updated data (first 5 rows):",
    sep="\n",
)
print(df.head())


Updated dataset with enhanced recommendations saved to: D:\data\enhanced_student_recommendations.csv
Recommendation model saved to: D:\data\recommendation_model.pkl

Sample of the updated data (first 5 rows):
  Student ID  CGPA             Past Events   Interest Domain  \
0     S00001  8.94                      []   Cloud Computing   
1     S00002  7.20            [Blockchain]  Embedded Systems   
2     S00003  9.32                 [AI/ML]               IoT   
3     S00004  5.25          [AI/ML, AR/VR]   Web Development   
4     S00005  6.05  [Cloud Computing, IoT]               IoT   

   Performance Score          Recommended Events  \
0                 58  [Cloud Computing Workshop]   
1                 47                          []   
2                 40                          []   
3                 49                          []   
4                 59                          []   

                                             Message  
0  We recommend attending high-quality