In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load data
df_cb = pd.read_csv('BrainyDS5.csv')


In [26]:


# Convert 'field of interest' column to lowercase
df_cb['field of interest'] = df_cb['field of interest'].apply(lambda x: x.lower())

# Define TF-IDF vectorizer for content-based filtering
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_cb['field of interest'])

# Compute truncated SVD matrix for content-based filtering
svd = TruncatedSVD(n_components=min(50, tfidf_matrix.shape[1]))
svd_matrix = svd.fit_transform(tfidf_matrix)

# Scale features for content-based filtering
scaler = StandardScaler()
scaled_matrix = scaler.fit_transform(svd_matrix)

# Compute cosine similarity for content-based filtering
cosine_sim = cosine_similarity(scaled_matrix)

# Define collaborative filtering model
df_cf = df_cb[['STUDID','Course Selected', 'Course Rating']]

# Compute mean rating for each course
course_ratings = df_cf.groupby('Course Selected')['Course Rating'].mean().reset_index()

# Merge course ratings with collaborative filtering dataframe
df_cf = pd.merge(df_cf, course_ratings, on='Course Selected', how='left')

# Compute difference between each user's rating and the mean rating for each course
#df_cf['Rating Diff'] = df_cf['Course Rating'] - df_cf['Course Rating_y']
# Compute difference between each user's rating and the mean rating for each course
df_cf['Rating Diff'] = df_cf['Course Rating_x'] - df_cf['Course Rating_y']
# Define function to recommend courses based on user ID
def recommend_courses(user_id, top_n=5):
    # Filter collaborative filtering dataframe to only include user's ratings
    user_ratings = df_cf[df_cf['STUDID'] == user_id][['Course Selected', 'Rating Diff']]
    
    # Compute weighted rating for each course
    user_ratings['Weighted Rating'] = user_ratings['Rating Diff'] * 0.5 + 2.5
    
    # Merge collaborative filtering dataframe with cosine similarity matrix
    df = pd.merge(df_cb, pd.DataFrame(cosine_sim), left_index=True, right_index=True)
    
    # Compute weighted average of cosine similarity scores for each course
    df['Weighted Similarity'] = df.apply(lambda x: x[user_id + 1] * 0.5 + 0.5 if x['STUDID'] == user_id else x[user_id + 1], axis=1)
    
    # Compute final score for each course
    df['Final Score'] = df['Weighted Similarity'] * df['Difficulty Level'] * df['Course Rating'] * df['Modes of Learning'].apply(lambda x: 2 if x == 'Online' else 1) * df['field of interest'].apply(lambda x: user_ratings[user_ratings['Course Selected'] == x]['Weighted Rating'].values[0] if x in user_ratings['Course Selected'].tolist() else 2.5)
    
    # Sort courses by final score and return top N
    return df.sort_values('Final Score', ascending=False).head(top_n)['Course Selected']


In [28]:
df_cf

Unnamed: 0,STUDID,Course Selected,Course Rating_x,Course Rating_y,Rating Diff
0,1,Young App Developer,5,3.691489,1.308511
1,1,Young Entrepreneur,5,3.713514,1.286486
2,1,Young Honcho,1,3.637838,-2.637838
3,1,Artificial Intelligence for Non Programmers,5,3.611111,1.388889
4,2,Tech Innovator,4,3.625000,0.375000
...,...,...,...,...,...
9515,3200,Electronics for Kids,5,3.820690,1.179310
9516,3200,Game Development - Microsoft Arcade,5,3.750000,1.250000
9517,3200,Young Data Scientist,5,3.670213,1.329787
9518,3201,Young Honcho,3,3.637838,-0.637838


In [29]:
recommend_courses(111, top_n=5)

9353    Virtual Robotics & Mechatronics
3996                        Young Coder
9189                    App Development
1911                    C++ Programming
8668                   Drone Technology
Name: Course Selected, dtype: object