In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the user-course ratings table.
# NOTE(review): the "(2)" suffix looks like a browser-download duplicate name —
# confirm this is the intended ratings file.
ratings = pd.read_csv('ratings (2).csv')

In [None]:
courses = pd.read_csv('courses.csv')

In [None]:
courses.head()

In [None]:
ratings.head()

In [None]:
# Headline counts for the ratings table: total rows, distinct courses, distinct users.
n_ratings = len(ratings)
n_courses = ratings['courseId'].nunique()
n_users = ratings['userId'].nunique()

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique courses: {n_courses}")
print(f"Number of unique users: {n_users}")
# Averages are plain ratios of the counts above, rounded to 2 decimals.
print(f"Average number of ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average number of ratings per course: {round(n_ratings/n_courses, 2)}")

In [None]:
sns.countplot(x='rating', data=ratings)
plt.title("Distribution of course ratings", fontsize=14)
plt.show()

In [None]:
print(f"Mean global rating: {round(ratings['rating'].mean(),2)}.")

In [None]:
# Mean rating given by each user, then the mean of those per-user means
# (every user counts once, regardless of how many ratings they gave).
# Named distinctly from `mean_ratings`, which a later cell uses for the
# per-course mean-rating DataFrame.
user_mean_ratings = ratings.groupby('userId')['rating'].mean()
print(f"Mean rating per user: {round(user_mean_ratings.mean(),2)}.")

In [None]:
ratings['courseId'].value_counts()

In [None]:
# Attach course metadata to every rating row, then show the ten course
# titles that received the most ratings.
course_ratings = pd.merge(ratings, courses, on='courseId')
course_ratings['Course'].value_counts().iloc[:10]

In [None]:
# Mean rating per course, kept as a one-column DataFrame indexed by courseId.
mean_ratings = ratings.groupby('courseId')[['rating']].mean()
# idxmin returns the index label (the courseId) of the lowest mean rating.
lowest_rated = mean_ratings['rating'].idxmin()
# mean_ratings
# Look up the metadata row of that course.
courses[courses['courseId'] == lowest_rated]

In [None]:
highest_rated = mean_ratings['rating'].idxmax()

courses[courses['courseId'] == highest_rated]

In [None]:
ratings[ratings['courseId']==highest_rated]

In [None]:
# Per-course rating statistics: number of ratings ('count') and mean rating ('mean').
course_stats = ratings.groupby('courseId')['rating'].agg(['count', 'mean'])
course_stats.head()

In [None]:
# Priors for the Bayesian average; both are read as globals by bayesian_avg below.
# C: mean number of ratings per course (weight given to the prior).
C = course_stats['count'].mean()
# m: mean of the per-course mean ratings (the prior rating itself).
m = course_stats['mean'].mean()

print(f"Average number of ratings for a given course: {C:.2f}")
print(f"Average rating for a given course: {m:.2f}")

def bayesian_avg(ratings):
    """Return the Bayesian average of a series of ratings, rounded to 3 decimals.

    The observed ratings are blended with a prior made of the notebook-level
    globals ``C`` (mean number of ratings per course) and ``m`` (mean of the
    per-course mean ratings): ``(C*m + sum) / (C + count)``.

    Args:
        ratings: pandas Series of ratings for a single course.

    Returns:
        The shrunk mean rating, rounded to 3 decimal places.
    """
    n_observed = ratings.count()
    shrunk_mean = (C*m + ratings.sum()) / (C + n_observed)
    return round(shrunk_mean, 3)

In [None]:
# These are the two ratings for the highest rated course (hard-coded here).
# They get their own name so the `highest_rated` course id computed in an
# earlier cell is not overwritten by a Series, which would break re-running
# the cells that filter on that id.
highest_rated_ratings = pd.Series([5,5])
bayesian_avg(highest_rated_ratings)

In [None]:
# Bayesian average per course as a two-column frame (courseId, bayesian_avg),
# then joined onto the per-course statistics.
bayesian_avg_ratings = (
    ratings.groupby('courseId')['rating']
    .agg(bayesian_avg)
    .rename('bayesian_avg')
    .reset_index()
)
course_stats = course_stats.merge(bayesian_avg_ratings, on='courseId')

In [None]:
# Attach course titles. No `on=` is given, so pandas joins on the columns
# the two frames have in common.
course_stats = course_stats.merge(courses[['courseId', 'Course']])
# course_stats.sort_values()

In [None]:
course_stats.sort_values(by='bayesian_avg')

In [None]:
# top 5 highest rated courses
course_stats.sort_values(by='bayesian_avg', ascending=False).head()

In [None]:
# top 5 lowest rated courses
course_stats.sort_values(by='bayesian_avg', ascending=True).head()

In [None]:
# Turn the pipe-delimited interests string of each course into a list.
# Only strings are split, so re-running this cell (when the column already
# holds lists) leaves it unchanged instead of raising AttributeError.
courses['Related_Interests'] = courses['Related_Interests'].apply(
    lambda x: x.split('|') if isinstance(x, str) else x
)
courses.head()

In [None]:
from collections import Counter

# Count how often each interest occurs across all courses; the nested loop
# flattens the per-course entries of Related_Interests into one stream.
interest_frequency = Counter(i for interests in courses['Related_Interests'] for i in interests)

print(f"There are {len(interest_frequency)} Related interests.")

interest_frequency

In [None]:
print("The 5 most common interests: \n", interest_frequency.most_common(5))

In [None]:
# One row per interest: the Counter becomes a single-row frame, is transposed
# so interests are the index, and reset_index moves them into a column.
interest_frequency_df = pd.DataFrame([interest_frequency]).T.reset_index()
interest_frequency_df.columns= ['Related_Interests', 'count']

# Bar chart of the five most frequent interests.
sns.barplot(x='Related_Interests', y='count', data=interest_frequency_df.sort_values(by='count', ascending=False).head())
plt.title("Distribution of Related Interests", fontsize=14)
plt.xticks(rotation=90)
# Render the figure explicitly so the cell does not echo the repr returned
# by plt.xticks as its output.
plt.show()

In [None]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.

    Args:
        df: pandas dataframe containing 3 columns (userId, courseId, rating)

    Returns (in this order):
        X: sparse user x course matrix (scipy CSR) holding the ratings
        user_mapper: dict that maps user id's to user indices
        course_mapper: dict that maps course id's to course indices
        user_inv_mapper: dict that maps user indices to user id's
        course_inv_mapper: dict that maps course indices to course id's
    """
    # np.unique returns the sorted distinct ids and, with return_inverse, the
    # position of every row's id within that sorted array -- exactly the
    # row/column index each rating needs, without a per-row dict lookup.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    course_ids, item_index = np.unique(df["courseId"].to_numpy(), return_inverse=True)

    M = len(user_ids)
    N = len(course_ids)

    user_mapper = dict(zip(user_ids, range(M)))
    course_mapper = dict(zip(course_ids, range(N)))

    user_inv_mapper = dict(zip(range(M), user_ids))
    course_inv_mapper = dict(zip(range(N), course_ids))

    X = csr_matrix((df["rating"].to_numpy(), (user_index, item_index)), shape=(M, N))

    return X, user_mapper, course_mapper, user_inv_mapper, course_inv_mapper

X, user_mapper, course_mapper, user_inv_mapper, course_inv_mapper = create_X(ratings)

In [None]:
X.shape

In [None]:
# Number of cells in the user x course grid, and how many of them are stored in X.
n_total = X.shape[0]*X.shape[1]
# n_ratings is reassigned here to the number of stored entries in X.
n_ratings = X.nnz
# Density is the share of filled cells; sparsity is the share of empty
# cells (1 - density), not the share of filled ones.
density = n_ratings/n_total
sparsity = 1 - density
print(f"Matrix density: {round(density*100,2)}%")
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

In [None]:
n_ratings_per_user = X.getnnz(axis=1)
len(n_ratings_per_user)

In [None]:
n_ratings_per_course = X.getnnz(axis=0)
len(n_ratings_per_course)

In [None]:
print(f"Most rated course has {n_ratings_per_course.max()} ratings.")
print(f"Least rated course has {n_ratings_per_course.min()} ratings.")

In [None]:
# Side-by-side density plots: ratings per user (left) and ratings per course (right).
fig, (ax_user, ax_course) = plt.subplots(1, 2, figsize=(16,4))

sns.kdeplot(n_ratings_per_user, fill=True, ax=ax_user)
ax_user.set_xlim(0)
ax_user.set_title("Number of Ratings Per User", fontsize=14)
ax_user.set_xlabel("number of ratings per user")
ax_user.set_ylabel("density")

sns.kdeplot(n_ratings_per_course, fill=True, ax=ax_course)
ax_course.set_xlim(0)
ax_course.set_title("Number of Ratings Per course", fontsize=14)
ax_course.set_xlabel("number of ratings per course")
ax_course.set_ylabel("density")

plt.show()

In [None]:
from sklearn.neighbors import NearestNeighbors

def find_similar_courses(course_id, X, course_mapper, course_inv_mapper, k, metric='cosine'):
    """
    Finds k-nearest neighbours for a given course id.

    Args:
        course_id: id of the course of interest
        X: user-item utility matrix; it is transposed internally so that
           each row is one course
        course_mapper: dict that maps course id's to course indices
        course_inv_mapper: dict that maps course indices to course id's
        k: number of similar courses to retrieve
        metric: distance metric for kNN calculations

    Output: returns list of up to k similar course ID's, nearest first,
        never containing course_id itself (fewer than k only when there
        are not enough other courses)
    """
    X = X.T

    course_ind = course_mapper[course_id]
    course_vec = X[course_ind]
    if isinstance(course_vec, np.ndarray):
        course_vec = course_vec.reshape(1, -1)
    # Ask for k+1 neighbours because the query course is part of the fitted
    # data and is normally returned as well; cap at the number of courses
    # so a large k does not make kneighbors raise.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(course_vec, return_distance=False)
    # Drop the query course by index rather than by position: with tied
    # distances (e.g. a course with an identical vector) it is not
    # guaranteed to come back first.
    neighbour_ids = [
        course_inv_mapper[n]
        for n in neighbour.ravel().tolist()
        if n != course_ind
    ]
    return neighbour_ids[:k]

In [None]:
similar_courses = find_similar_courses(1, X, course_mapper, course_inv_mapper, k=10)
similar_courses

In [None]:
# Item-item neighbours of one course using the cosine metric.
# Lookup from courseId to course title, used to print readable names.
course_titles = dict(zip(courses['courseId'], courses['Course']))
# course_titles
course_id = 1

similar_courses = find_similar_courses(course_id, X, course_mapper, course_inv_mapper, metric='cosine', k=10)
course_title = course_titles[course_id]

print(f"Because you selected {course_title}:")
for i in similar_courses:
  print(course_titles[i])

Because you selected Bachelor of Accounting (B.Acc):
MSc in Accounting and Finance
Certificate in Accountancy
MBA in International Business
Ordinary Diploma in Taxation (ODTM)
MSc in Finance
MBA in International Trade and Logistic Operations
Postgraduate Diploma in Financial Management (PGDFM)
MSc in Finance and Investment
Postgraduate Diploma in Tax Management (PGDTM)


In [None]:
# Same lookup as the cosine cell, but with the euclidean metric; 'manhattan'
# can be substituted for another view.
course_titles = dict(zip(courses['courseId'], courses['Course']))
# course_titles
course_id = 1

similar_courses = find_similar_courses(course_id, X, course_mapper, course_inv_mapper, metric='euclidean', k=10)
course_title = course_titles[course_id]

print(f"Because you selected {course_title}:")
for i in similar_courses:
  print(course_titles[i])

Because you selected Bachelor of Accounting (B.Acc):
Certificate in Accountancy
Ordinary Diploma in Social Protection (ODSP)
Basic Technician Certificate in Social Protection (BTCSP)
Certificate in Banking and Finance
Ordinary Diploma in Banking (ODB)
Basic Technician Certificate in Accountancy (BTCA)
MSc in Information Technology and Management (MSc IT & Management)
MSc in Social Protection Policy and Development (MSc SPPD)
Postgraduate Diploma in Tax Management (PGDTM)


In [None]:
# Distinct interests across all courses.
interests = set(i for I in courses['Related_Interests'] for i in I)
# interests
# Add one 0/1 indicator column per interest. Iterating in sorted order keeps
# the column order identical across kernel restarts; the iteration order of
# a set of strings can differ from one Python process to the next.
for i in sorted(interests):
    courses[i] = courses.Related_Interests.transform(lambda x: int(i in x))

# Keep only the indicator columns as the content-based feature matrix.
course_interests = courses.drop(columns=['courseId', 'Course','Related_Interests', 'Faculty', 'Level'])

In [None]:
course_interests.head()

Unnamed: 0,Database Design,Software Installation,Bookkeeping,Risk Assessment,Employee Development,Logistics,Web Development,Operations Management,Labor Relations,Retail Banking,...,Algorithms,Financial Planning,Auditing,Performance Management,Actuarial Science,Accounting,Organizational Behavior,Supply Chain Management,IT Project Management,Corporate Finance
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all course interest vectors; with a
# single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(course_interests)
print(f"Dimensions of our interests cosine similarity matrix: {cosine_sim.shape}")

Dimensions of our interests cosine similarity matrix: (42, 42)


In [None]:
pip install Fuzzywuzzy

Collecting Fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: Fuzzywuzzy
Successfully installed Fuzzywuzzy-0.18.0


In [None]:
from fuzzywuzzy import process

def course_finder(title):
    """Return the course title that best fuzzy-matches the given text.

    Args:
        title: free-text query, e.g. a keyword or a partial course name.

    Returns:
        The entry of ``courses['Course']`` ranked best by
        ``fuzzywuzzy.process.extractOne``.
    """
    candidates = list(courses['Course'])
    # extractOne returns the best match first, followed by its score.
    best_title, *_ = process.extractOne(title, candidates)
    return best_title

In [None]:
title = course_finder('finance')
title

'Bachelor of Banking and Finance (BBF)'

In [None]:
# Map each course title to its row position in `courses`. Positions (not
# index labels) are used because the lookup feeds row-based access into
# cosine_sim and `.iloc`; with a default RangeIndex the two coincide.
course_idx = dict(zip(courses['Course'], range(len(courses))))
idx = course_idx[title]
# Report the title that was actually selected instead of a fixed label.
print(f"course index for {title}: {idx}")

course index for Account: 1


In [None]:
n_recommendations=10
# (row position, similarity) for every course except the selected one.
# The selected course is excluded by position rather than by dropping the
# first sorted entry: a course with identical interests also scores 1.0
# and may sort ahead of it.
sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores=sim_scores[:n_recommendations]
sim_scores


[(23, 0.7999999999999999),
 (20, 0.39999999999999997),
 (22, 0.39999999999999997),
 (0, 0.19999999999999998),
 (2, 0.19999999999999998),
 (5, 0.19999999999999998),
 (21, 0.19999999999999998),
 (28, 0.19999999999999998),
 (36, 0.19999999999999998),
 (3, 0.0)]

In [None]:
similar_courses= [i[0] for i in sim_scores]
similar_courses

[23, 20, 22, 0, 2, 5, 21, 28, 36, 3]

In [None]:
print(f"Because you selected {title}:")
courses['Course'].iloc[similar_courses]

Because you selected Bachelor of Banking and Finance (BBF):


23    Postgraduate Diploma in Financial Management (...
20                        MSc in Finance and Investment
22                                       MSc in Finance
0                        Bachelor of Accounting (B.Acc)
2                     Ordinary Diploma in Banking (ODB)
5     Basic Technician Certificate in Banking and Fi...
21                        MSc in Accounting and Finance
28    Postgraduate Diploma in Business Administratio...
36                   Certificate in Banking and Finance
3                  Ordinary Diploma in Accounting (ODA)
Name: Course, dtype: object

In [None]:
def get_content_based_recommendations(title_string, n_recommendations=10):
    """Print the courses whose interests are most similar to a selected course.

    Args:
        title_string: free text used to pick the course; it is resolved to
            an actual course title with course_finder.
        n_recommendations: maximum number of courses to print.

    Returns:
        None. The selected title and the recommended course titles are printed.
    """
    title = course_finder(title_string)
    idx = course_idx[title]
    # Exclude the selected course by position instead of dropping the first
    # sorted entry: a course with identical interests ties at the top score
    # and may sort ahead of it, which would recommend the course to itself.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[:n_recommendations]
    similar_courses = [i[0] for i in sim_scores]
    print(f"Because you selected {title}:")
    print(courses['Course'].iloc[similar_courses])

In [None]:
# provide the course keywords and how many recommendations you would like to
# generate
get_content_based_recommendations('it', 5)

Because you selected Bachelor of Science in Information Technology (BSc IT):
8     Ordinary Diploma in Information Technology (ODIT)
34    MSc in Information Technology and Management (...
10    Basic Technician Certificate in Computing and ...
38    Certificate in Computer and Information Techno...
41    Basic Technician Certificate in Computing and ...
Name: Course, dtype: object


In [None]:
from sklearn.decomposition import TruncatedSVD

# Compress each course (a row of X.T) into 20 latent features.
# random_state is fixed because TruncatedSVD uses a randomized solver by
# default; without it the factors, and the neighbours derived from them,
# change from run to run.
svd = TruncatedSVD(n_components=20, n_iter=10, random_state=42)
Q = svd.fit_transform(X.T)
Q.shape

(42, 20)

In [None]:
# Neighbours of one course in the SVD-reduced space.
course_id = 1
# find_similar_courses transposes its matrix argument, so Q.T is passed in
# order for the rows it searches over to be the rows of Q.
similar_courses = find_similar_courses(course_id, Q.T, course_mapper, course_inv_mapper, metric='cosine', k=10)
course_title = course_titles[course_id]

print(f"Because you selected {course_title} similar courses are:")
for i in similar_courses:
    print(course_titles[i])

Because you selected Bachelor of Accounting (B.Acc) similar courses are:
MSc in Finance and Investment
Postgraduate Diploma in Financial Management (PGDFM)
Certificate in Accountancy
Ordinary Diploma in Social Protection (ODSP)
Certificate in Taxation
Bachelor of Science in Information Technology (BSc IT)
Postgraduate Diploma in Tax Management (PGDTM)
Certificate in Computer and Information Technology
MSc in Social Protection Policy and Development (MSc SPPD)
