<a href="https://colab.research.google.com/github/Aishwaryajakka/CourseRecommender/blob/machine_learning/scripts/Surprise_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# load_data.py
!pip install scikit-surprise
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import NormalPredictor
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from collections import defaultdict
# This is to add in the non-taken classes
from sklearn.utils.extmath import cartesian

# [x] Read the taken_course_c table plus the grade and course lookup tables.

# TODO: replace the CSV reads with a database connection for taken_course_c.
df = pd.read_csv("/content/drive/My Drive/Course Recommender/data/cleaned_data/taken_course_c.csv")
grade_key = pd.read_csv("/content/drive/My Drive/Course Recommender/data/cleaned_data/grade_c.csv")
course_key = pd.read_csv("/content/drive/My Drive/Course Recommender/data/cleaned_data/course_c.csv")

# One row per distinct course name; the position of that row after the first
# reset_index becomes the surrogate key "course_index".
course_ix = (
    course_key
    .drop_duplicates(subset="name")
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={"index": "course_index"})
    .loc[:, ["name", "course_index"]]
)

# Attach the surrogate key to every course row that shares the name.
course_key = pd.merge(course_key, course_ix, how="left", on="name")
print(df.shape)

# Clean the enrolment rows in a single pipeline:
#   1. keep grade codes starting with A/B/C/D/F, except those starting with DNG
#   2. attach quality_points from the grade table
#   3. attach course_index (used in place of the unreliable course_id)
#   4. drop rows whose course_id has no match in the course_c table
#      (to report them, export the rows with a null course_index before step 4)
#   5. sort by grade_code and keep the first row per (student_id, course_index);
#      grade_code is compared as text, so "first" follows string sort order
#   6. rebuild a clean 0..n-1 index
df = (
    df.loc[
        lambda frame: frame["grade_code"].str.startswith(('A','B','C','D','F'))
        & ~frame["grade_code"].str.startswith('DNG')
    ]
    .merge(grade_key[["grade_code", "quality_points"]], how="left", on="grade_code")
    .merge(course_key[["course_id", "course_index"]], how="left", on="course_id")
    .dropna(subset=["course_index"])
    .sort_values("grade_code")
    .drop_duplicates(subset=("student_id","course_index"))
    .reset_index(drop=True)
)

# Candidate table for the "not taken" rows: every (student, course) pairing
# of the students and courses present in df, each with rating 0. At this point
# it still contains the pairs that WERE taken; those are removed further down
# when the table is joined against the real enrolments.
unique_students = np.unique(df["student_id"])
unique_course_index = np.unique(df["course_index"])
df_not_taken = pd.DataFrame(
    cartesian((unique_students, unique_course_index)),
    columns=("student_id","course_index"),
).assign(rating=0)
#outer_join = df.merge(df_not_taken, on=("student_id","course_index"), how = 'outer', indicator = True)
#anti_join = outer_join[~(outer_join._merge == 'both')].drop('_merge', axis = 1)
#keep_right = outer_join[(outer_join._merge == 'right')].drop('_merge', axis = 1)

# Sanity check: print a header, then the number of null values per column.
print("## Check for missing data.", df.isnull().sum(), sep="\n")

# Build the four rating variants used for modelling.
#
#   df_grades    - Option 1:   rating = quality points of the grade earned
#   df_taken     - Option 2:   rating = 1 for every course taken
#   df_grades_0  - Option 1.B: df_grades plus a 0 rating for every untaken pair
#   df_taken_0   - Option 2.B: df_taken  plus a 0 rating for every untaken pair

# Option 1: Use Grades as Rating
df_grades = df.copy()
df_grades["rating"] = df_grades["quality_points"]

# Option 2: Use Taken 1/0 as Rating
df_taken = df.copy()
df_taken["rating"] = 1

# Anti-join: keep only the (student_id, course_index) pairs of df_not_taken
# that have no enrolment row in df. The merge indicator is used rather than
# testing a joined rating for null, so a taken course whose quality_points is
# missing is not mistaken for "not taken". The result is shared by both
# *_0 variants, so it is computed once.
pair_keys = ["student_id", "course_index"]
not_taken = df_not_taken.merge(df[pair_keys], how="left", on=pair_keys, indicator=True)
not_taken = not_taken[not_taken["_merge"] == "left_only"].drop(columns="_merge")

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Option 1.B: Add not-taking a course as a negative rating 0 (Zero)
df_grades_0 = pd.concat([df_grades, not_taken], ignore_index=True)

# Option 2.B: Add not-taking a course as a negative rating 0 (Zero)
df_taken_0 = pd.concat([df_taken, not_taken], ignore_index=True)

(65980, 5)
## Check for missing data.
level_id          0
term_id           0
course_id         0
grade_code        0
student_id        0
quality_points    0
course_index      0
dtype: int64


In [4]:
# Quick look at the grades-as-rating frame: shape, rating dtype,
# rating distribution and the first rows.
print(
    "######## Quality Points as Rating Approach #########",
    df_grades.shape,
    df_grades["rating"].dtype,
    df_grades["rating"].value_counts(),
    df_grades.head(),
    sep="\n",
)

######## Quality Points as Rating Approach #########
(11111, 8)
float64
4.00    4920
3.75    2099
3.00    1673
3.25    1518
2.75     467
2.00     197
2.25     143
0.25      46
1.00      22
1.75      17
1.25       5
0.75       4
Name: rating, dtype: int64
  level_id  term_id  course_id  ... quality_points  course_index  rating
0       GR     2171     113097  ...            4.0         188.0     4.0
1       GR     2131     150362  ...            4.0         294.0     4.0
2       GR     2144     113149  ...            4.0         225.0     4.0
3       GR     2144     113138  ...            4.0         220.0     4.0
4       GR     2134     113144  ...            4.0         189.0     4.0

[5 rows x 8 columns]


In [5]:
# Same summary for the taken=1 frame: shape, rating dtype,
# rating distribution and the first rows.
print(
    "######## Taken=1 as Rating Approach #########",
    df_taken.shape,
    df_taken["rating"].dtype,
    df_taken["rating"].value_counts(),
    df_taken.head(),
    sep="\n",
)

######## Taken=1 as Rating Approach #########
(11111, 8)
int64
1    11111
Name: rating, dtype: int64
  level_id  term_id  course_id  ... quality_points  course_index  rating
0       GR     2171     113097  ...            4.0         188.0       1
1       GR     2131     150362  ...            4.0         294.0       1
2       GR     2144     113149  ...            4.0         225.0       1
3       GR     2144     113138  ...            4.0         220.0       1
4       GR     2134     113144  ...            4.0         189.0       1

[5 rows x 8 columns]


In [6]:
# Summary of the grades frame extended with the 0-rated not-taken pairs:
# shape, rating dtype, rating distribution and the first rows.
print(
    "######## Quality Points as Rating Approach #########",
    df_grades_0.shape,
    df_grades_0["rating"].dtype,
    df_grades_0["rating"].value_counts(),
    df_grades_0.head(),
    sep="\n",
)

######## Quality Points as Rating Approach #########
(255360, 8)
float64
0.00    244249
4.00      4920
3.75      2099
3.00      1673
3.25      1518
2.75       467
2.00       197
2.25       143
0.25        46
1.00        22
1.75        17
1.25         5
0.75         4
Name: rating, dtype: int64
  level_id  term_id  course_id  ... quality_points  course_index  rating
0       GR   2171.0   113097.0  ...            4.0         188.0     4.0
1       GR   2131.0   150362.0  ...            4.0         294.0     4.0
2       GR   2144.0   113149.0  ...            4.0         225.0     4.0
3       GR   2144.0   113138.0  ...            4.0         220.0     4.0
4       GR   2134.0   113144.0  ...            4.0         189.0     4.0

[5 rows x 8 columns]


In [7]:
# Summary of the taken=1 frame extended with the 0-rated not-taken pairs:
# shape, rating dtype, rating distribution and the first rows.
# The banner previously repeated the "Quality Points" label from the grades
# cells even though this cell reports on df_taken_0.
print("######## Taken=1 / Not-Taken=0 as Rating Approach #########")
print(df_taken_0.shape)
print(df_taken_0.rating.dtype)
print(df_taken_0.rating.value_counts())
print(df_taken_0.head())

######## Quality Points as Rating Approach #########
(255360, 8)
int64
0    244249
1     11111
Name: rating, dtype: int64
  level_id  term_id  course_id  ... quality_points  course_index  rating
0       GR   2171.0   113097.0  ...            4.0         188.0       1
1       GR   2131.0   150362.0  ...            4.0         294.0       1
2       GR   2144.0   113149.0  ...            4.0         225.0       1
3       GR   2144.0   113138.0  ...            4.0         220.0       1
4       GR   2134.0   113144.0  ...            4.0         189.0       1

[5 rows x 8 columns]


In [0]:
# Load both rating variants into Surprise.
# Each Reader is given only the rating scale of its variant:
# 0-1 for the taken flag, 0.0-4.0 for quality points.
reader_taken = Reader(rating_scale=(0,1))
reader_grades = Reader(rating_scale=(0.0,4.0))

# Surprise is handed the three columns in user, item, rating order.
data_taken = Dataset.load_from_df(
    df_taken.loc[:, ['student_id', 'course_index', 'rating']],
    reader_taken,
)
data_grades = Dataset.load_from_df(
    df_grades.loc[:, ['student_id', 'course_index', 'rating']],
    reader_grades,
)

In [9]:
#cross_validate(BaselineOnly(), data_grades, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5105  0.5137  0.4949  0.5366  0.5110  0.5133  0.0134  
MAE (testset)     0.3968  0.3972  0.3894  0.4018  0.4041  0.3979  0.0051  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'fit_time': (0.013406991958618164,
  0.013029098510742188,
  0.013040781021118164,
  0.012502431869506836,
  0.012715816497802734),
 'test_mae': array([0.39684699, 0.39718995, 0.38935763, 0.40176157, 0.40414909]),
 'test_rmse': array([0.51051509, 0.51365214, 0.49491665, 0.5366471 , 0.51099639]),
 'test_time': (0.009784936904907227,
  0.008747577667236328,
  0.008242368698120117,
  0.008115768432617188,
  0.008207559585571289)}

In [10]:
#cross_validate(NormalPredictor(), data_grades, cv=2)

{'fit_time': (0.007627010345458984, 0.00759577751159668),
 'test_mae': array([0.54773416, 0.55162873]),
 'test_rmse': array([0.72546806, 0.72454103]),
 'test_time': (0.036757469177246094, 0.03740048408508301)}

In [0]:
# Hyper-parameter search used to compare the rating approaches:
# a 2 x 2 x 2 grid over epochs, learning rate and regularisation for SVD,
# scored on RMSE and MAE with 3-fold cross-validation.
param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)
gs = GridSearchCV(SVD, param_grid, cv=3, measures=['rmse', 'mae'])

In [12]:
# Run the grid search on the quality-points ratings.
gs.fit(data_grades)

# Printed in order: best RMSE, best MAE, then the parameter combination
# that produced the best RMSE and the one that produced the best MAE.
print(
    gs.best_score['rmse'],
    gs.best_score['mae'],
    gs.best_params['rmse'],
    gs.best_params['mae'],
    sep="\n",
)

0.5216315747914478
0.41182017046108976
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


Best SVD option for using Grades as rating

0.4896321029052156
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}

Best SVD option for using Taken=1 as rating.

0.014513730229995843
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}

In [13]:
# Re-run the same grid search, this time on the taken=1 ratings.
gs.fit(data_taken)

# Printed in order: best RMSE, best MAE, then the parameter combination
# that produced the best RMSE and the one that produced the best MAE.
print(
    gs.best_score['rmse'],
    gs.best_score['mae'],
    gs.best_params['rmse'],
    gs.best_params['mae'],
    sep="\n",
)

0.012035013704141048
0.0029405187154376952
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}


In [0]:
# Function to get Top-n recommendations
def get_top_n(predictions, n=10):
    '''Pick the n highest-estimated items for every user.

    Args:
        predictions(list of Prediction objects): Predictions as returned by
            the test method of an algorithm. Each one is unpacked as a
            5-tuple (user id, item id, true rating, estimate, details).
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
    A dict keyed by user (raw) id whose values are lists of at most n
        tuples [(raw item id, rating estimation), ...], ordered from the
        highest estimate to the lowest. Items with equal estimates keep the
        order in which they appeared in predictions.
    '''

    # Group the (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each user's pairs by estimate, best first, and keep the first n.
    for uid in list(per_user):
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user