# Objective: use association rules to recommend suitable course bundles to students and increase the likelihood of purchase

In [5]:
%matplotlib inline

from pathlib import Path
import heapq
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

## you may need to install mlxtend
import sys
!{sys.executable} -m pip install mlxtend

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules



In [6]:
# Load the course-purchase matrix and index every row by a sequential student id.
course_df = (
    pd.read_csv('Coursetopics.csv')
    .assign(Student=lambda frame: np.arange(len(frame)))
    .set_index('Student')
)
course_df

Unnamed: 0_level_0,Intro,DataMining,Survey,Cat Data,Regression,Forecast,DOE,SW
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,1,0,1,1,0,0,1
3,1,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
360,0,0,0,1,0,0,0,0
361,0,1,0,1,0,0,0,1
362,0,0,0,0,0,0,0,1
363,0,0,0,1,0,0,0,0


In [7]:
# Mine frequent itemsets. The support threshold is deliberately low (2%) because
# only a small share of students bought the larger course combinations.
# apriori works on a boolean one-hot frame; casting the 0/1 integers up front
# avoids mlxtend's warning about non-bool input.
itemsets = apriori(course_df.astype(bool), min_support=0.02, use_colnames=True)

# Convert the itemsets into association rules, keeping rules with confidence >= 10%.
rules = association_rules(itemsets, metric='confidence', min_threshold=0.10)

# Keep only the columns discussed below.
rules = rules.drop(columns=['antecedent support', 'consequent support', 'conviction'])

# Rank by confidence. Lift (how much more likely the consequent is under the rule
# than under random selection) breaks ties, so the order is deterministic.
# The previous stand-alone sort by lift discarded its result and had no effect.
rules.sort_values(by=['confidence', 'lift'], ascending=False).head(15)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
102,"(Regression, DOE)",(Intro),0.024658,0.818182,2.073864,0.012768
106,"(SW, Regression)",(Intro),0.038356,0.7,1.774306,0.016739
76,"(SW, Survey)",(Intro),0.032877,0.666667,1.689815,0.013421
113,"(Intro, DOE)",(SW),0.030137,0.647059,2.915759,0.019801
116,"(Regression, DataMining)",(Cat Data),0.027397,0.625,3.001645,0.01827
62,"(Regression, DataMining)",(Intro),0.027397,0.625,1.584201,0.010103
82,"(Regression, Cat Data)",(Intro),0.032877,0.6,1.520833,0.011259
123,"(Survey, Forecast)",(Cat Data),0.021918,0.571429,2.744361,0.013931
72,"(Survey, Forecast)",(Intro),0.021918,0.571429,1.448413,0.006786
118,"(DataMining, Cat Data)",(Regression),0.027397,0.555556,2.668129,0.017129


In [8]:
! pip install scikit-surprise

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 7.7MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617551 sha256=f3092c98f842b99d0f5e5835dd328fce9a08000420a654da3c386e40e5462d1f
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [9]:
def convert(data):
    """Reshape a wide user x item frame into long (userID, itemID, rating) rows.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per user (taken from the index) and one column per item.

    Returns
    -------
    pandas.DataFrame
        Columns ``userID``, ``itemID`` and ``rating``, one row per
        (user, item) cell of ``data``.
    """
    stacked = data.stack()
    # Name the value and the two index levels before flattening, so the
    # resulting columns come out already labelled.
    labelled = stacked.rename('rating').rename_axis(['userID', 'itemID'])
    return labelled.reset_index()

In [10]:
# Reshape the wide student x course matrix into one (userID, itemID, rating) row per pair.
course_df_converted = convert(course_df)
course_df_converted

Unnamed: 0,userID,itemID,rating
0,0,Intro,1
1,0,DataMining,1
2,0,Survey,0
3,0,Cat Data,0
4,0,Regression,0
...,...,...,...
2915,364,Cat Data,0
2916,364,Regression,1
2917,364,Forecast,0
2918,364,DOE,0


In [11]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` highest estimates.

    Parameters
    ----------
    predictions : iterable
        Prediction objects exposing ``uid`` and ``est`` attributes.
    n : int, default 10
        Maximum number of predictions to keep per user.

    Returns
    -------
    defaultdict(list)
        Maps each user id to that user's predictions, ordered from the
        highest to the lowest ``est``.
    """
    # Bucket every prediction under the user it belongs to.
    grouped = defaultdict(list)
    for pred in predictions:
        grouped[pred.uid].append(pred)

    # Trim each bucket down to its n best-scoring predictions.
    ranked = defaultdict(list)
    for user, user_preds in grouped.items():
        ranked[user] = heapq.nlargest(n, user_preds, key=lambda pred: pred.est)
    return ranked

In [12]:
# Convert the data set into the format required by the surprise package.
# The columns must correspond to user id, item id and rating (in that order).

# Ratings are binary purchase flags, so the rating scale runs from 0 to 1.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(course_df_converted[['userID', 'itemID', 'rating']], reader)

# Hold out 25% of the ratings for testing; random_state makes the split repeatable.
trainset, testset = train_test_split(data, test_size=0.25, random_state=1)


# User-based filtering

In [13]:
# User-based KNN: neighbours are chosen by cosine similarity between users.
sim_options = dict(name='cosine', user_based=True)
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Estimate ratings for the held-out (user, item) pairs, then keep each
# user's four highest estimates.
predictions = algo.test(testset)
top_n = get_top_n(predictions, n=4)

# Show the recommendations for the first five users only.
print()
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:5]:
    print(f'User {uid}')
    for prediction in user_ratings:
        print(f'  Item {prediction.iid} ({prediction.est:.2f})', end='')
    print()
print()



Computing the cosine similarity matrix...
Done computing similarity matrix.

Top-4 recommended items for each user
User 91
  Item Cat Data (0.21)  Item Regression (0.21)  Item DOE (0.21)  Item Intro (0.21)
User 282
  Item Intro (0.21)  Item Forecast (0.21)  Item Regression (0.21)  Item Survey (0.21)
User 66
  Item Regression (0.13)  Item Survey (0.12)
User 285
  Item Cat Data (0.12)  Item DataMining (0.12)
User 95
  Item Forecast (0.24)  Item Survey (0.22)



  sim = construction_func[name](*args)


# For student groups who have purchased the Regression and Forecast courses, what should we recommend to them? 

In [14]:
# Build a one-row sample for a student who purchased only the Regression and
# Forecast courses (1 = purchased, 0 = not purchased), using the same course
# columns as course_df.

sample_df = pd.DataFrame({'Intro':[0], 'DataMining': [0], 'Survey': [0], 'Cat Data': [0], 'Regression':[1], 'Forecast': [1], 'DOE':[0], 'SW': [0]})
sample_df

Unnamed: 0,Intro,DataMining,Survey,Cat Data,Regression,Forecast,DOE,SW
0,0,0,0,0,1,1,0,0


In [15]:
# Reshape the sample student into long (userID, itemID, rating) rows; the
# default row label 0 becomes the userID.
sample_df_converted = convert(sample_df)
sample_df_converted

Unnamed: 0,userID,itemID,rating
0,0,Intro,0
1,0,DataMining,0
2,0,Survey,0
3,0,Cat Data,0
4,0,Regression,1
5,0,Forecast,1
6,0,DOE,0
7,0,SW,0


In [16]:
# Wrap the sample student's ratings in a surprise Dataset on the same 0-1 scale.
# Note that this rebinds `reader`, `data` and `trainset` from the earlier cells.
reader = Reader(rating_scale=(0,1))
data = Dataset.load_from_df(sample_df_converted[['userID', 'itemID', 'rating']], reader)

# NOTE(review): `testet` looks like a typo for `testset`; as written, the earlier
# `testset` is left untouched and the held-out rating lands in `testet`.
# NOTE(review): per the Surprise docs an integer test_size is an absolute count,
# so one of the sample's ratings is held out of `trainset` - confirm this is intended.
trainset, testet = train_test_split(data, test_size=1, random_state=1)

In [17]:
# Score every course for the sample student with the fitted user-based model.
# The test pairs are built from the full sample dataset: `trainset` came from
# train_test_split(..., test_size=1), which holds one rating out, so using it
# would silently skip one of the courses.
# NOTE(review): the sample student's userID is 0, which is also the id of an
# existing student in course_df. The fitted model may therefore treat the sample
# as that existing student instead of a new one - confirm this is intended.
predictions = algo.test(data.build_full_trainset().build_testset())