In [1]:
import numpy as np
import pandas as pd
import sys

In [2]:
user_rating = pd.read_csv('ratings.dat',delimiter='::', names=['UserID', 'MovieID', 'Rating', 'DateTime'], header=None)

  if __name__ == '__main__':


In [3]:
user_rating['DateTime'] = pd.to_datetime(user_rating['DateTime'], unit='s')

In [4]:
user_rating.head()

Unnamed: 0,UserID,MovieID,Rating,DateTime
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [5]:
user_rating["Favourable"] = user_rating["Rating"] > 3

In [6]:
user_rating.head()

Unnamed: 0,UserID,MovieID,Rating,DateTime,Favourable
0,1,1193,5,2000-12-31 22:12:40,True
1,1,661,3,2000-12-31 22:35:09,False
2,1,914,3,2000-12-31 22:32:48,False
3,1,3408,4,2000-12-31 22:04:35,True
4,1,2355,5,2001-01-06 23:38:11,True


** training set containing 200 ratings **

In [7]:
rating = user_rating[user_rating["UserID"].isin(range(200))]

** creating new dataframe of only favourable rows **

In [8]:
favourable_rating = rating[rating["Favourable"]]

In [9]:
favourable_rating.head()

Unnamed: 0,UserID,MovieID,Rating,DateTime,Favourable
0,1,1193,5,2000-12-31 22:12:40,True
3,1,3408,4,2000-12-31 22:04:35,True
4,1,2355,5,2001-01-06 23:38:11,True
6,1,1287,5,2000-12-31 22:33:59,True
7,1,2804,5,2000-12-31 22:11:59,True


** creating dictionary which contain UserID as key and movie rated by him as value. **

In [10]:
favourable_movie_by_users = dict((k, frozenset(v.values)) for k, v in favourable_rating.groupby('UserID')["MovieID"]) 

In [11]:
num_review_favourable_movie = rating[["MovieID", "Favourable"]].groupby("MovieID").sum()
num_review_favourable_movie["Favourable"] = num_review_favourable_movie["Favourable"].astype('int')
num_review_favourable_movie.sort_values("Favourable", ascending=False, inplace=True)

In [12]:
num_review_favourable_movie[:5]

Unnamed: 0_level_0,Favourable
MovieID,Unnamed: 1_level_1
2858,106
2028,85
1196,83
260,80
3578,77


** frequent_itemset is a dictionary used to store discovered itemset which satisfy min_support requirement. **

In [13]:
frequent_itemset = {}

In [14]:
min_support = 50

In [15]:
frequent_itemset[1] = dict((frozenset((movie_id,)),
                            row["Favourable"]) for movie_id, row in num_review_favourable_movie.iterrows() if row["Favourable"] > min_support)

In [16]:
frequent_itemset[1]

{frozenset({2858}): 106,
 frozenset({608}): 58,
 frozenset({1265}): 53,
 frozenset({527}): 62,
 frozenset({589}): 59,
 frozenset({2571}): 70,
 frozenset({260}): 80,
 frozenset({858}): 53,
 frozenset({2028}): 85,
 frozenset({1196}): 83,
 frozenset({1097}): 52,
 frozenset({318}): 68,
 frozenset({1198}): 75,
 frozenset({1580}): 51,
 frozenset({2762}): 71,
 frozenset({480}): 69,
 frozenset({593}): 63,
 frozenset({2396}): 59,
 frozenset({1197}): 58,
 frozenset({296}): 60,
 frozenset({110}): 59,
 frozenset({356}): 60,
 frozenset({1210}): 72,
 frozenset({1}): 53,
 frozenset({1270}): 52,
 frozenset({3578}): 77}

In [17]:
from collections import defaultdict

def find_frequent_itemsets(favourable_movie_by_user, k_1_itemset, min_support):
    count = defaultdict(int)
    for user, review in favourable_movie_by_users.items():
        for itemset in k_1_itemset:
            if itemset.issubset(review):
                for other_reviewed_movies in review - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movies,))
                    count[current_superset] += 1
    
    
    return dict((itemset, frequency) for itemset, frequency in count.items() if frequency >= min_support)                    

In [18]:
for k in range(2, 6):  ## range can be large as per your processing power.
    curr_frequent_itemset = find_frequent_itemsets(favourable_movie_by_users, frequent_itemset[k - 1], min_support)
    frequent_itemset[k] = curr_frequent_itemset
    
    if len(curr_frequent_itemset) == 0:
        print("Did not find any frequent itemset of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("Found {} freqent itemset of length {}".format(len(curr_frequent_itemset), k))
        sys.stdout.flush()
        
del frequent_itemset[1]

Found 268 freqent itemset of length 2
Found 1447 freqent itemset of length 3
Found 4938 freqent itemset of length 4
Found 11626 freqent itemset of length 5


In [19]:
candidate_rules = []  ## This list store all the rules that are discovered from the dataset using min_support.

In [20]:
for itemset_length, itemset_count in frequent_itemset.items():
    for itemset in itemset_count.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))

In [21]:
candidate_rules[:5]

[(frozenset({1198}), 296),
 (frozenset({296}), 1198),
 (frozenset({2028}), 480),
 (frozenset({480}), 2028),
 (frozenset({260}), 1097)]

In [22]:
correct_counts = defaultdict(int)  ## store all valid rules
incorrect_counts = defaultdict(int) ## store all invalid rules

In [23]:
for user, review in favourable_movie_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(review):
            if conclusion in review:
                correct_counts[(premise, conclusion)] += 1
            else:
                incorrect_counts[(premise, conclusion)] += 1

rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]) for candidate_rule in candidate_rules}

In [24]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

for i in range(5):
    print("Rule #{}".format(i+1))
    premise, conclusion = sorted_confidence[i][0]
    print("If People Recommend {0} They will also recommend {1}".format(premise, conclusion))
    print("- conclusion : {}".format(rule_confidence[(premise, conclusion)]))

Rule #1
If People Recommend frozenset({1097, 1198, 1270}) They will also recommend 1196
- conclusion : 1.0
Rule #2
If People Recommend frozenset({318, 110, 260, 1270}) They will also recommend 527
- conclusion : 1.0
Rule #3
If People Recommend frozenset({608, 1097, 2858, 1270}) They will also recommend 1196
- conclusion : 1.0
Rule #4
If People Recommend frozenset({1210, 1097, 2762, 2571}) They will also recommend 1198
- conclusion : 1.0
Rule #5
If People Recommend frozenset({1210, 2028, 260, 1198}) They will also recommend 1196
- conclusion : 1.0


In [25]:
movies_data = pd.read_csv('movies.dat', header=None, delimiter="::", encoding='mac-roman', names=["MovieID", "Title", "Genre"])
movies_data.head()

  if __name__ == '__main__':


Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [26]:
def get_Title(movie_id):
    title_object = movies_data[movies_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [27]:
for i in range(5):
    print("Rule #{}".format(i+1))
    premise, conclusion = sorted_confidence[i][0]
    premise_name = ', '.join(get_Title(idx) for idx in premise)
    conclusion_name = get_Title(conclusion)
    print("If People Recommend '{0}' They will also recommend '{1}'".format(premise_name, conclusion_name))
    print("- conclusion : {}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
If People Recommend 'E.T. the Extra-Terrestrial (1982), Raiders of the Lost Ark (1981), Back to the Future (1985)' They will also recommend 'Star Wars: Episode V - The Empire Strikes Back (1980)'
- conclusion : 1.0

Rule #2
If People Recommend 'Shawshank Redemption, The (1994), Braveheart (1995), Star Wars: Episode IV - A New Hope (1977), Back to the Future (1985)' They will also recommend 'Schindler's List (1993)'
- conclusion : 1.0

Rule #3
If People Recommend 'Fargo (1996), E.T. the Extra-Terrestrial (1982), American Beauty (1999), Back to the Future (1985)' They will also recommend 'Star Wars: Episode V - The Empire Strikes Back (1980)'
- conclusion : 1.0

Rule #4
If People Recommend 'Star Wars: Episode VI - Return of the Jedi (1983), E.T. the Extra-Terrestrial (1982), Sixth Sense, The (1999), Matrix, The (1999)' They will also recommend 'Raiders of the Lost Ark (1981)'
- conclusion : 1.0

Rule #5
If People Recommend 'Star Wars: Episode VI - Return of the Jedi (1983), Savin

In [28]:
## working on the test dataset

In [29]:
test_dataset = user_rating[-user_rating["UserID"].isin(range(2200))]

In [30]:
test_favourable = test_dataset[test_dataset['Favourable']]

In [31]:
test_favourable_by_user = dict((k, frozenset(v.values)) for k, v in test_favourable.groupby('UserID')["MovieID"])

In [32]:
correct_test_counts = defaultdict(int)
incorrect_test_counts = defaultdict(int)

for user, review in test_favourable_by_user.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(review):
            if conclusion in review:
                correct_test_counts[(premise, conclusion)] += 1
            else:
                incorrect_test_counts[(premise, conclusion)] += 1

test_confidence = {candidate_rule: correct_test_counts[candidate_rule] / float(correct_test_counts[candidate_rule] + incorrect_test_counts[candidate_rule]) for candidate_rule in candidate_rules}

In [33]:
sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)

for i in range(int(len(test_favourable)/100)):
    print("Rule #{}".format(i+1))
    premise, conclusion = sorted_test_confidence[i][0]
    premise_name = ', '.join(get_Title(idx) for idx in premise)
    conclusion_name = get_Title(conclusion)
    print("If People Recommend '{0}' They will also recommend '{1}'".format(premise_name, conclusion_name))
    print("- train conclusion : {}".format(rule_confidence[(premise, conclusion)]))
    print("- test conclusion : {}".format(test_confidence[(premise, conclusion)]))
    print("")

Rule #1
If People Recommend 'E.T. the Extra-Terrestrial (1982), Star Wars: Episode VI - Return of the Jedi (1983), Saving Private Ryan (1998), Raiders of the Lost Ark (1981)' They will also recommend 'Star Wars: Episode V - The Empire Strikes Back (1980)'
- train conclusion : 1.0
- test conclusion : 0.974169741697417

Rule #2
If People Recommend 'E.T. the Extra-Terrestrial (1982), Star Wars: Episode VI - Return of the Jedi (1983), Matrix, The (1999), Saving Private Ryan (1998)' They will also recommend 'Star Wars: Episode V - The Empire Strikes Back (1980)'
- train conclusion : 1.0
- test conclusion : 0.9741379310344828

Rule #3
If People Recommend 'Groundhog Day (1993), Star Wars: Episode VI - Return of the Jedi (1983), Matrix, The (1999), Schindler's List (1993)' They will also recommend 'Star Wars: Episode V - The Empire Strikes Back (1980)'
- train conclusion : 0.8666666666666667
- test conclusion : 0.9735449735449735

Rule #4
If People Recommend 'E.T. the Extra-Terrestrial (1982),

** We performed training on subset of our data to find assocoation rules and then test this rules on the test set. We could extend this concept to use cross-fold validation to better evaluate the rules. This would lead to more robust evaluation of quality of each rule. **