In [1]:
import pandas as pd
all_ratings = pd.read_csv("u.data",delimiter = "\t",header = None ,names = ["UserID","MovieID","Rating","Datetime"])
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'],unit='s')
all_ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [3]:
all_ratings["Favorable"] = all_ratings["Rating"] > 3
all_ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
0,196,242,3,1997-12-04 15:55:49,False
1,186,302,3,1998-04-04 19:22:22,False
2,22,377,1,1997-11-07 07:18:36,False
3,244,51,2,1997-11-27 05:02:03,False
4,166,346,1,1998-02-02 05:33:16,False


In [8]:
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]
favorable_ratings = ratings[ratings["Favorable"]]
print(favorable_ratings)

       UserID  MovieID  Rating            Datetime  Favorable
16        122      387       5 1997-11-11 17:47:39       True
20        119      392       4 1998-01-30 16:13:34       True
21        167      486       4 1998-04-16 14:54:12       True
26         38       95       5 1998-04-13 01:14:54       True
28         63      277       4 1997-10-01 23:10:01       True
...       ...      ...     ...                 ...        ...
99848       5      174       5 1997-09-30 16:15:30       True
99950     130       93       5 1997-09-22 18:41:05       True
99951     130      121       5 1997-10-07 18:59:06       True
99959     193      690       4 1998-03-05 18:40:21       True
99978     113      975       5 1997-10-04 03:40:24       True

[11043 rows x 5 columns]


In [19]:
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])

len(favorable_reviews_by_users)

199

In [23]:
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
50,100.0
100,89.0
258,83.0
181,79.0
174,74.0


In [32]:
frequent_itemsets = {}
min_support = 50
import sys
frequent_itemsets[1] = dict((frozenset((movie_id,)),row["Favorable"]) 
                            for movie_id, row in num_favorable_by_movie.iterrows() 
                            if row["Favorable"] > min_support)
from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets,min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [33]:
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users,
                                                    frequent_itemsets[k-1],
                                                    min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length{}".format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()
del frequent_itemsets[1]

I found 93 frequent itemsets of length2
I found 295 frequent itemsets of length3
I found 593 frequent itemsets of length4
I found 785 frequent itemsets of length5
I found 677 frequent itemsets of length6
I found 373 frequent itemsets of length7
I found 126 frequent itemsets of length8
I found 24 frequent itemsets of length9
I found 2 frequent itemsets of length10
Did not find any frequent itemsets of length 11


In [51]:
for index,values in frequent_itemsets.items():
    print(index)
    print(len(values))

2
93
3
295
4
593
5
785
6
677
7
373
8
126
9
24
10
2
11
0


In [52]:
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
print(candidate_rules[:5])

[(frozenset({7}), 1), (frozenset({1}), 7), (frozenset({50}), 1), (frozenset({1}), 50), (frozenset({1}), 56)]


In [53]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [55]:
rule_confidence = {candidate_rule: correct_counts[candidate_rule]/ float(correct_counts[candidate_rule] +
                    incorrect_counts[candidate_rule])
                    for candidate_rule in candidate_rules}

from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(),key=itemgetter(1), reverse=True)
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence:{0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

NameError: name 'rule_confidence' is not defined