In [1]:
# data from http://grouplens.org/datasets/movielens/

In [3]:
import pandas as pd

In [4]:
all_ratings = pd.read_csv("u.data", delimiter="\t", header=None, names = ["UserID", "MovieID", "Rating", "Datetime"]) 
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'],unit='s')
# transform the time format from string to pandas timestamp
all_ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [5]:
# As you can see, there are no review for most movies, such as #213
all_ratings[all_ratings["UserID"] == 675].sort_values("MovieID") 

# the sort command in pandas is [sort_values('')] 

Unnamed: 0,UserID,MovieID,Rating,Datetime
81098,675,86,4,1998-03-10 00:26:14
90696,675,223,1,1998-03-10 00:35:51
92650,675,235,1,1998-03-10 00:35:51
95459,675,242,4,1998-03-10 00:08:42
82845,675,244,3,1998-03-10 00:29:35
53293,675,258,3,1998-03-10 00:11:19
97286,675,269,5,1998-03-10 00:08:07
93720,675,272,3,1998-03-10 00:07:11
73389,675,286,4,1998-03-10 00:07:11
77524,675,303,5,1998-03-10 00:08:42


In [6]:
# Not all reviews are favourable! Our goal is "other recommended books", so we only want favourable reviews
all_ratings["Favorable"] = all_ratings["Rating"] > 3
all_ratings[3:8]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
3,244,51,2,1997-11-27 05:02:03,False
4,166,346,1,1998-02-02 05:33:16,False
5,298,474,4,1998-01-07 14:20:06,True
6,115,265,2,1997-12-03 17:51:28,False
7,253,465,5,1998-04-03 18:34:27,True


In [7]:
all_ratings[all_ratings["UserID"] == 1][:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
202,1,61,4,1997-11-03 07:33:40,True
305,1,189,3,1998-03-01 06:15:28,False
333,1,33,4,1997-11-03 07:38:19,True
334,1,160,4,1997-09-24 03:42:27,True
478,1,20,4,1998-02-14 04:51:23,True


In [11]:
# Sample the dataset. You can try increasing the size of the sample, but the run time will be considerably longer
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]  
#  isin(value): Return a boolean Series. If UserID is in [0-199], return True 
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
0,196,242,3,1997-12-04 15:55:49,False
1,186,302,3,1998-04-04 19:22:22,False
2,22,377,1,1997-11-07 07:18:36,False
4,166,346,1,1998-02-02 05:33:16,False
6,115,265,2,1997-12-03 17:51:28,False


In [12]:
# We start by creating a dataset of each user's favourable reviews
favorable_ratings = ratings[ratings["Favorable"]]
favorable_ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
16,122,387,5,1997-11-11 17:47:39,True
20,119,392,4,1998-01-30 16:13:34,True
21,167,486,4,1998-04-16 14:54:12,True
26,38,95,5,1998-04-13 01:14:54,True
28,63,277,4,1997-10-01 23:10:01,True


In [13]:
# We are only interested in the reviewers who have more than one review
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])

# 把相同UserID 的 ["MovieID"] 做聚合, 以 frozenset(v.values)把["MovieID"]內容變集合

In [14]:
# Find out how many movies have favourable ratings
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
# 以同樣 "MovieID" 為基準,將 "Favorable" 的值 的相加 (True=1, False=0)

num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
50,100.0
100,89.0
258,83.0
181,79.0
174,74.0


In [15]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    '''
    input:      
    favorable_reviews_by_users:  dict, {userID: 評價超過3的電影ID}
    k_1_itemsets: dict, {k項目的電影ID組合 : 出現次數(support)}
    min_support : float, min support
    
    output:
    a dict:  {超過最小支持度的項目(k+1項) : 出現次數(support)}
    '''
    
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        # ruser: userID,  reviews: movie reviews of this user 
        
        for itemset in k_1_itemsets:
            # itemset: 超過 min support 的電影ID組合 EX: frozenset({313}
            
            if itemset.issubset(reviews): # 如果 itemset 是 reviews 的子集合，輸出為 True
                for other_reviewed_movie in reviews - itemset: 
                    # other_reviewed_movie: reviews 與 itemset 的差集
                    #print(other_reviewed_movie)
                    
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    #print(current_superset)
                    
                    counts[current_superset] += 1 #算current_superset出現次數
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [31]:
import sys

frequent_itemsets = {}  # itemsets are sorted by length
min_support = 50

# k=1 candidates are the isbns with more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                                for movie_id, row in num_favorable_by_movie.iterrows()
                                if row["Favorable"] > min_support)

#宣告一個字典: {length=1:  {MovieID: 支持度} }
# DataFrame.iterrows(): Iterate over DataFrame rows as (index, Series) pairs.
# num_favorable_by_movie is the frequency of like of one movie 


#print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))

sys.stdout.flush() 
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    # Call find_frequent_itemsets function: return a dict:  {超過最小支持度的項目(k+1項)，出現次數}
    
    if len(cur_frequent_itemsets) == 0:
        #print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        #print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]

In [12]:
print("Found a total of {0} frequent itemsets".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))

Found a total of 2968 frequent itemsets


In [13]:
# Now we create the association rules. First, they are candidates until the confidence has been tested
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    # itemset_length: 1,2,3....
    # itemset_counts: a dict:  {MovieID set: 支持度}
    
    for itemset in itemset_counts.keys():
        # itemset: MovieID set
        
        for conclusion in itemset:
            #conclusion: each MovieID in one MovieID set
            premise = itemset - set((conclusion,)) 
            candidate_rules.append((premise, conclusion))
            # n 個元素可以產生n個規則， EX: (1,2,3) =>{(1,2),3}、 {(3,1),2}、{(2,3),1}
print("There are {} candidate rules".format(len(candidate_rules)))

There are 15285 candidate rules


In [35]:
print(candidate_rules[:5])

[(frozenset({7}), 1), (frozenset({1}), 7), (frozenset({50}), 1), (frozenset({1}), 50), (frozenset({1}), 56)]


In [None]:
# Now, we compute the confidence of each of these rules. This is very similar to what we did in chapter 1
correct_counts = defaultdict(int) 
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    # reviews: 每一個 user 喜歡的MovieID
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
                # 如果 premise 出現 屬於 reviews & conclusion 也屬於 reviews, correct_counts[candidate_rule]累計加1
            else:
                incorrect_counts[candidate_rule] += 1
                # 如果 premise 出現 屬於 reviews 但是 conclusion 不屬於 reviews, incorrect_counts[candidate_rule]累計 + 1
                
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
              for candidate_rule in candidate_rules}

In [None]:
# Choose only rules above a minimum confidence level
min_confidence = 0.9

In [None]:
# Filter out the rules with poor confidence
rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}
print(len(rule_confidence))

In [None]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

In [None]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

In [None]:
# Even better, we can get the movie titles themselves from the dataset
movie_name_data = pd.read_csv("u.item", delimiter="|", header=None, encoding = "mac-roman")
movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release", "IMDB", "<UNK>", "Action", "Adventure",
                           "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
                           "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

In [None]:
def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [None]:
get_movie_name(4)

In [None]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

In [1]:
a = tuple([1,2])

In [2]:
b = set([1,2,3])

In [4]:
set(a).issubset(b)

True