In [1]:
# import pandas
import pandas as pd  

# from apyori import apriori
from apyori import apriori

In [2]:
# load data 
movie_data = pd.read_csv('https://raw.githubusercontent.com/pirandello/apriori/master/movie_dataset.csv', 
                         header = None)

* explore the dataset

In [3]:
print(len(movie_data))

7501


In [5]:
movie_data.shape

(7501, 20)

In [4]:
movie_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,The Revenant,13 Hours,Allied,Zootopia,Jigsaw,Achorman,Grinch,Fast and Furious,Ghostbusters,Wolverine,Mad Max,John Wick,La La Land,The Good Dunosaur,Ninja Turtles,The Good Dunosaur Bad Moms,2 Guns,Inside Out,Valerian,Spiderman 3
1,Beirut,Martian,Get Out,,,,,,,,,,,,,,,,,
2,Deadpool,,,,,,,,,,,,,,,,,,,
3,X-Men,Allied,,,,,,,,,,,,,,,,,,
4,Ninja Turtles,Moana,Ghost in the Shell,Ralph Breaks the Internet,John Wick,,,,,,,,,,,,,,,


* transform dataframe to list of lists (suitable format for apyori)

In [6]:
# outerlist of movies, each transation is an inner list
# 20 is number of columns
# outerlist  list = list of transactions (length of DF) 
# inner list = movies that are seen together up to 20
data = [] 
for i in range(0, len(movie_data)):
    data.append([str(movie_data.values[i,j]) for j in range(0,20)])

In [15]:
print(data[1][:])

['Beirut', 'Martian', 'Get Out', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan']


* instantiate apriori and set the input params based on following constraints:
    * we want only movies that are purchased at least 40 times
    * the minimum confidence for the rules is 20%
    * the minumum lift is 3

In [25]:
# 40 times / len(movie_data) to get percentage, min_length can be added to have at least 2 products
min_sup = 40/len(movie_data)
association_rules = apriori(data, min_support=min_sup, min_confidence=0.20, min_lift=3, min_length=2) # creates a generator/iterable

In [26]:
association_res = list(association_rules) # turned to list

* how many association rules did we obtained ?

In [27]:
len(association_res)

32

* print the first association rule

In [28]:
print(association_res[0])

RelationRecord(items=frozenset({'Green Lantern', 'Red Sparrow'}), support=0.005732568990801226, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Red Sparrow'}), items_add=frozenset({'Green Lantern'}), confidence=0.3006993006993007, lift=3.790832696715049)])


In [29]:
print(association_res[0][0])

frozenset({'Green Lantern', 'Red Sparrow'})


In [33]:
print(association_res[1][:7])

(frozenset({'Green Lantern', 'Star Wars'}), 0.005865884548726837, [OrderedStatistic(items_base=frozenset({'Star Wars'}), items_add=frozenset({'Green Lantern'}), confidence=0.3728813559322034, lift=4.700811850163794)])


In [35]:
print(association_res[0][2][0][2])

0.3006993006993007


In [47]:
association_res[-1]

RelationRecord(items=frozenset({'Moana', 'nan', 'Tomb Rider', 'Spiderman 3'}), support=0.007199040127982935, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Moana', 'Tomb Rider'}), items_add=frozenset({'nan', 'Spiderman 3'}), confidence=0.20300751879699247, lift=3.088761457396025), OrderedStatistic(items_base=frozenset({'Moana', 'nan', 'Tomb Rider'}), items_add=frozenset({'Spiderman 3'}), confidence=0.20300751879699247, lift=3.0825089038385434)])

In [48]:
print((value0, value1, value2, value3, value4))
# name of movie, movie2, support, confidence, lift
# note we are using pairs of movies set in min_length

('Moana', 'nan', '0.00719', '0.20300', '3.08876')


* convert association rules to DataFrame 
    * use these columns: title_1, title_2, support, confidence, lift

In [36]:
results=[] 
for res in association_res:
    # index of inner list - base item + add item
    pair = res[0] # gets the pairs
    items = [x for x in pair] # extracts as a list 

    value0 = str(items[0]) #takes the first 
    value1 = str(items[1]) # takes second item

    # second index of inner list
    value2 = str(res[1])[:7] # takes all 7 items 

    #third index of the list located at 0th
    #of the third index of the inner list

    value3 = str(res[2][0][2])[:7] 
    value4 = str(res[2][0][3])[:7]

    rows = (value0, value1, value2, value3, value4)
    results.append(rows)

labels = ['Title 1','Title 2','Support','Confidence','Lift']
movie_suggestion = pd.DataFrame.from_records(results, columns = labels)

In [46]:
print(value4)

3.08876


In [49]:
movie_suggestion.sort_values(by='Lift', ascending=False).head()

Unnamed: 0,Title 1,Title 2,Support,Confidence,Lift
7,Green Lantern,,0.00586,0.37288,4.70081
1,Green Lantern,Star Wars,0.00586,0.37288,4.70081
20,The Spy Who Dumped Me,,0.00799,0.27149,4.13077
4,The Spy Who Dumped Me,Spiderman 3,0.00799,0.27149,4.12241
29,Kung Fu Panda,,0.00639,0.39344,4.00435


In [50]:
movie_suggestion.loc[movie_suggestion['Title 1']=='Moana']

Unnamed: 0,Title 1,Title 2,Support,Confidence,Lift
19,Moana,Tomb Rider,0.00719,0.203,3.0825
31,Moana,,0.00719,0.203,3.08876
