In [1]:
import pandas as pd


In [2]:
data_all = []
for i in range(4):
    data = pd.read_csv('data/combined_data_' + str(i+1)+'.txt', header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1])
    data_all.append(data)

In [3]:
df = pd.concat(data_all)

In [4]:
import numpy as np
df.index = np.arange(0,len(df))
print('Full dataset shape: {}'.format(df.shape))
print('-Dataset examples-')
print(df.iloc[::5000000, :])

Full dataset shape: (100498277, 2)
-Dataset examples-
           Cust_Id  Rating
0               1:     NaN
5000000    2560324     4.0
10000000   2271935     2.0
15000000   1921803     2.0
20000000   1933327     3.0
25000000   1465002     3.0
30000000    961023     4.0
35000000   1372532     5.0
40000000    854274     5.0
45000000    116334     3.0
50000000    768483     3.0
55000000   1331144     5.0
60000000   1609324     2.0
65000000   1699240     3.0
70000000   1776418     4.0
75000000   1643826     5.0
80000000    932047     4.0
85000000   2292868     4.0
90000000    932191     4.0
95000000   1815101     3.0
100000000   872339     4.0


In [5]:
df.shape

(100498277, 2)

In [6]:
import numpy as np
df_nan = pd.DataFrame(pd.isnull(df.Rating))
df_nan = df_nan[df_nan['Rating'] == True]
df_nan = df_nan.reset_index()

movie_np = []
movie_id = 1

for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]):
    # numpy approach
    temp = np.full((1,np.abs(i-j-1)), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1

# Account for last record and corresponding length
# numpy approach
last_record = np.full((1,len(df) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)

In [7]:
df = df[pd.notnull(df['Rating'])]

df['Movie_Id'] = movie_np.astype(int)
df['Cust_Id'] = df['Cust_Id'].astype(int)
print('-Dataset examples-')
print(df.sample(3))

-Dataset examples-
          Cust_Id  Rating  Movie_Id
7444526   2102481     1.0      1481
85054287  2225308     2.0     15124
73162795   578248     5.0     13255


In [9]:
df.to_csv('data/data_prep.csv', sep=';', index = None)

In [11]:
import pandas as pd
df = pd.read_csv('data/data_prep.csv', sep=';')

In [12]:
df.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
0,1488844,3.0,1
1,822109,5.0,1
2,885013,4.0,1
3,30878,4.0,1
4,823519,3.0,1


In [13]:
good = df[df['Rating']==5].groupby('Cust_Id')['Movie_Id'].apply(lambda r: ' '.join([str(A) for A in r]))

In [14]:
good.head()

Cust_Id
6     175 457 886 1467 2372 2452 2782 3290 4043 4633...
7     8 30 83 175 257 283 285 313 357 457 458 468 50...
8     1202 1799 1905 2186 3610 3925 4306 5054 5317 5...
10    473 985 1542 1905 2172 3124 3371 3962 4043 430...
25                4432 6786 7605 9326 10643 15107 15270
Name: Movie_Id, dtype: object

In [17]:
import apyori

In [18]:
association_rules = apyori.apriori(good.apply(lambda r: r.split(' ')), 
                                   min_support=0.04, 
                                   min_confidence=0.1, min_lift=2, 
                                   min_length=2)

In [19]:
association_rules

<generator object apriori at 0x0000025EC2645150>

In [20]:
asr_df = pd.DataFrame(columns = ['from', 'to', 'confidence', 'support', 'lift'])
for item in association_rules:
    pair = item[0] 
    items = [x for x in pair]
    asr_df.loc[len(asr_df), :] =  ' '.join(list(item[2][0][0])), \
                                  ' '.join(list(item[2][0][1])),\
                                  item[2][0][2], item[1], item[2][0][3]

In [21]:
asr_df

Unnamed: 0,from,to,confidence,support,lift
0,10042,10820,0.266215,0.041573,4.110222
1,10042,10947,0.351478,0.054888,2.421943
2,10042,11064,0.348135,0.054366,2.300702
3,10042,11089,0.331616,0.051786,2.488266
4,10042,11283,0.454613,0.070994,2.206234
...,...,...,...,...,...
2018,11521,7230 16265 2452 7057 5582 14961,0.199233,0.041485,4.630662
2019,11521,7230 2452 9628 7057 5582 14961,0.195898,0.04079,4.63469
2020,14240,7230 16265 2452 7057 5582 14961,0.208372,0.041711,4.843073
2021,14240,7230 2452 9628 7057 5582 14961,0.204569,0.04095,4.839831


In [22]:
asr_df.sort_values('lift').tail(10)

Unnamed: 0,from,to,confidence,support,lift
1881,14961,7230 7057 16265 2452,0.375413,0.046806,7.368337
2000,14961,7230 9628 7057 5582 16265,0.330554,0.041213,7.380208
1999,14961,7230 2452 7057 5582 16265,0.345086,0.043025,7.395181
325,14454,457,0.631952,0.060757,7.406606
1187,7057,7230 9628,0.449937,0.056266,7.434794
143,11443,17627,0.734164,0.065722,7.958336
768,11443,17627 2452,0.463653,0.041506,8.6031
764,11443,17627 11521,0.465292,0.041653,8.756512
766,11443,17627 12338,0.490663,0.043924,9.478307
313,14302,16147,0.741323,0.040359,12.719342


In [27]:
titles = pd.read_csv('data/movie_titles.csv', encoding = "ISO-8859-1", 
                     header = None, 
                     on_bad_lines='skip',
                     names = ['Movie_Id', 'Year', 'Name'])

In [31]:
def get_rule_title(rule):
    print(titles[titles.Movie_Id.isin(rule['from'].split(' '))]['Name'].values)
    print('----------->')
    print(titles[titles.Movie_Id == int(rule['to'])]['Name'].values)

In [None]:
get_rule_title(asr_df.loc[2022])

In [None]:
get_rule_title(asr_df.sample(1).iloc[0])

In [38]:
good.iloc[14]

'334 1145 1289 1542 2152 2153 2212 2290 3079 3098 3418 4123 4302 4545 4577 4640 5056 5069 5181 5807 5875 6386 6398 7617 9617 9728 10158 10359 11283 13115 13702 14103 14667 15062 15540 15563 16008 16668 17323'

In [39]:
j = 159992



In [40]:
titles[titles.Movie_Id.isin(good.iloc[j].split(' '))]['Name']

Series([], Name: Name, dtype: object)

In [41]:
def print_rule_title(rule):
    return (titles[titles.Movie_Id == int(rule['to'])]['Name'].values)
    

result = []
for A in asr_df.index:
    if len(set(good.iloc[j].split(' ')) & set(asr_df['from'].loc[A].split(' '))) == len(asr_df['from'].loc[A].split(' ')):
        result.append(print_rule_title(asr_df.loc[A])[0])
print(set(result))

{'Dirty Dancing', 'Pretty Woman'}
