In [1]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
import re

In [2]:
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
movies_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"
ratings = pd.read_csv(url, sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Load movie metadata
movies = pd.read_csv(movies_url, sep='|', encoding='latin-1', header=None, usecols=[0, 1])
movies.columns = ['movie_id', 'title']


In [3]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [4]:
data = pd.merge(movies, ratings, on='movie_id', how='inner')

In [5]:
data

Unnamed: 0,movie_id,title,user_id,rating,timestamp
0,1,Toy Story (1995),308,4,887736532
1,1,Toy Story (1995),287,5,875334088
2,1,Toy Story (1995),148,4,877019411
3,1,Toy Story (1995),280,4,891700426
4,1,Toy Story (1995),66,3,883601324
...,...,...,...,...,...
99995,1678,Mat' i syn (1997),863,1,889289570
99996,1679,B. Monkey (1998),863,3,889289491
99997,1680,Sliding Doors (1998),863,2,889289570
99998,1681,You So Crazy (1994),896,3,887160722


In [6]:
data = data[data['rating'] >=3]

In [7]:
len(data)

82520

In [8]:
data = data[['user_id', 'title']]

In [9]:
def clean_data(row):
    if row == None:
        return None

    row = re.findall(r"[a-zA-Z\s]", row)
    row = "".join(row)
    row = row.strip()
    row = row.lower()
    return row


data['title'] = data['title'].apply(clean_data)

data['title'][0]

'toy story'

In [10]:
data.head()

Unnamed: 0,user_id,title
0,308,toy story
1,287,toy story
2,148,toy story
3,280,toy story
4,66,toy story


In [11]:
data[data['user_id'] == 1]

Unnamed: 0,user_id,title
129,1,toy story
483,1,goldeneye
657,1,four rooms
807,1,get shorty
927,1,copycat
...,...,...
40715,1,unknown
40799,1,chasing amy
41061,1,full monty the
41321,1,gattaca


In [12]:
data_agg = data.groupby(['user_id'])['title'].unique().agg(list).reset_index()

  data_agg = data.groupby(['user_id'])['title'].unique().agg(list).reset_index()


In [13]:
data_agg 

Unnamed: 0,user_id,title
0,1,"[toy story, goldeneye, four rooms, get shorty,..."
1,2,"[toy story, mighty aphrodite, postino il, anto..."
2,3,"[return of the jedi, event horizon, chasing am..."
3,4,"[seven seen, star wars, indiana jones and the ..."
4,5,"[toy story, goldeneye, from dusk till dawn, mu..."
...,...,...
938,939,"[dead man walking, mr hollands opus, diaboliqu..."
939,940,"[twelve monkeys, babe, dead man walking, usual..."
940,941,"[toy story, twelve monkeys, mr hollands opus, ..."
941,942,"[crimson tide, star wars, lion king the, fugit..."


## Model

In [14]:
te = TransactionEncoder()

In [15]:
te_encode = te.fit(data_agg['title']).transform(data_agg['title'])

In [16]:
enc_data = pd.DataFrame(te_encode, columns=te.columns_)

In [17]:
enc_data

Unnamed: 0,Unnamed: 1,a chef in love,a space odyssey,above the rim,absolute power,abyss the,ace ventura pet detective,ace ventura when nature calls,across the sea of time,addams family values,...,world of apu the apur sansar,wrong trousers the,wyatt earp,year of the horse,you so crazy,young frankenstein,young guns,young guns ii,young poisoners handbook the,zeus and roxanne
0,False,False,True,False,False,True,True,False,False,False,...,False,True,False,False,False,True,True,False,False,False
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
939,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
940,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
941,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
model = apriori(enc_data, min_support=0.20, use_colnames=True)

In [19]:
result_df = association_rules(model)[['antecedents', 'consequents', 'confidence']]

In [20]:
result_df['antecedents'] = result_df['antecedents'].apply(lambda x: ', '.join(map(str, x)))
result_df['consequents'] = result_df['consequents'].apply(lambda x: ', '.join(map(str, x)))

In [21]:
result_df

Unnamed: 0,antecedents,consequents,confidence
0,a space odyssey,raiders of the lost ark,0.841880
1,a space odyssey,star wars,0.846154
2,aliens,alien,0.820611
3,alien,raiders of the lost ark,0.840426
4,alien,silence of the lambs the,0.804965
...,...,...,...
939,"terminator the, raiders of the lost ark, retur...","empire strikes back the, star wars",0.920561
940,"star wars, terminator the, return of the jedi","empire strikes back the, raiders of the lost ark",0.875556
941,"terminator the, raiders of the lost ark, star ...","empire strikes back the, return of the jedi",0.841880
942,"terminator the, empire strikes back the","star wars, raiders of the lost ark, return of ...",0.856522


In [26]:
user = "aliens"

for i in range(len(result_df)):
    if user == result_df['antecedents'][i]:
        print(result_df['consequents'][i])
    else:
        continue

alien
raiders of the lost ark
silence of the lambs the
star wars
raiders of the lost ark, star wars


In [27]:
result_df.to_csv('movies_recommendation.csv')
