## Dataset exploration
Before starting the search for a solution, it's good to take a look at the data and learn how to preprocess it.

In [2]:
# Download and unzip the dataset
# https://gist.github.com/hantoine/c4fc70b32c2d163f604a8dc2a050d5f6
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

def download_and_unzip(url, extract_to='.'):
    """Fetch a zip archive from ``url`` and extract it into ``extract_to``.

    The whole archive is read into memory before extraction, so this is meant
    for small downloads.

    Parameters
    ----------
    url : str
        Location of the zip archive; anything ``urlopen`` accepts.
    extract_to : str, optional
        Directory that receives the archive contents (default: current
        working directory).
    """
    # Close the response as soon as the payload has been read.
    with urlopen(url) as http_response:
        payload = BytesIO(http_response.read())
    # Close the archive once extraction is done instead of leaking it.
    with ZipFile(payload) as archive:
        archive.extractall(path=extract_to)

# Archive location kept in a named constant so it is easy to spot and change.
ML_100K_URL = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
# Extracts into the current working directory (the function's default target).
download_and_unzip(ML_100K_URL)

In [29]:
import pandas as pd

# Ratings table: tab-separated with no header row, so columns get integer labels.
# Later cells read column 0 as the user id, column 1 as the item id and
# column 2 as the rating.
data = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
)
data.head()

Unnamed: 0,0,1,2,3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [31]:
# Genre lookup table: pipe-separated, no header row.
genres = pd.read_csv(
    'ml-100k/u.genre',
    sep='|',
    header=None,
)
genres.head()

Unnamed: 0,0,1
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [27]:
# User table: pipe-separated, no header row.
# Later cells read column 0 as the user id, 1 as age, 2 as gender,
# 3 as occupation and 4 as the zip code.
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    header=None,
)
users.head()

Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [28]:
# Movie table: pipe-separated, no header row.
# 'ANSI' is only a valid codec name on Windows (an alias of 'mbcs'); elsewhere
# it raises LookupError. 'latin-1' works on every platform and can decode any
# byte sequence -- NOTE(review): the file appears to be ISO-8859-1; confirm
# against the dataset README.
# Column 3 is dropped right after loading (presumably the video release date,
# which carries no information here -- TODO confirm).
items = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    header=None,
    encoding='latin-1',
).drop(columns=[3])
items.head()

Unnamed: 0,0,1,2,4,5,6,7,8,9,10,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [66]:
# One occupation name per row; the row label doubles as its integer code.
occupations = pd.read_csv(
    'ml-100k/u.occupation',
    sep='|',
    header=None,
)
# Invert the (row label -> name) mapping into a (name -> integer code) lookup.
o_dict = {name: code for code, name in occupations[0].items()}
o_dict

{'administrator': 0,
 'artist': 1,
 'doctor': 2,
 'educator': 3,
 'engineer': 4,
 'entertainment': 5,
 'executive': 6,
 'healthcare': 7,
 'homemaker': 8,
 'lawyer': 9,
 'librarian': 10,
 'marketing': 11,
 'none': 12,
 'other': 13,
 'programmer': 14,
 'retired': 15,
 'salesman': 16,
 'scientist': 17,
 'student': 18,
 'technician': 19,
 'writer': 20}

The dataset is split across multiple tables, much like a relational (SQL) database. Regardless, there are two approaches that can be taken to solve this problem: Classical and GraphML.

Classical - Encode demographics data and given movie as X and rating as y. Train the model to predict y given X.

GraphML - Build a bipartite graph of movies and users and use pytorch-geometric to do the prediction.

In [80]:
def vectorize(user, movie, rating):
    """Flatten one (user, movie, rating) triple into a single flat list.

    Layout of the returned list:
      * ``user[1]`` unchanged (read as the age elsewhere in this notebook),
      * 1 if ``user[2] == 'M'`` else 0,
      * a one-hot vector of length ``len(o_dict)`` marking ``user[3]``,
      * every entry of ``movie`` except labels 0, 1, 2 and 4,
      * ``rating`` as the last element.

    Relies on the notebook-level ``o_dict`` lookup (occupation name -> index).
    """
    # One-hot encode the occupation.
    occupation_onehot = [0] * len(o_dict)
    hot_position = o_dict[user[3]]
    occupation_onehot[hot_position] = 1

    age = user[1]
    is_male = int(user[2] == 'M')
    # Drop the non-numeric movie fields; what is left goes in as-is.
    movie_flags = movie.drop([0, 1, 2, 4])

    return [age, is_male, *occupation_onehot, *movie_flags, rating]

In [142]:
from tqdm import tqdm

# Build the flat "classical" table: one feature row per rating.
d = {}
n_ratings = len(data)
for i, row in tqdm(data.iterrows(), total=n_ratings):
    # Ids in the ratings table start at 1, so id - 1 is used as the row
    # position in `users` / `items`.
    d[i] = vectorize(
        users.iloc[row[0] - 1],
        items.iloc[row[1] - 1],
        row[2],
    )

classical = pd.DataFrame.from_dict(d, orient='index')
classical.head()

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [02:30<00:00, 665.42it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,49,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,39,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,3
2,25,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,28,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,2
4,47,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


This approach leads to major data loss, as zip codes cannot be vectorized easily. However, I doubt that there is much correlation between a postal code and cinematic preferences.

In [125]:
# Re-encode the users table for the graph: one row per user, keyed 'u<id>'.
d = {}
for i, user in tqdm(users.iterrows(), total=len(users)):
    # Deliberately not named `id` / `zip`: at notebook scope those names would
    # replace the builtins for every cell executed afterwards.
    user_key = 'u' + str(user[0])
    age = user[1]
    gender = int(user[2] == 'M')    # 1 for 'M', 0 otherwise
    occupation = o_dict[user[3]]    # integer code from the occupation lookup
    zipcode = user[4]               # kept exactly as read from the file
    d[user_key] = [age, gender, occupation, zipcode]

users_mod = pd.DataFrame.from_dict(d, orient='index')

# Re-encode the items table for the graph: one row per movie, keyed 'i<id>'.
d = {}
for i, item in tqdm(items.iterrows(), total=len(items)):
    # Deliberately not named `id`: at notebook scope that name would replace
    # the builtin for every cell executed afterwards.
    item_key = 'i' + str(item[0])
    name = item[1]
    date = item[2]
    # Everything except labels 0, 1, 2 and 4 is carried over unchanged.
    tags = item.drop([0, 1, 2, 4])
    d[item_key] = [name, date] + list(tags)

items_mod = pd.DataFrame.from_dict(d, orient='index')

# Rewrite each rating as (user key, item key, rating). The 'u' / 'i' prefixes
# match the index labels used by users_mod and items_mod.
d = {}
for i, row in tqdm(data.iterrows(), total=len(data)):
    d[i] = [f'u{row[0]}', f'i{row[1]}', row[2]]

data_mod = pd.DataFrame.from_dict(d, orient='index')

# Show the three re-encoded tables one after another.
for table in (users_mod, items_mod, data_mod):
    display(table)

100%|█████████████████████████████████████████████████████████████████████████████| 943/943 [00:00<00:00, 10405.35it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1682/1682 [00:00<00:00, 1761.55it/s]
100%|███████████████████████████████████████████████████████████████████████| 100000/100000 [00:07<00:00, 13280.10it/s]


Unnamed: 0,0,1,2,3
u1,24,1,19,85711
u2,53,0,13,94043
u3,23,1,20,32067
u4,24,1,19,43537
u5,33,0,13,15213
...,...,...,...,...
u939,26,0,18,33319
u940,32,1,0,02215
u941,20,1,18,97229
u942,48,0,10,78209


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
i1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
i2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
i3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
i4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
i5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
i1678,Mat' i syn (1997),06-Feb-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
i1679,B. Monkey (1998),06-Feb-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
i1680,Sliding Doors (1998),01-Jan-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
i1681,You So Crazy (1994),01-Jan-1994,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,0,1,2
0,u196,i242,3
1,u186,i302,3
2,u22,i377,1
3,u244,i51,2
4,u166,i346,1
...,...,...,...
99995,u880,i476,3
99996,u716,i204,5
99997,u276,i1090,1
99998,u13,i225,2


This method preserves more data and allows us to learn from the graph structure using GraphML.

Now it is time to save the generated data to make it available for future use.

In [139]:
import networkx as nx
from networkx.algorithms import bipartite

# Each rating becomes an edge from column 0 (user key) to column 1 (item key);
# column 2 (the rating) is stored on the edge under the attribute name 2.
G = nx.from_pandas_edgelist(data_mod, 0, 1, 2)

# Without a `name`, set_node_attributes expects {node: {attribute: value}}.
# DataFrame.to_dict() defaults to {column: {row: value}}, whose top-level keys
# (0, 1, 2, ...) are not nodes of G, so they are skipped silently and no
# attributes get attached. orient='index' yields the per-node layout.
nx.set_node_attributes(G, items_mod.to_dict(orient='index'))
nx.set_node_attributes(G, users_mod.to_dict(orient='index'))

In [143]:
import pickle
# Persist both representations for later notebooks.
classical.to_csv('classical.csv')
# Use a context manager so the file is flushed and closed deterministically.
with open('graph.pickle', 'wb') as graph_file:
    pickle.dump(G, graph_file)