In [77]:
import random
import pprint
from collections import defaultdict

# Load the dataset

In [78]:
with open("./datasets/ml-latest-small/movies.csv") as f:
    # Each line is of form: <movieId>,<title>,<genres>
    movies = {}
    for line in f:
        split_line = line.strip().split(",")
        title = split_line[1]
        tags = set(split_line[2].split("|"))
        
        # Some movie titles have a comma in them :(
        if len(split_line) == 4:
            for i in range(2,len(split_line)):
                title += "," + split_line[i]
            tags = set(split_line[-1].split("|"))
        
        movies[split_line[0]] = {
            "name": title,
            "tags": tags,
        }
    # first line in file
    del movies["movieId"]

In [79]:
with open("./datasets/ml-latest-small/tags.csv") as f:
    # Each line is of form: <userId>,<movieId>,<tag>,<timestamp>
    # We will only use <movieId>,<tag>
    for line in f:
        split_line = line.strip().split(",")
        if split_line[0] == "userId":
            continue
        movies[split_line[1]]["tags"].add(split_line[2])

For the ratings datset, we have to convert it to appear similar to rabble data. In this case we need to have input similar to rabble 'Likes'. To get this we:
- Assume all ratings above `split` (scale 0.5 -> 5) are positive
- `split` is the mean rating in the dataset

In [80]:
with open("./datasets/ml-latest-small/ratings.csv") as f:
    # Each line is of form: <userId>,<movieId>,<rating>,<timestamp>
    # We will only use <userId>,<movieId>,<rating>
    users = {}
    sum_ratings = 0
    amount_ratings = 0
    lines = f.readlines()
    for line in lines:
        split_line = line.strip().split(",")
        if split_line[0] == "userId":
            continue
        sum_ratings += float(split_line[2])
        amount_ratings += 1
    split = str(sum_ratings / amount_ratings)
    for line in lines:
        split_line = line.strip().split(",")
        if split_line[0] == "userId":
            continue
        
        if split_line[0] not in users:
            users[split_line[0]] = {
                "pos": [],
                "neg": [],
            }
        
        # We will convert ratings to binary (like/dislike) as that is the ratings used in rabble
        # Assume all ratings above 'split' (scale 0.5 -> 5) are positive
        # 'split' is the mean (3.501 for this dataset) rating
        if split_line[2] >= split:
            users[split_line[0]]["pos"].append(split_line[1])
        else:
            users[split_line[0]]["neg"].append(split_line[1])    

# Seperate training and test data
- Each user has rated at least 20 movies. However as rabble only takes positive input we will only use the positive scores to train the user model
- So we will train with roughly 3/4 of the positive ratings and test with the 1/4 positive and all the negative ratings

In [81]:
for u in users.keys():
    cutoff = (len(users[u]["pos"]) * 3) // 4
    shuffled_pos_ratings = users[u]["pos"]
    random.shuffle(shuffled_pos_ratings)
    users[u]["train"] = shuffled_pos_ratings[:cutoff]
    users[u]["test"] = shuffled_pos_ratings[cutoff:]

# Create User Models

Create simple user model by adding any tags related to liked movies to a 'model' dictionary. Every time that tag is seen, the counter related to it is increased. 

In [82]:
for u in users.keys():
    users[u]["model"] = defaultdict(int)
    for movieId in users[u]["train"]:
        for tag in movies[movieId]["tags"]:
            users[u]["model"][tag] += 1

In [83]:
# A quick eyeball check of a model.
pp = pprint.PrettyPrinter(indent=4)
random_user_id = str(random.randrange(len(users)))
print(random_user_id, end=" ")
pp.pprint(users[random_user_id]["model"])

51 defaultdict(<class 'int'>,
            {   ' Lies': 1,
                ' the Bad and the Ugly': 1,
                '1950s': 1,
                '1960s': 1,
                'Action': 29,
                'Adventure': 31,
                'Al Pacino': 1,
                'Animation': 6,
                'Arnold Schwarzenegger': 1,
                'Arthur C. Clarke': 1,
                'Australia': 2,
                'Brittany Murphy': 1,
                'Capone': 1,
                'Children': 10,
                'Christina Ricci': 1,
                'Christmas': 1,
                'Christopher Lloyd': 1,
                'Comedy': 68,
                'Crime': 19,
                'Disney': 2,
                'Documentary': 2,
                'Drama': 63,
                'Dull': 1,
                'EPIC': 1,
                'England': 1,
                'Family': 1,
                'Fantasy': 19,
                'Film-Noir': 1,
                'George Lucas': 1,
                'Hal': 1,
   

# Similarity function

For simple method, calculate similarity based on sum of occurrence of all tags related to movie in the user model.

In [84]:
def similarity(user_model, movie_tags):
    total = 0
    for tag in movie_tags:
        total += user_model[tag]
    return total

# Evaluate

Evaluating based on chapter `3.2.2 Measuring Usage Prediction` in [Recommender Systems Handbook](https://link.springer.com/chapter/10.1007/978-0-387-85820-3_8). Can be gotten [here](http://scholar.google.com/scholar_url?url=http://citeseerx.ist.psu.edu/viewdoc/download%3Fdoi%3D10.1.1.712.4138%26rep%3Drep1%26type%3Dpdf&hl=en&sa=X&scisig=AAGBfm1BpCg0RTunNpmouOfrHuNPul-3NQ&nossl=1&oi=scholarr).

Evaluation metric chosen is AUC, as in @iandioch's research.

In [85]:
aucs = []
n1s = []
n2s = []
n3s = []
ns = []

# Column headings.
print('\t\tn1   \tn2   \tn3   \tAUC')

t = 0
for u in users.keys():
    # Code based on @iandioch's common neighbours research
    model = users[u]["model"]

    n1 = 0 # missing_pos > neg
    n2 = 0 # missing_pos = neg
    n3 = 0 # missing_pos < neg
    n = 0 # total link comparisons
    for missing_pos_id in users[u]["test"]:
        a_score = similarity(model, movies[missing_pos_id]["tags"])
        for neg_id in users[u]["neg"]:
            b_score = similarity(model, movies[neg_id]["tags"])
            if abs(a_score-b_score) < 0.0005:
                n2 += 1
            elif a_score > b_score:
                n1 += 1
            else:
                n3 += 1
            n += 1
    # Check if there were any comparisons.
    # If not ignore this user model as they have no dislikes/ likes
    if n > 0:
        auc = (n1 + 0.5*n2)/(n)
        aucs.append(auc)
        n1s.append(n1)
        n2s.append(n2)
        n3s.append(n3)
        ns.append(n)
    
    t += 1
    # Eye ball some per user examples
    if t <= 10:
        print('UserID {:<2}:\t{:<5}\t{:<5}\t{:<5}\t{:<.6f}'.format(u, n1, n2, n3, auc))

def avg(seq):
    return sum(seq)/len(seq)

print('Average:\t{:<5}\t{:<5}\t{:<5}\t{:<.6f}'.format(int(round(avg(n1s))), int(round(avg(n2s))), int(round(avg(n3s))), avg(aucs)))

		n1   	n2   	n3   	AUC
UserID 1 :	816  	16   	768  	0.515000
UserID 2 :	25   	1    	24   	0.510000
UserID 3 :	73   	0    	19   	0.793478
UserID 4 :	1321 	114  	1381 	0.489347
UserID 5 :	64   	9    	53   	0.543651
UserID 6 :	3723 	214  	2255 	0.618540
UserID 7 :	684  	32   	760  	0.474255
UserID 8 :	73   	11   	66   	0.523333
UserID 9 :	67   	6    	71   	0.486111
UserID 10:	448  	48   	674  	0.403419
Average:	2744 	91   	2310 	0.558886
