In [None]:
import csv
import math
import pprint
import random
from collections import defaultdict

# Load the dataset

In [None]:
# record tag frequency across the dataset
movie_tag_freq = defaultdict(set)
user_tag_freq = defaultdict(set)

In [None]:
with open("./datasets/ml-latest-small/movies.csv") as f:
    # Each line is of form: <movieId>,<title>,<genres>
    movies = {}
    for line in f:
        split_line = line.strip().split(",")
        title = split_line[1]
        tags = defaultdict(int)
        tag_list = split_line[2].split("|")
        
        # Some movie titles have a comma in them :(
        if len(split_line) >= 4:
            for i in range(2,len(split_line) - 1):
                title += "," + split_line[i]
            tag_list = split_line[-1].split("|")
        
        
        for tag in tag_list:
            tags[tag.lower()] += 1
            movie_tag_freq[tag.lower()].add(split_line[0])
        
        movies[split_line[0]] = {
            "name": title,
            "tags": tags,
        }
    # first line in file
    del movies["movieId"]

In [None]:
with open("./datasets/ml-latest-small/tags.csv", newline="", encoding="utf-8") as f:
    # Each row is of form: <userId>,<movieId>,<tag>,<timestamp>
    # We will only use <movieId>,<tag>
    # csv.reader is used because a tag containing a comma is quoted in the file.
    for row in csv.reader(f):
        if not row or row[0] == "userId":
            continue
        movie_id = row[1]
        tag = row[2].lower()
        movies[movie_id]["tags"][tag] += 1
        # movie_tag_freq maps a tag to the set of *movies* carrying it, so the
        # movie id (not the id of the user who applied the tag) is recorded.
        movie_tag_freq[tag].add(movie_id)

For the ratings dataset, we have to convert the data so that it resembles rabble data. In this case we need input similar to rabble 'Likes'. To get this we:
- Assume all ratings above `split` (scale 0.5 -> 5) are positive
- `split` is the mean rating in the dataset

In [None]:
with open("./datasets/ml-latest-small/ratings.csv", newline="", encoding="utf-8") as f:
    # Each row is of form: <userId>,<movieId>,<rating>,<timestamp>
    # We will only use <userId>,<movieId>,<rating>
    ratings = [
        (row[0], row[1], float(row[2]))
        for row in csv.reader(f)
        if row and row[0] != "userId"
    ]

if not ratings:
    raise ValueError("ratings.csv does not contain any ratings")

# 'split' is the mean rating of the dataset. It is kept as a number so the
# like/dislike decision below is a numeric comparison, not a string one.
sum_ratings = sum(rating for _, _, rating in ratings)
amount_ratings = len(ratings)
split = sum_ratings / amount_ratings

# userId -> {"pos": [<liked movieId>, ...], "neg": [<disliked movieId>, ...]}
users = {}
for user_id, movie_id, rating in ratings:
    user = users.setdefault(user_id, {"pos": [], "neg": []})

    # We will convert ratings to binary (like/dislike) as that is the ratings used in rabble
    # Assume all ratings at or above 'split' (scale 0.5 -> 5) are positive
    if rating >= split:
        user["pos"].append(movie_id)
    else:
        user["neg"].append(movie_id)

#### A quick eyeball check of a movie object

In [None]:
# Pretty-print one randomly selected movie record, prefixed by its id.
pp = pprint.PrettyPrinter(indent=4)
all_movie_ids = list(movies.keys())
random_movie_pos = random.randrange(len(movies))
random_movie_id = all_movie_ids[random_movie_pos]
print(random_movie_id, end=" ")
pp.pprint(movies[random_movie_id])

# Separate training and test data
- Each user has rated at least 20 movies. However as rabble only takes positive input we will only use the positive scores to train the user model
- So we will train with roughly 3/4 of the positive ratings and test with the 1/4 positive and all the negative ratings

In [None]:
for u in users.keys():
    # Shuffle a copy: shuffling users[u]["pos"] itself would silently reorder
    # the source list that the split is derived from.
    shuffled_pos_ratings = list(users[u]["pos"])
    random.shuffle(shuffled_pos_ratings)
    # Roughly 3/4 of the liked movies train the model, the rest are held out.
    cutoff = (len(shuffled_pos_ratings) * 3) // 4
    users[u]["train"] = shuffled_pos_ratings[:cutoff]
    users[u]["test"] = shuffled_pos_ratings[cutoff:]

# Create User Models

Create simple user model by adding any tags related to liked movies to a 'model' dictionary. Every time that tag is seen, the counter related to it is increased. 

In [None]:
# Build each user's model: a count, per tag, of how many of the user's
# training movies carry that tag. user_tag_freq records which users have
# each tag in their model.
for u, user_record in users.items():
    tag_counts = defaultdict(int)
    user_record["model"] = tag_counts
    for movieId in user_record["train"]:
        for tag in movies[movieId]["tags"]:
            user_tag_freq[tag].add(u)
            tag_counts[tag] += 1

####  A quick eyeball check of a model

In [None]:
pp = pprint.PrettyPrinter(indent=4)
# Sample from the ids that actually exist. Deriving an id from
# randrange(len(users)) can yield "0", which is not a key when ids start at 1,
# and can never yield the highest id.
random_user_id = random.choice(list(users))
print(random_user_id, end=" ")
pp.pprint(users[random_user_id])

# Similarity function

Calculate similarity based on `TF Cosine-based Similarity` method described in Content-based Recommendation in Social Tagging Systems (4.2) [link](https://dl.acm.org/citation.cfm?id=1864756)

In [None]:
def tf_cosine_similarity(user_model, movie_tags):
    """Return the TF cosine similarity between a user model and a movie.

    Args:
        user_model: mapping of tag -> count of that tag in the user's model.
        movie_tags: mapping of tag -> count of that tag on the movie.

    Returns:
        The cosine of the angle between the two tag-count vectors, or 0.0
        when either vector has zero length.
    """
    # Only tags on the movie can contribute to the dot product. .get() is
    # used so a defaultdict model is not grown with zero-valued entries
    # every time it is compared with a movie.
    dot_product = sum(
        user_model.get(tag, 0) * count for tag, count in movie_tags.items()
    )
    # Each norm spans the whole vector. Restricting the user norm to the
    # movie's tags would give every single-tag movie a similarity of 1.0
    # no matter how small a part of the user's model that tag is.
    user_norm = sum(count ** 2 for count in user_model.values()) ** 0.5
    item_norm = sum(count ** 2 for count in movie_tags.values()) ** 0.5

    divisor = user_norm * item_norm
    if divisor == 0:
        return 0.0
    return dot_product / divisor

Calculate similarity based on `TF-IDF Cosine-based Similarity` method described in Content-based Recommendation in Social Tagging Systems (4.3) [link](https://dl.acm.org/citation.cfm?id=1864756)

Calculate `User-based inverse tag frequency` and `Item-based inverse tag frequency` using `tag_freq`

In [None]:
def calculate_based_itf(tag_freq, N, b_itfs):
    """Store the inverse tag frequency of every tag of ``tag_freq`` in ``b_itfs``.

    Args:
        tag_freq: mapping of tag -> collection of the ids the tag occurs for.
        N: total number of ids (users or movies) the frequencies are based on.
        b_itfs: mapping that is filled in place with tag -> log(N / occurrences).
    """
    for tag, members in tag_freq.items():
        b_itfs[tag] = math.log(N / len(members))

# Totals used as N in log(N / occurrences).
movie_amount = len(movies)
user_amount = len(users)

# Inverse tag frequency per tag; a tag that was never seen weighs 0.
movie_ifs = defaultdict(int)
user_ifs = defaultdict(int)

calculate_based_itf(movie_tag_freq, movie_amount, movie_ifs)
calculate_based_itf(user_tag_freq, user_amount, user_ifs)

In [None]:
def tf_idf_cosine_similarity(user_model, movie_tags, user_itf=None, movie_itf=None):
    """Return the TF-IDF cosine similarity between a user model and a movie.

    Every tag count is weighted by its inverse tag frequency before the
    cosine is taken: user counts by the user-based table, movie counts by
    the item-based table.

    Args:
        user_model: mapping of tag -> count of that tag in the user's model.
        movie_tags: mapping of tag -> count of that tag on the movie.
        user_itf: optional mapping of tag -> user-based inverse tag frequency.
            Defaults to the notebook-level ``user_ifs`` table.
        movie_itf: optional mapping of tag -> item-based inverse tag frequency.
            Defaults to the notebook-level ``movie_ifs`` table.

    Returns:
        The cosine of the angle between the two weighted vectors, or 0.0
        when either weighted vector has zero length.
    """
    if user_itf is None:
        user_itf = user_ifs
    if movie_itf is None:
        movie_itf = movie_ifs

    # .get() keeps the lookups read-only: indexing a defaultdict would add a
    # zero-valued entry to the model / weight tables for every unknown tag.
    def user_weight(tag):
        return user_model.get(tag, 0) * user_itf.get(tag, 0)

    def item_weight(tag):
        return movie_tags.get(tag, 0) * movie_itf.get(tag, 0)

    # Only tags on the movie can contribute to the dot product.
    dot_product = sum(user_weight(tag) * item_weight(tag) for tag in movie_tags)
    # Each norm spans the whole weighted vector. Restricting the user norm to
    # the movie's tags would score every single-tag movie as 1.0.
    user_norm = sum(user_weight(tag) ** 2 for tag in user_model) ** 0.5
    item_norm = sum(item_weight(tag) ** 2 for tag in movie_tags) ** 0.5

    divisor = user_norm * item_norm
    if divisor == 0:
        return 0.0
    return dot_product / divisor

#### Eyeball similarity

In [None]:
# Compare the randomly chosen user's model with one liked training movie,
# one liked held-out movie and one disliked movie.
chosen_user = users[random_user_id]
rand_u_model = chosen_user["model"]
rand_u_train = chosen_user["train"]
rand_u_test = chosen_user["test"]
rand_u_neg = chosen_user["neg"]

# Draw one movie from each list, in the order train, test, neg.
sampled_movies = []
for id_list in (rand_u_train, rand_u_test, rand_u_neg):
    sampled_movies.append(movies[id_list[random.randrange(len(id_list))]])
pos_train, pos_test, neg_movie = sampled_movies

for sampled in sampled_movies:
    print(sampled)

# For each measure the two positive results come first, the negative one last.
for heading, similarity in (
    ("TF Similarity", tf_cosine_similarity),
    ("TF-IDF Similarity", tf_idf_cosine_similarity),
):
    print(heading)
    for sampled in sampled_movies:
        print(similarity(rand_u_model, sampled["tags"]))

# Evaluate

Evaluating based on chapter `3.2.2 Measuring Usage Prediction` in [Recommender Systems Handbook](https://link.springer.com/chapter/10.1007/978-0-387-85820-3_8). Can be gotten [here](http://scholar.google.com/scholar_url?url=http://citeseerx.ist.psu.edu/viewdoc/download%3Fdoi%3D10.1.1.712.4138%26rep%3Drep1%26type%3Dpdf&hl=en&sa=X&scisig=AAGBfm1BpCg0RTunNpmouOfrHuNPul-3NQ&nossl=1&oi=scholarr).

Evaluation metric chosen is the area under the Receiver Operating Characteristic (ROC) curve, as in @iandioch's research.

In [None]:
# Two scores closer together than this are counted as a tie.
TIE_TOLERANCE = 0.0005
# Number of users whose individual results are printed.
USERS_TO_SHOW = 5

# Per-user results; only users with at least one comparison are recorded.
aucs = []
n1s = []
n2s = []
n3s = []
ns = []

# Column headings.
print('\t\tn1   \tn2   \tn3   \tAUC')

t = 0
for u in users.keys():
    # Code based on @iandioch's common neighbours research
    model = users[u]["model"]

    # The score of a disliked movie does not depend on which held-out liked
    # movie it is compared with, so it is computed once per user rather than
    # once per comparison.
    neg_scores = [
        tf_idf_cosine_similarity(model, movies[neg_id]["tags"])
        for neg_id in users[u]["neg"]
    ]

    n1 = 0 # missing_pos > neg
    n2 = 0 # missing_pos = neg
    n3 = 0 # missing_pos < neg
    n = 0 # total link comparisons
    for missing_pos_id in users[u]["test"]:
        a_score = tf_idf_cosine_similarity(model, movies[missing_pos_id]["tags"])
        for b_score in neg_scores:
            if abs(a_score-b_score) < TIE_TOLERANCE:
                n2 += 1
            elif a_score > b_score:
                n1 += 1
            else:
                n3 += 1
            n += 1

    # Check if there were any comparisons.
    # If not ignore this user model as they have no dislikes/ likes
    auc = None
    if n > 0:
        auc = (n1 + 0.5*n2)/(n)
        aucs.append(auc)
        n1s.append(n1)
        n2s.append(n2)
        n3s.append(n3)
        ns.append(n)

    t += 1
    # Eye ball some per user examples
    if t <= USERS_TO_SHOW:
        # A user without comparisons has no AUC; printing the value left over
        # from the previous user (or an undefined name) would be wrong.
        auc_text = 'n/a' if auc is None else '{:<.6f}'.format(auc)
        print('UserID {:<2}:\t{:<5}\t{:<5}\t{:<5}\t{}'.format(u, n1, n2, n3, auc_text))

def avg(seq):
    """Return the arithmetic mean of the numbers in ``seq``.

    Raises:
        ZeroDivisionError: if ``seq`` is empty.
    """
    total = sum(seq)
    count = len(seq)
    return total / count

# Summary row: comparison counts rounded to whole numbers, AUC to 6 decimals.
mean_n1 = int(round(avg(n1s)))
mean_n2 = int(round(avg(n2s)))
mean_n3 = int(round(avg(n3s)))
mean_auc = avg(aucs)
print('Average:\t{:<5}\t{:<5}\t{:<5}\t{:<.6f}'.format(mean_n1, mean_n2, mean_n3, mean_auc))