### Importing libraries and modules

In [1]:
import networkx as nx
import os
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
from collections import Counter
from networkx.linalg.graphmatrix import adjacency_matrix

from statistics import mean

# Loading data pathes and I/O functions from script
from scripts.io import load_movie_titles, load_raw_bipartite, save_projection, load_projection, projection_path

### Loading bipartite graph and movie titles

In [2]:
title_dict, node_dict = load_movie_titles()
G = load_raw_bipartite()

# Split the graph into 2 sets: user and movie nodes
user_nodes, movie_nodes = nx.algorithms.bipartite.basic.sets(G)

Graph loaded.


### Simple weights projections

In [3]:
# Projecting on users
simple_weights_users_path = "simple_weights_users.p"

if os.path.exists(projection_path+simple_weights_users_path):
    simple_weights_users = load_projection(simple_weights_users_path)
else:
    simple_weights_users = bipartite.weighted_projected_graph(G, user_nodes)
    save_projection(simple_weights_users, simple_weights_users_path)

# Projecting on movies
simple_weights_movies_path = "simple_weights_movies.p"

if os.path.exists(projection_path+simple_weights_movies_path):
    simple_weights_movies = load_projection(simple_weights_movies_path)
else:
    simple_weights_movies = bipartite.weighted_projected_graph(G, movie_nodes)
    save_projection(simple_weights_movies, simple_weights_movies_path)

### Research Question
How can we recommend movies, given movies you like?

#### Idea for an algorithm where projections take ratings into account in some way:
1. Get input of movies you like, M, which have ratings by users U
2. Project U and M onto U to get U1
3. Project U1, and all movies they rated, onto the movies to get M1
4. Recommend, from M1, the highest weight neighbor(s) of M.

#### Other ideas:
* Sample users who liked the movie(s), project onto their rated movies, recommend highest weight neighbor of liked movies.  
* Construct similar movies to the ones you like, use this to find users like you, and iterate to converge on movies you will like.  

Potential problem: if our sample of users is too large, we are likely to just recommend the most rated movies, not specific movies you would like. To test this, we could plot correlation between movie degree and likelihood to recommend.  

Does backboning M1 improve recommendations?  

How can we take movie genre into account for recommendation?  

Are our movie recommendations associated with genre? i.e. does M1 have high genre
homophily?  

Are our recommendations largely popular or niche movies and why?  

For evaluation, can we use cross validation by comparing users’ ratings to how likely we are to recommend each rated movie, based on a sample of movies they like?  

# Our Projection Methods

## Rating Allocation
Projection algorithm inspired by resource allocation, where directed edge weight from movie1 to movie2 is computed by summing over all users who rated both movies, multiplying their rating of movie1, normalized by all movie1's ratings, with their rating of movie2, normalized by the users total ratings.

$$RA_{m1,m2} = \sum_{u \in N_{m1} \cap N_{m2}} \frac{w_{m1,u}}{\sum{w_{m1}}}\frac{w_{u,m2}}{\sum{w_{u}}}$$ 

this is computed for all $m1,m2$ pairs of movies, excluding self-loops, to produce a directed movie graph.

An issue with this approach: Ratings below average (e.g. 1) increase weight compared to no rating, which seems intuitively wrong.
Could this be solved by assuming no rating = average rating?



In [4]:
def find_common_neighbors(u, v):
    N_u = G.neighbors(u)
    N_v = G.neighbors(v)
    return set(N_u) & set(N_v)

def rating_allocation_edge_weight(G, u, v, degree_u):
    N_u_v = find_common_neighbors(u,v)
    weight_u_v = 0
    for n in N_u_v:
        w_u_n = G.get_edge_data(u,n)['weight']
        w_n_v = G.get_edge_data(n,v)['weight']
        weight_u_v += w_u_n / degree_u * w_n_v / G.degree(n)
    return weight_u_v

# To build directed movie recommendation graph:
# For each movie u
#    For each other movie v
#       Compute rating resource allocation weight from u to v
#       add it as directed edge (u,v)

def rating_allocation_projection(G, movie_nodes):
    rating_allocation_graph = nx.DiGraph()
    for u in movie_nodes:
        degree_u = G.degree(u)
        for v in movie_nodes:

            # Prevent self-loops
            if v == u:
                continue

            edge_weight = rating_allocation_edge_weight(G, u, v, degree_u)
            rating_allocation_graph.add_edge(u, v, weight=edge_weight)
            
    return rating_allocation_graph

In [5]:
# Saving/loading rating allocation projection on movies
rating_allocation_movies_path = "rating_allocation_movies.p"

if os.path.exists(projection_path+rating_allocation_movies_path):
    rating_allocation_movies = load_projection(rating_allocation_movies_path)
else:
    rating_allocation_movies = rating_allocation_projection(G, movie_nodes)
    save_projection(rating_allocation_movies, rating_allocation_movies_path)

In [6]:
def find_neighbor_weights(edges):
    neighbors_weights = [(neighbor, weight['weight']) for node, neighbor, weight in edges]
    neighbors_weights = sorted(neighbors_weights, reverse=True, key=lambda x: x[1])

In [29]:
liked_movie_list = ['Die Hard (1988)', 'Star Wars (1977)']
graph = rating_allocation_movies
liked_movie_node_list = [node_dict[liked_movie_title] for liked_movie_title in liked_movie_list]
neighbors_weights = dict()
for liked_movie_node in liked_movie_node_list:  # For each of the liked movies
    for node, neighbor, attr_dict in graph.edges(liked_movie_node, data=True):  # For each of its edges
        
        # Avoid edges to liked movies
        if neighbor in liked_movie_node_list:
            continue
        
        # Append weight to neighbor to dict of all weights
        neighbors_weights.setdefault(neighbor, []).append(attr_dict['weight'])  

# Average over all weights for each movie
avg_neighbors_weights = [(node, mean(weights)) for node, weights in neighbors_weights.items()]


[(1, 0.09899716175782361),
 (2, 0.020199970070322284),
 (3, 0.011211138724739597),
 (4, 0.03507122266062171),
 (5, 0.011628287799558216),
 (6, 0.003760808984373667),
 (7, 0.07833286062731258),
 (8, 0.03839656277763042),
 (9, 0.048867055356117184),
 (10, 0.011599036714873575),
 (11, 0.04893559550480417),
 (12, 0.05481812098416876),
 (13, 0.025666500238021202),
 (14, 0.024263472711662563),
 (15, 0.04978531309596232),
 (16, 0.004707139790637541),
 (17, 0.014912498434723627),
 (18, 0.0006223350608975619),
 (19, 0.008951565283129492),
 (20, 0.007557478467646412),
 (21, 0.010174132937567056),
 (22, 0.06727880679744852),
 (23, 0.03220716290663929),
 (24, 0.03354672491102956),
 (25, 0.043184855237145076),
 (26, 0.007486175684078295),
 (27, 0.008503402788532368),
 (28, 0.055874603443601484),
 (29, 0.012593824331071016),
 (30, 0.0036732051328968902),
 (31, 0.026868890146183655),
 (32, 0.011816475317034156),
 (33, 0.014790886110886088),
 (34, 0.00067574582468534),
 (35, 0.0013324482633992635),
 (

In [1]:
# To recommend, find highest weight movie from movie you like
def k_recommendations_from_single_movie(k, liked_movie_title, graph):
    liked_movie_node = node_dict[liked_movie_title]
    edges = list(graph.edges(liked_movie_node, data=True))
    neighbors_weights = [(neighbor, weight['weight']) for node, neighbor, weight in edges]
    neighbors_weights = sorted(neighbors_weights, reverse=True, key=lambda x: x[1])
    n_neighbors = [(title_dict[neighbor],weight) for neighbor, weight in neighbors_weights][:k]
    # print(f"{k} highest weight neighbors of '{liked_movie_title}':")
    return n_neighbors



def get_edges(liked_movie_list,graph):
    liked_movie_node_list = [node_dict[liked_movie_title] for liked_movie_title in liked_movie_list]
    edges = dict((liked_movie_node, list(graph.edges(liked_movie_node, data=True))) for liked_movie_node in liked_movie_node_list)
    return edges

def get_average_weight_per_movie(edges, d_graph):
    average_weight_edges = []
    average_weight = 0
    for index in range(len(d_graph) - 1):
        for node in edges.keys():
            average_weight += edges[node][index][2]['weight']
        average_weight = average_weight / len(edges.keys())
        average_weight_edges.append((index+1, average_weight))

    return average_weight_edges

def sort_average_weight(average_weight_edges):
    return sorted(average_weight_edges, reverse=True, key=lambda x: x[1])

# For multiple likes movies, find highest average weight movie
def k_recommendations_from_movies(k, liked_movie_list, graph):
    # liked_movie_node = node_dict[liked_movie_title]
    # edges = list(graph.edges(liked_movie_node, data=True))
    # neighbors_weights = [(neighbor, weight['weight']) for node, neighbor, weight in edges]
    # neighbors_weights = sorted(neighbors_weights, reverse=True, key=lambda x: x[1])
    # n_neighbors = [(title_dict[neighbor],weight) for neighbor, weight in neighbors_weights][:k]
    # print(f"{k} highest weight neighbors of '{liked_movie_title}':")



    edges = get_edges(liked_movie_list, graph)

    sorted_average_weights = sort_average_weight(get_average_weight_per_movie(edges, rating_allocation_movies))

    n_neighbors = [(title_dict[neighbor],weight) for neighbor, weight in sorted_average_weights][:k]

    return n_neighbors

# k_recommendations_from_single_movie(10, 'Die Hard (1988)', rating_allocation_movies)

liked_movie_list = ["Die Hard (1988)", "Star Wars (1977)", "Apocalypse Now (1979)","Princess Bride, The (1987)","Raiders of the Lost Ark (1981)"]
k_recommendations_from_movies(10, liked_movie_list, rating_allocation_movies)

NameError: name 'rating_allocation_movies' is not defined

In [18]:


# print(get_average_weight_per_movie(edges, rating_allocation_movies))


[('Apocalypse Now (1979)', 0.1268666495338228), ('Princess Bride, The (1987)', 0.11568724061392474), ('Star Wars (1977)', 0.10299898189477234), ('Silence of the Lambs, The (1991)', 0.09566661607875082), ('Empire Strikes Back, The (1980)', 0.09512541949980965), ('Pulp Fiction (1994)', 0.08462304798656134), ('This Is Spinal Tap (1984)', 0.08374503841669452), ('Toy Story (1995)', 0.08322545431756363), ('Unforgiven (1992)', 0.08322457022924737), ('Men in Black (1997)', 0.08293179616638668)]


In [None]:
# 