### Importing libraries and modules

In [2]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import pickle
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
from collections import Counter
from networkx.linalg.graphmatrix import adjacency_matrix

# Loading data pathes and I/O functions from script
from scripts.io import save_figure, load_movie_titles, load_raw_bipartite

### Loading bipartite graph and movie titles

In [3]:
title_dict, node_dict = load_movie_titles()
G = load_raw_bipartite()

Graph loaded.


### Simple weights projection

In [5]:
# Split the graph into 2 sets: user and movie nodes
user_nodes, movie_nodes = nx.algorithms.bipartite.basic.sets(G)

# Generate the weighted projections
weighted_projection_users = bipartite.weighted_projected_graph(G, user_nodes)
weighted_projection_movie = bipartite.weighted_projected_graph(G, movie_nodes)

### Research Question
How can we recommend movies, given movies you like?

#### Idea for an algorithm where projections take ratings into account in some way:
1. Get input of movies you like, M, which have ratings by users U
2. Project U and M onto U to get U1
3. Project U1, and all movies they rated, onto the movies to get M1
4. Recommend, from M1, the highest weight neighbor(s) of M.

#### Other ideas:
* Sample users who liked the movie(s), project onto their rated movies, recommend highest weight neighbor of liked movies.  
* Construct similar movies to the ones you like, use this to find users like you, and iterate to converge on movies you will like.  

Potential problem: if our sample of users is too large, we are likely to just recommend the most rated movies, not specific movies you would like. To test this, we could plot correlation between movie degree and likelihood to recommend.  

Does backboning M1 improve recommendations?  

How can we take movie genre into account for recommendation?  

Are our movie recommendations associated with genre? i.e. does M1 have high genre
homophily?  

Are our recommendations largely popular or niche movies and why?  

For evaluation, can we use cross validation by comparing users’ ratings to how likely we are to recommend each rated movie, based on a sample of movies they like?  

# Our Projection Methods

## Rating Allocation
Projection algorithm inspired by resource allocation, where weight from movie1 to movie2 is computed by summing over all users who rated both movies, multiplying their rating of movie1, normalized by all movie1's ratings, with their rating of movie2, normalized by the users total ratings.

$$RA_{m1,m2} = \sum_{u \in N_{m1} \cap N_{m2}} \frac{w_{m1,u}}{\sum{w_{m1}}}\frac{w_{u,m2}}{\sum{w_{u}}}$$ 

this is computed for all $m1,m2$ pairs of movies, excluding self-loops, to produce a directed movie graph.

An issue with this approach: Ratings below average (e.g. 1) increase weight compared to no rating, which seems intuitively wrong.
Could this be solved by assuming no rating = average rating?



In [None]:
def find_common_neighbors(u, v):
    N_u = G.neighbors(u)
    N_v = G.neighbors(v)
    return set(N_u) & set(N_v)


In [None]:
def find_edge_weight_resource_alloc(u,v,degree_u):
    N_u_v = find_common_neighbors(u,v)

    weight_u_v = 0
    for n in N_u_v:
        w_u_n = G.get_edge_data(u,n)['weight']
        w_n_v = G.get_edge_data(n,v)['weight']
        weight_u_v += w_u_n / degree_u * w_n_v / G.degree(n)

    return weight_u_v


In [None]:
# To build directed movie recommendation graph:
# For each movie u
#    For each other movie v
#       Compute rating resource allocation weight from u to v
#       add it as directed edge (u,v)

def movie_res_alloc(movie_nodes):
    dir_movie_graph = nx.DiGraph()
    for u in movie_nodes:
        degree_u = G.degree(u)
        for v in movie_nodes:

            # Prevent self-loops
            if v == u:
                continue

            res_alloc_weight = find_edge_weight_resource_alloc(u,v,degree_u)
            dir_movie_graph.add_edge(u, v, weight=res_alloc_weight)

    return dir_movie_graph

print(movie_res_alloc(movie_nodes))

# TODO: Save and load graph
    

DiGraph with 1682 nodes and 2827442 edges


In [57]:
def find_neighbor_weights(edges):
    neighbors_weights = [(neighbor, weight['weight']) for node, neighbor, weight in edges]
    neighbors_weights = sorted(neighbors_weights, reverse=True, key=lambda x: x[1])

In [None]:
# To recommend, find highest weight movie from movie you like

def k_highest_weight_neighbors(k, liked_movie_list, graph):
    
    liked_movie_node_list = [node_dict[liked_movie_title] for liked_movie_title in liked_movie_list]
    edges = dict((liked_movie_node, list(graph.edges(liked_movie_node, data=True))) for liked_movie_node in liked_movie_node_list)

    neighbors_weights = [(neighbor, weight['weight']) for node, neighbor, weight in edges]
    neighbors_weights = sorted(neighbors_weights, reverse=True, key=lambda x: x[1])

    neighbors_weights_dict = dict((node, find_neighbor_weights(edges[node])) for node in edges.keys())

    n_neighbors = [(title_dict[neighbor],weight) for neighbor, weight in neighbors_weights][:k]

    print(f"{k} highest weight neighbors of '{liked_movie_title}':")
    return n_neighbors

dir_movie_graph = movie_res_alloc(movie_nodes)
k_highest_weight_neighbors(10, 'Die Hard (1988)', dir_movie_graph)

In [58]:
def get_edges(liked_movie_list,d_graph):
    liked_movie_node_list = [node_dict[liked_movie_title] for liked_movie_title in liked_movie_list]
    edges = dict((liked_movie_node, list(graph.edges(liked_movie_node, data=True))) for liked_movie_node in liked_movie_node_list)
    return edges




In [14]:
liked_movie_list = ["Die Hard (1988)", "Star Wars (1977)", "Apocalypse Now (1979)","Princess Bride, The (1987)","Raiders of the Lost Ark (1981)"]
graph = movie_res_alloc(movie_nodes)

edges = get_edges(liked_movie_list, graph)
# print(edges)

NameError: name 'movie_res_alloc' is not defined

In [64]:
def get_average_weight_per_movie(edges, d_graph):
    average_weight_edges = []
    average_weight = 0
    for index in range(len(d_graph) - 1):
        for node in edges.keys():
            average_weight += edges[node][index][2]['weight']
        average_weight = average_weight / len(edges.keys())
        average_weight_edges.append((index+1, average_weight))

    return average_weight_edges

# print(get_average_weight_per_movie(edges, graph))


[(1, 0.08322545431756363), (2, 0.03663503584275037), (3, 0.016905518012901907), (4, 0.03970479606173031), (5, 0.019289093653544927), (6, 0.006697144691236794), (7, 0.06909950798757532), (8, 0.05658518622999784), (9, 0.05548004223264584), (10, 0.022425641947090513), (11, 0.053252871498619445), (12, 0.07296505041756826), (13, 0.037651894319504955), (14, 0.03260872708684119), (15, 0.05071289413765675), (16, 0.014092757214721485), (17, 0.015949480211908618), (18, 0.004050616669576029), (19, 0.009660169021017292), (20, 0.009334462418976135), (21, 0.011043126569244374), (22, 0.07250456501456308), (23, 0.0538060536102426), (24, 0.03828196779752656), (25, 0.044792438382695976), (26, 0.01823415091382922), (27, 0.011967458818374169), (28, 0.06280237236133943), (29, 0.024684486292579983), (30, 0.010189652552875083), (31, 0.029999484323701635), (32, 0.02107281167324781), (33, 0.018972925753634115), (34, 0.004425977767654492), (35, 0.0019742524214593975), (36, 0.0008924200758269627), (37, 0.0007596

In [65]:
def sort_average_weight(average_weight_edges):
    return sorted(average_weight_edges, reverse=True, key=lambda x: x[1])

sorted_average_weights = sort_average_weight(get_average_weight_per_movie(edges,graph))

In [66]:
k = 10
n_neighbors = [(title_dict[neighbor],weight) for neighbor, weight in sorted_average_weights][:k]
print(n_neighbors)

[('Apocalypse Now (1979)', 0.1268666495338228), ('Princess Bride, The (1987)', 0.11568724061392474), ('Star Wars (1977)', 0.10299898189477234), ('Silence of the Lambs, The (1991)', 0.09566661607875082), ('Empire Strikes Back, The (1980)', 0.09512541949980965), ('Pulp Fiction (1994)', 0.08462304798656134), ('This Is Spinal Tap (1984)', 0.08374503841669452), ('Toy Story (1995)', 0.08322545431756363), ('Unforgiven (1992)', 0.08322457022924737), ('Men in Black (1997)', 0.08293179616638668)]


In [None]:
# For multiple likes movies, find highest average weight movie