### Importing libraries and modules

In [66]:
import networkx as nx
import os
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
from collections import Counter
from networkx.linalg.graphmatrix import adjacency_matrix

from statistics import mean

# Loading data paths and I/O functions from the scripts package
from scripts.io import load_movie_titles, load_raw_bipartite, save_projection, load_projection, save_edgelist, projection_path

### Loading bipartite graph and movie titles

In [67]:
# Movie lookups: title_dict is indexed by node to get a title and node_dict is
# indexed by title to get a node (see how the recommendation cells use them).
title_dict, node_dict = load_movie_titles("movie-titles.txt")
G = load_raw_bipartite("full_bipartite.p")

# Split the graph into its two bipartite node sets.
# NOTE(review): this assumes the first set returned is the user side and the
# second the movie side -- confirm against how the graph is built.
user_nodes, movie_nodes = bipartite.sets(G)

Graph loaded.


### Simple weights projections

In [68]:
def _load_or_build_projection(file_name, build):
    """Return the projection cached under ``file_name``, building it if absent.

    The cache is looked up at ``projection_path + file_name``. When the file
    exists it is read with ``load_projection``; otherwise ``build()`` is
    called and its result is stored with ``save_projection`` before being
    returned.
    """
    if os.path.exists(projection_path + file_name):
        return load_projection(file_name)
    projection = build()
    save_projection(projection, file_name)
    return projection


# Projecting on users
simple_weights_users_path = "simple_weights_users.p"
simple_weights_users = _load_or_build_projection(
    simple_weights_users_path,
    lambda: bipartite.weighted_projected_graph(G, user_nodes, ratio=True),
)

# Projecting on movies
simple_weights_movies_path = "simple_weights_movies.p"
simple_weights_movies = _load_or_build_projection(
    simple_weights_movies_path,
    lambda: bipartite.weighted_projected_graph(G, movie_nodes, ratio=True),
)

Projection loaded.
Projection loaded.


### Research Question
How can we recommend movies, given movies you like?

#### Idea for an algorithm where projections take ratings into account in some way:
1. Get input of movies you like, M, which have ratings by users U
2. Project U and M onto U to get U1
3. Project U1, and all movies they rated, onto the movies to get M1
4. Recommend, from M1, the highest weight neighbor(s) of M.

#### Other ideas:
* Sample users who liked the movie(s), project onto their rated movies, recommend highest weight neighbor of liked movies.  
* Construct similar movies to the ones you like, use this to find users like you, and iterate to converge on movies you will like.  

Potential problem: if our sample of users is too large, we are likely to just recommend the most rated movies, not specific movies you would like. To test this, we could plot correlation between movie degree and likelihood to recommend.  

Does backboning M1 improve recommendations?  

How can we take movie genre into account for recommendation?  

Are our movie recommendations associated with genre? i.e. does M1 have high genre
homophily?  

Are our recommendations largely popular or niche movies and why?  

For evaluation, can we use cross validation by comparing users’ ratings to how likely we are to recommend each rated movie, based on a sample of movies they like?  

# Our Projection Methods

## Rating Allocation
A projection algorithm inspired by resource allocation. The directed edge weight from movie1 to movie2 is computed by summing over all users who rated both movies: for each such user, their rating of movie1 (normalized by all of movie1's ratings) is multiplied by their rating of movie2 (normalized by that user's total ratings).

$$RA_{m1,m2} = \sum_{u \in N_{m1} \cap N_{m2}} \frac{w_{m1,u}}{\sum{w_{m1}}}\frac{w_{u,m2}}{\sum{w_{u}}}$$ 

this is computed for all $m1,m2$ pairs of movies, excluding self-loops, to produce a directed movie graph.

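An issue with this approach: ratings below average (e.g. 1) still increase the weight compared to no rating at all, which seems intuitively wrong.
Could this be solved by assuming that no rating equals an average rating? The implementation below instead flips the sign of a user's contribution whenever either of the two ratings is 3 or lower.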



In [69]:
def find_common_neighbors(u, v, graph=None):
    """Return the set of nodes adjacent to both ``u`` and ``v``.

    Parameters
    ----------
    u, v : node
        Nodes whose neighbourhoods are intersected.
    graph : graph-like, optional
        Object exposing ``neighbors(node)``. When omitted, the notebook-level
        bipartite graph ``G`` is used, which keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    set
        Nodes that appear in both neighbourhoods (empty if there are none).
    """
    if graph is None:
        # Fall back to the notebook-global graph for backward compatibility.
        graph = G
    return set(graph.neighbors(u)) & set(graph.neighbors(v))

def rating_allocation_edge_weight(G, u, v, degree_u, like_threshold=3):
    """Compute the directed rating-allocation weight from ``u`` to ``v``.

    For every node ``n`` adjacent to both ``u`` and ``v`` in ``G`` the term
    ``w(u, n) / degree_u * w(n, v) / G.degree(n)`` is accumulated. The term is
    added when both ratings are above ``like_threshold`` and subtracted when
    either rating is at or below it.

    Parameters
    ----------
    G : graph-like
        Bipartite graph exposing ``neighbors``, ``get_edge_data`` (returning a
        mapping with a ``'weight'`` entry) and ``degree``.
    u, v : node
        Source and target nodes of the directed edge.
    degree_u : number
        Normaliser for ``u``; the caller passes ``G.degree(u)`` so it is
        computed once per source node.
    like_threshold : number, optional
        Ratings less than or equal to this value count as a dislike.
        Defaults to 3, the value that was previously hard-coded.

    Returns
    -------
    number
        The signed sum of all contributions; ``0`` when ``u`` and ``v`` share
        no neighbour.

    Notes
    -----
    NOTE(review): the normalisers are ``G.degree(...)`` values, which look
    like rating counts rather than the sums of ratings written in the
    formula in the notebook text -- confirm which one is intended.
    """
    # Use the graph that was passed in, not a notebook-level global, so the
    # neighbour lookup and the edge weights always come from the same graph.
    shared_neighbors = set(G.neighbors(u)) & set(G.neighbors(v))

    weight_u_v = 0
    for n in shared_neighbors:
        w_u_n = G.get_edge_data(u, n)['weight']
        w_n_v = G.get_edge_data(n, v)['weight']
        # A dislike on either side turns the contribution negative.
        disliked = w_u_n <= like_threshold or w_n_v <= like_threshold
        sign = -1 if disliked else 1
        weight_u_v += sign * (w_u_n / degree_u * w_n_v / G.degree(n))
    return weight_u_v

def rating_allocation_projection(G, movie_nodes):
    """Build the directed rating-allocation projection over ``movie_nodes``.

    Every ordered pair of distinct nodes ``(source, target)`` from
    ``movie_nodes`` receives an edge whose ``weight`` attribute is the value
    returned by ``rating_allocation_edge_weight``; self-loops are skipped.

    Returns the resulting ``nx.DiGraph``.
    """
    projection = nx.DiGraph()
    for source in movie_nodes:
        # The source degree is the same for all of its outgoing edges.
        source_degree = G.degree(source)
        # Every other node is a target; the source itself is left out.
        targets = (candidate for candidate in movie_nodes if candidate != source)
        for target in targets:
            projection.add_edge(
                source,
                target,
                weight=rating_allocation_edge_weight(G, source, target, source_degree),
            )
    return projection

In [70]:
# Rating-allocation projection on movies: computed once, then reused from disk.
rating_allocation_movies_path = "rating_allocation_movies.p"

if not os.path.exists(projection_path + rating_allocation_movies_path):
    # No cached file yet: build the projection and store it for later runs.
    rating_allocation_movies = rating_allocation_projection(G, movie_nodes)
    save_projection(rating_allocation_movies, rating_allocation_movies_path)
else:
    rating_allocation_movies = load_projection(rating_allocation_movies_path)



Projection loaded.


## Saving edge lists for visualization with Gephi

In [71]:
# TODO: Getting 50 highest degree movies

# TODO: Pass these to save_edgelist
# save_edgelist(50, rating_allocation_movies, "rating_allocation_movies_edges", title_dict, overwrite=True)
# save_edgelist(50, simple_weights_movies, "simple_weights_movies_edges", title_dict)

## Recommendation

In [72]:
# liked_movie_list = ['Die Hard (1988)', 'Star Wars (1977)']
# graph = rating_allocation_movies
# liked_movie_node_list = [node_dict[liked_movie_title] for liked_movie_title in liked_movie_list]
# neighbors_weights = dict()
# for liked_movie_node in liked_movie_node_list:  # For each of the liked movies
#     for node, neighbor, attr_dict in graph.edges(liked_movie_node, data=True):  # For each of its edges
        
#         # Avoid edges to liked movies
#         if neighbor in liked_movie_node_list:
#             continue
        
#         # Append weight to neighbor to dict of all weights
#         neighbors_weights.setdefault(neighbor, []).append(attr_dict['weight'])  

# # Average over all weights for each movie
# avg_neighbors_weights = [(node, mean(weights)) for node, weights in neighbors_weights.items()]


In [73]:
# # To recommend, find highest weight movie from movie you like

# def k_highest_weight_neighbors(k, liked_movie_title, graph):
    
#     liked_movie_node = node_dict[liked_movie_title]
#     edges = list(graph.edges(liked_movie_node, data=True))

#     neighbors_weights = [(neighbor, weight['weight']) for node, neighbor, weight in edges]
#     neighbors_weights = sorted(neighbors_weights, reverse=True, key=lambda x: x[1])

#     neighbors_weights_dict = dict((node, find_neighbor_weights(edges[node])) for node in edges.keys())

#     n_neighbors = [(title_dict[neighbor],weight) for neighbor, weight in neighbors_weights][:k]

#     print(f"{k} highest weight neighbors of '{liked_movie_title}':")
#     return n_neighbors

# k_highest_weight_neighbors(10, 'Die Hard (1988)', rating_allocation_movies)

In [74]:
#rating_allocation_movies.edges(data=True)

In [75]:
# Titles of the movies the recommendation is seeded with. get_edges looks each
# title up in node_dict, so every entry must match a key there exactly.
liked_movie_list = ["Godfather, The (1972)"]
# print(edges)

In [76]:
def get_edges(liked_movie_list, d_graph):
    """Collect the edges of every liked movie in ``d_graph``.

    Each title in ``liked_movie_list`` is translated to its node through the
    notebook-level ``node_dict`` (an unknown title raises ``KeyError``).

    Returns a dict mapping each liked node to the list produced by
    ``d_graph.edges(node, data=True)``.
    """
    # Resolve all titles first so a bad title fails before any graph access.
    liked_nodes = [node_dict[title] for title in liked_movie_list]
    return {
        liked_node: list(d_graph.edges(liked_node, data=True))
        for liked_node in liked_nodes
    }

# Edges of each liked movie in the rating-allocation projection, keyed by node.
# The functions defined in this cell take `edges` as a parameter or rebuild it
# locally, so nothing else in this part of the notebook reads this global.
edges = get_edges(liked_movie_list, rating_allocation_movies)

def get_average_weight_per_movie(edges):
    """Average, per candidate movie, the edge weights coming from liked movies.

    Parameters
    ----------
    edges : dict
        Maps each liked node to a list of edge tuples in which index 1 is the
        neighbour node and index 2 is an attribute mapping with a ``'weight'``
        entry (the shape produced by ``get_edges``).

    Returns
    -------
    list of (node, float)
        One pair per neighbour that is not itself a liked node, in order of
        first appearance, holding the mean of all weights pointing at it.
        Returns an empty list when ``edges`` is empty or has no candidates.
    """
    node_weights = dict()
    for edges_list in edges.values():
        for edge in edges_list:
            neighbor, attributes = edge[1], edge[2]
            # Never recommend a movie the user already listed as liked.
            if neighbor in edges:
                continue
            node_weights.setdefault(neighbor, []).append(attributes['weight'])

    # Average once, after all weights are collected. Doing this inside the
    # loop left the result undefined for an empty `edges` dict.
    return [(node, mean(weights)) for node, weights in node_weights.items()]

def sort_average_weight(average_weight_edges):
    """Return the ``(node, weight)`` pairs ordered from highest to lowest weight.

    The input is left untouched; a new list is returned.
    """
    def weight_of(pair):
        return pair[1]

    ranked = list(average_weight_edges)
    ranked.sort(key=weight_of, reverse=True)
    return ranked

def k_recommend_from_list(k, rating_allocation_movies, liked_movie_list):
    """Recommend up to ``k`` movies for the titles in ``liked_movie_list``.

    The edges of the liked movies are fetched from ``rating_allocation_movies``,
    averaged per neighbour and ranked by descending average weight. Node ids
    are translated to titles through the notebook-level ``title_dict``.

    Returns a list of ``(title, average_weight)`` pairs, best first.
    """
    liked_edges = get_edges(liked_movie_list, rating_allocation_movies)
    ranked = sort_average_weight(get_average_weight_per_movie(liked_edges))

    titled = []
    for neighbor, weight in ranked:
        titled.append((title_dict[neighbor], weight))
    return titled[:k]
# Show up to ten (title, average weight) recommendations for the liked movies.
print(k_recommend_from_list(10,rating_allocation_movies, liked_movie_list))

[('Star Wars (1977)', 0.1297339030994218), ('Fargo (1996)', 0.10508866438488587), ('Return of the Jedi (1983)', 0.06020334698017408), ('Boot, Das (1981)', 0.060159506308073867), ('Silence of the Lambs, The (1991)', 0.05639760334732199), ('Toy Story (1995)', 0.054284909760981166), ('Raiders of the Lost Ark (1981)', 0.05358820339586955), ('Shawshank Redemption, The (1994)', 0.0505984353433188), ('Godfather: Part II, The (1974)', 0.049942158867375686), ('English Patient, The (1996)', 0.04812258467726199)]


In [77]:
#k = 10
#n_neighbors = [(title_dict[neighbor],weight) for neighbor, weight in sorted_average_weights][:k]
#print(n_neighbors)

In [78]:
# For multiple liked movies, find highest average weight movie