[Reference](https://medium.com/analytics-vidhya/also-view-recommendation-system-with-graph-theory-e2f098455519)

In [1]:
import numpy as np
import pandas as pd
import itertools 
import math
import time
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain

data_path = '../data/ml-100k/u.data'

# Read the tab-separated ratings file, supplying the column names explicitly.
df = pd.read_csv(
    data_path,
    delimiter='\t',
    names=['userid', 'itemid', 'rating', 'timestamp'],
)
# The timestamp column is not used by the rest of this notebook.
df = df.loc[:, ['userid', 'itemid', 'rating']]

# Keep only the strongest signal: rows rated below `min_rating` are discarded,
# which also keeps the co-rating graph built from these rows small.
min_rating = 5
below_threshold = df['rating'] < min_rating
rated_movie = df[~below_threshold]

rated_movie.head()

In [2]:
# One list of retained item ids per user.
user_itemlist = rated_movie.groupby('userid')['itemid'].apply(list)

# edge_dict[(a, b)] counts how many users have both a and b in their list.
# Each list is sorted first so a pair is always keyed as (smaller id, larger id).
edge_dict = defaultdict(int)
for liked_items in user_itemlist:
    for item_pair in itertools.combinations(sorted(liked_items), 2):
        edge_dict[item_pair] += 1

# Flatten to (item_a, item_b, weight) triples.
edges = [(item_a, item_b, weight) for (item_a, item_b), weight in edge_dict.items()]

print(len(edge_dict))
print(edges[:5])

In [3]:
# Undirected graph whose nodes are item ids; each (a, b, w) triple in `edges`
# becomes an edge a-b carrying w in its 'weight' attribute.
g = nx.Graph()
g.add_weighted_edges_from(edges)
print("Total number of graph nodes:", g.number_of_nodes())
print("Total number of graph edges:", g.number_of_edges())

# Unweighted degree (number of neighbours) of every node.
degrees = [degree for _, degree in g.degree()]

# Guard the mean so an empty graph reports 0 instead of raising ZeroDivisionError.
average_degree = round(sum(degrees) / len(degrees), 2) if degrees else 0
print("Average node degree:", average_degree)

# Louvain community detection is randomised: without a fixed seed the community
# ids and the number of communities can differ on every run, so results would
# not survive Restart & Run All. Seed it for reproducibility.
partitions = community_louvain.best_partition(g, random_state=42)
values = list(partitions.values())
print('Number of communities:', len(np.unique(values)))

In [4]:
# Genre flag columns, in the order they are appended to the item file's columns.
category_col = ["unknown","Action","Adventure","Animation","Children's","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"]

# Full column layout: five metadata columns followed by one flag column per genre.
column_arr = ["movie id","movie title","release date","video release date","IMDb URL"] + category_col
item_data = pd.read_csv('../data/ml-100k/u.item', delimiter = '|', names =column_arr, encoding='latin1')

# Index of the first genre flag. Only the trailing `category_col` columns are
# scanned, so metadata cells (dates, URL) are never compared against the flag value.
first_genre_idx = len(column_arr) - len(category_col)

# item_dict maps a movie id (column 0) to the list of genre names flagged with 1.
item_dict = defaultdict(list)
item_data = item_data.to_numpy()

for row in item_data:
    item_dict[row[0]] = [
        genre
        for genre, flag in zip(category_col, row[first_genre_idx:])
        if flag == 1
    ]

# res_dict maps a community id to the genre lists of the items assigned to it.
res_dict = defaultdict(list)
for node_id, community_id in partitions.items():
    res_dict[community_id].append(item_dict[node_id])

# Show the genre lists collected for community id 0.
res_dict[0]