In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import networkx.algorithms.bipartite as bipartite

In [2]:
# Load the ratings table; the rendered head shows userId, movieId, rating and timestamp columns.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
rating = pd.read_csv('/home/alainkuiete/Documents/DATA612/rating.csv')
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [3]:
# Tag the ids so user and movie nodes cannot collide once both become graph
# nodes: users turn into '<id>U', movies into '<id>M'. Vectorised string ops
# replace a Python-level loop over every row of the frame.
# Not idempotent: re-running this cell appends the suffix a second time.
rating['userId'] = rating['userId'].astype(str) + 'U'
rating['movieId'] = rating['movieId'].astype(str) + 'M'

In [4]:
# (rows, columns) of the ratings frame
rating.shape

(20000263, 4)

## Create a bipartite graph

### Add weighted edges between opposite nodes

In [5]:
# One edge per (userId, movieId) pair found in the ratings frame; the rating
# value is stored on the edge under the 'rating' attribute.
G = nx.from_pandas_edgelist(
    rating,
    source='userId',
    target='movieId',
    edge_attr='rating',
)

In [6]:
# Connectivity check on the full graph (recorded output: True)
nx.is_connected(G)

True

### Add nodes with node attribute

In [7]:
# Label every node with the side it belongs to: 0 for users, 1 for movies.
user_ids = set(rating['userId'])
movie_ids = set(rating['movieId'])
G.add_nodes_from(user_ids, bipartite=0)
G.add_nodes_from(movie_ids, bipartite=1)

In [8]:
# Verify that the graph is bipartite (recorded output: True)
print(bipartite.is_bipartite(G))

True


In [9]:
#edglist = [(wiki.node1[wiki.index[n]], wiki.node2[wiki.index[n]]) for n in range(0, len(wiki.index))]

In [10]:
#B.add_edges_from(edglist)

### The two different node sets

In [11]:
# Split the nodes of G into its two bipartite sets and report their sizes.
# NOTE(review): nothing in this call says which set is returned first — confirm
# that `user` really holds the 'U' nodes (the recorded counts suggest it does).
user, movie = bipartite.sets(G)
print("Number of user nodes: ", len(user))
print("Number of movie nodes: ", len(movie))


Number of user nodes:  138493
Number of movie nodes:  26744


In [12]:
# Rebuild the two sides from the 'bipartite' node attribute: side 0 is the
# user set, everything else in G is treated as a movie.
user_nodes = {
    node
    for node, attrs in G.nodes(data=True)
    if attrs['bipartite'] == 0
}
movie_nodes = set(G).difference(user_nodes)

In [13]:
#print(user)

In [14]:
# Bipartite density of G, passing the `user` node set as one side
bipartite.density(G, user)

0.0053998478135544505

In [15]:
# Same measure with the `movie` set as the side; the recorded value equals the one for `user`
bipartite.density(G, movie)

0.0053998478135544505

In [None]:
# One-mode projections of the bipartite graph: `u` onto the `user` node set and
# `m` onto the `movie` node set.
u = bipartite.projected_graph(G, user)
# Fixed typo: the function is `projected_graph`; `projected_grapg` raises AttributeError.
m = bipartite.projected_graph(G, movie)

### Components with more than 5 nodes

In [None]:
# Sizes (node counts) of the connected components that have more than 5 nodes.
# NOTE(review): the original referenced `g`, which no cell defines; the user
# projection `u` is presumably the intended graph — confirm.
# Each component is already a set of nodes, so its length can be taken directly
# without building a subgraph view first.
[len(c) for c in nx.connected_components(u) if len(c) > 5]

## Island Method for Users

In [None]:
def trim_edges(g, weight=1):
    """Return a new graph holding only the edges of ``g`` rated above ``weight``.

    Parameters
    ----------
    g : graph
        Graph whose edges each carry a ``'rating'`` attribute.
    weight : number, default 1
        Exclusive lower bound: an edge is kept only when its rating is
        strictly greater than this value.

    Returns
    -------
    nx.Graph
        A fresh graph containing the surviving edges (and so only the nodes
        they touch), each with the attributes it had in ``g``.

    Raises
    ------
    KeyError
        If an edge has no ``'rating'`` attribute.
    """
    g2 = nx.Graph()
    for f, to, edata in g.edges(data=True):
        if edata['rating'] > weight:
            # Unpack the attribute dict so the kept edge still carries
            # rating=<value>; passing weight=edata would store the whole
            # dict as the edge's weight.
            g2.add_edge(f, to, **edata)
    return g2

In [None]:
def island_method(g, iterations=5):
    """Trim ``g`` at a ladder of rating thresholds (the "island method").

    Parameters
    ----------
    g : graph
        Graph whose edges each carry a ``'rating'`` attribute.
    iterations : int, default 5
        Number of threshold levels to aim for between the smallest and the
        largest rating (both truncated to integers).

    Returns
    -------
    list of [threshold, graph]
        One entry per threshold, pairing the integer threshold with the result
        of ``trim_edges(g, threshold)``. Empty when ``g`` has no edges or when
        the truncated minimum and maximum ratings coincide.
    """
    weights = [edata['rating'] for f, to, edata in g.edges(data=True)]
    if not weights:
        # No edges means nothing to threshold; min()/max() would raise here.
        return []
    mn = int(min(weights))
    mx = int(max(weights))
    # compute the size of the step, so we get a reasonable step in iterations
    step = int((mx - mn) / iterations)
    if step == 0:
        # The rating span is narrower than `iterations`; range() rejects a zero
        # step, so fall back to visiting every integer threshold.
        step = 1
    return [[threshold, trim_edges(g, threshold)] for threshold in range(mn, mx, step)]

In [None]:
# Work on the largest connected component of the user projection. Selecting it
# by size avoids depending on the order in which components are yielded.
# NOTE(review): island_method reads each edge's 'rating' attribute — confirm the
# edges of `u` carry one before running this cell.
cc = u.subgraph(max(nx.connected_components(u), key=len))
islands = island_method(cc)
for threshold, trimmed in islands:
    # print the threshold level, size of the graph, and number of connected components
    print(threshold, len(trimmed), nx.number_connected_components(trimmed))

### Distribution of Degree Centrality of the Network u

In [None]:
# Histogram of the degree centrality of every node in `u`.
# NOTE(review): `plt` is not imported in any visible cell — presumably
# matplotlib.pyplot; confirm an import exists.
degree_centrality_values = list(nx.degree_centrality(u).values())
plt.hist(degree_centrality_values)
plt.show()

### Nodes with high degree centrality

In [None]:
def nodes_with_highest_deg_cent(h):
    """Return the set of nodes of ``h`` whose degree centrality is maximal.

    Parameters
    ----------
    h : graph
        Graph handed to ``nx.degree_centrality``.

    Returns
    -------
    set
        Every node tied for the highest degree centrality; an empty set when
        the centrality mapping is empty.
    """
    deg_cent = nx.degree_centrality(h)
    if not deg_cent:
        # No nodes, hence no maximum: max() would raise ValueError.
        return set()
    max_dc = max(deg_cent.values())
    return {node for node, score in deg_cent.items() if score == max_dc}

In [None]:
# Nodes of the user projection `u` tied for the highest degree centrality
top_dc = nodes_with_highest_deg_cent(u)

In [None]:
# Show the set of top degree-centrality nodes
print(top_dc)

### Distribution of betweenness centrality

In [None]:
# Histogram of the betweenness centrality of every node in `u`.
# NOTE(review): `plt` is not imported in any visible cell — presumably
# matplotlib.pyplot; confirm an import exists.
betweenness_values = list(nx.betweenness_centrality(u).values())
plt.hist(betweenness_values)
plt.show()

### Nodes with high betweenness centrality

In [None]:
def nodes_with_highest_bet_cent(h):
    """Return the set of nodes of ``h`` whose betweenness centrality is maximal.

    Parameters
    ----------
    h : graph
        Graph handed to ``nx.betweenness_centrality``.

    Returns
    -------
    set
        Every node tied for the highest betweenness centrality; an empty set
        when the centrality mapping is empty.
    """
    bet_cent = nx.betweenness_centrality(h)
    if not bet_cent:
        # No nodes, hence no maximum: max() would raise ValueError.
        return set()
    max_bc = max(bet_cent.values())
    return {node for node, score in bet_cent.items() if score == max_bc}

In [None]:
# Nodes of the user projection `u` tied for the highest betweenness centrality
top_bet = nodes_with_highest_bet_cent(u)

In [None]:
# Show the set of top betweenness-centrality nodes
print(top_bet)

### Eigenvector Centrality

In [None]:
# Eigenvector centrality for `u`; later cells use the result as a node -> score mapping
eigen_cent = nx.eigenvector_centrality(u)

In [None]:
# Store each node's eigenvector centrality on the graph `u` under the
# 'eigenvector' node attribute
nx.set_node_attributes(u, eigen_cent, 'eigenvector')

In [None]:
# Rank the (node, score) pairs from highest to lowest eigenvector centrality.
# A plain lambda stands in for `itemgetter`, which this notebook never imports
# (the original line raised NameError on a fresh kernel).
sorted_eigenvector = sorted(eigen_cent.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
print("Top 10 nodes by Eigenvector centrality:")
# Fetch the node -> 'eigenvector' attribute mapping once; the original rebuilt
# it from the whole graph on every loop iteration.
eigenvector_by_node = nx.get_node_attributes(u, 'eigenvector')
for e in sorted_eigenvector[:10]:
    # e is a (node, score) pair; the stored attribute is printed next to it
    print(e, eigenvector_by_node[e[0]])

### Users that are totally connected

In [None]:
# Pick a clique of maximum size while streaming over the cliques, instead of
# materialising and sorting every one of them just to read the last element.
# When several cliques tie for the largest size, any one of them may be chosen.
largest_clique = max(nx.find_cliques(u), key=len)

In [None]:
# Sizes (node counts) of the connected components that have more than 1 node.
# NOTE(review): the original referenced `g`, which no cell defines; the user
# projection `u` is presumably the intended graph — confirm.
# Each component is already a set of nodes, so no subgraph view is needed.
[len(c) for c in nx.connected_components(u) if len(c) > 1]