In [1]:
from dependencies import *
import os
import igraph
from scipy.sparse import csr_matrix
from collections import Counter
import implicit
import networkx as nx

In [2]:
data = load_tweets()

In [3]:
def create_graph(data, user='user', retweeted_user='in_reply_to_screen_name'):
    graph = {}
    
    for i in data.index:
        if data.loc[i][retweeted_user] != None:
            if data.loc[i][user]['screen_name'] not in list(graph.keys()):
                graph[data.loc[i][user]['screen_name']] = [data.loc[i][retweeted_user]]
            elif data.loc[i][retweeted_user] not in graph[data.loc[i][user]['screen_name']]:
                graph[data.loc[i][user]['screen_name']].append(data.loc[i][retweeted_user])
            else:
                continue
    return graph

In [5]:
graph = create_graph(data)
g = igraph.Graph()
g.add_vertices(list(set(list(graph.keys()) + list([a for value in graph.values() for a in value]))))
g.add_edges([(key, value) for key in graph.keys() for value in graph[key]])

print('Nodes: {}\nEdges: {}'.format(len(g.vs), len(g.es)))

Nodes: 11074
Edges: 7678


## Split trian/test

In [7]:
import random
random.seed(1)

p = 0.2
N = len(g.es)
all_idxs = range(N)
test_ids = np.random.choice(a = all_idxs, size = int(p*N), replace=False)
aux = g.copy()
aux.delete_vertices(test_ids)

print('Nodes: {}\nEdges: {}'.format(len(aux.vs), len(aux.es)))

Nodes: 9539
Edges: 5799


In [10]:
def recomendations(graph):
    """
    starting from a graph this function returns all the nodes at distance 2
    """
    

    all_potential_recommendations = set()
    
    for n1 in graph.vs:
        
        # all the nodes at distance 1
        nodes_at_most_distant_1 = set(graph.neighborhood(vertices = n1, order = 1))

        # all the nodes at distance 1 and distance 2
        nodes_at_most_distant_2 = set(graph.neighborhood(vertices = n1, order = 2))
        
        # only the nodes at distance 2
        only_nodes_at_distance_2 = nodes_at_most_distant_2 - nodes_at_most_distant_1
        
        
        # check if empty set
        if len(only_nodes_at_distance_2) > 0:
            

            for n2 in only_nodes_at_distance_2:
                
                # since n1 is an igraph vertex object, we need to extract the id
                n1_index = n1.index
                
                all_potential_recommendations.add((n1_index, n2))
            
    return all_potential_recommendations

In [33]:
def print_top_k(g, topk, vid=None):
        
    pr = enumerate(g.personalized_pagerank(reset_vertices=vid))
    out = sorted(pr, key=lambda tup: tup[1], reverse=True)[:topk]
    
    return out

In [25]:
ground_truth = set()
trainset = set()
for idx, one_edge in enumerate(g.es):
    n1 = one_edge.source
    n2 = one_edge.target
    
    if idx in test_ids:
        ground_truth.add((n1, n2, 1))
    else:
        trainset.add((n1, n2, 1))

In [26]:
all_potential_recommendations = recomendations(aux)
for rec in all_potential_recommendations:
    n1 = rec[0]
    n2 = rec[1]
    ground_truth.add((n1,n2,0))

# PageRank

In [27]:
topkppr = {}
topk = 1
for node in test_ids:
    topkppr[node] = print_top_k(g, 1, vid=node)[0][0]
ppr_df = pd.DataFrame({'node':list(topkppr.keys()), 'recommendation': list(topkppr.values())})
for i in ppr_df.index:
    if ppr_df.loc[i]['node'] == ppr_df.loc[i]['recommendation']:
        ppr_df = ppr_df.drop(i)

df_test = pd.DataFrame(list(ground_truth), columns=["n1","n2", "edge"])
l = []
for i in df_test.index:
    for j in ppr_df.index:
        if df_test.loc[i]['n1'] == ppr_df.loc[j]['node'] and df_test.loc[i]['n2'] == ppr_df.loc[j]['recommendation']:
            l.append(1)
        else:
            l.append(0)
            break

df_test['rating'] = l

right_predictions = len(df_test[df_test['rating']==df_test['edge']])
right_predictions/len(df_test)

0.9758671749472292

# Alternating Least Squares

In [28]:
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
M = g.get_adjacency().data
M = csr_matrix(M)

model = implicit.als.AlternatingLeastSquares(factors=10, calculate_training_loss=True,  iterations=5)
model.fit(M)

df_test = pd.DataFrame(list(ground_truth), columns=["n1","n2", "edge"])
all_predictions = []

for n1,n2, w in df_test.values:
    array_n1 = model.user_factors[n1,:]
    array_n2 = model.item_factors[n2,:]
    one_p = np.dot(array_n1, array_n2)
    all_predictions.append(one_p)


df_test["rating"] = all_predictions
df_test["rating"] = df_test["rating"].apply(lambda x: round(x))

right_predictions = len(df_test[df_test['rating']==df_test['edge']])
right_predictions/len(df_test)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




0.9774739296178444