In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt
import timeit

In [2]:
users = [
    {"id":0 , "name":"Hero"},
    {"id":1 , "name":"Dunn"},
    {"id":2 , "name":"Sue"},
    {"id":3 , "name":"Chi"},
    {"id":4 , "name":"Thor"},
    {"id":5 , "name":"Clive"},
    {"id":6 , "name":"Hicks"},
    {"id":7 , "name":"Devin"},
    {"id":8 , "name":"Kate"},
    {"id":9 , "name":"Klein"}
]

friendships = [(0,1),(0,2),(1,2),(1,3),(2,3),(3,4),
              (4,5),(5,6),(5,7),(6,8),(7,8),(8,9)]  # note that friendships are not directed, 
                                                    # then (0,1) and (1,0) would be the same
                                                    # thus here only one option is displayed
                                                    # to avoid duplicates

In [7]:
# Adding the 'friends' property to 'users' dictionary
for user in users:
    user['friends']=[]

for i,j in friendships:
    users[i]['friends'].append(users[j]) # Only adding friend ID to avoid a Circular list
    users[j]['friends'].append(users[i]) # friendship tuples are bidirectional

In [32]:
# Recommending friends of a friend

def friend_of_friend_bad(user):
    return [foaf['id']
            for friend in user["friends"]  #for each of users friends
            for foaf in friend['friends']] #return their friends id 

friend_of_friend_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [33]:
# Counting friends in common with helpers

In [66]:
def not_the_same(user,other_user):
    '''check if the recomended friend is the original user'''
    return user['id'] != other_user['id']

def not_friends(user, other_user):
    '''check if the other user is not part of the original user friends'''
    #return all(not_the_same(friend, other_user)
    #          for friend in user['friends'])                                      #v1 original one
    return other_user['id'] not in [friend['id'] for friend in  user['friends']]   #v2
    

In [68]:
#v1
timeit.timeit('not_friends(users[0],users[5])', setup="from __main__ import not_friends, users")

1.2208759680033836

In [69]:
#v2
timeit.timeit('not_friends(users[0],users[5])', setup="from __main__ import not_friends, users")

1.216281090000848

In [81]:
from collections import Counter

In [85]:
def friends_of_friends(user):
    '''return friends of friends and how many of the user friends have befriend them '''
    return Counter(foaf['id']
           for friend in user["friends"]
           for foaf in friend['friends']
           if not_the_same(user, foaf)
           and not_friends(user,foaf))

In [87]:
print(friends_of_friends(users[3]))

Counter({0: 2, 5: 1})


In [92]:
interests = [
    (0,'Hadop'),(0,'Big Data'),(0,'HBase'),(0,'Java'),
    (0,'Spark'),(0,'Storm'),(0,'Cassandra'),
    (1,'NoSQL'),(1,'MongoDB'),(1,'Cassandra'),(1,'HBase'),
    (1,'Postgres'),(2,'Python'),(2,'scikit-learn'),(2,'scipy'),
    (2,'numpy'),(2,'statsmodels'),(2,'pandas'),(3,'R'),(3,'Python'),
    (3,'statistics'),(3,'regression'),(3,'probability'),
    (4,'machine learning'),(4,'regression'),(4,'decision trees'),
    (4,'libsvm'),(5,'Python'),(5,'R'),(5,'Java'),(5,'C++'),
    (5,'Haskell'),(5,'programming languages'),(6,'statistics'),
    (6,'probability'),(6,'mathematics'),(6,'theory'),
    (7,'machine learning'),(7,'scikit-learn'),(7,'Mahout'),
    (7,'neural networks'),(8,'neural networks'),(8,'deep learning'),
    (8,'Big Data'),(8,'artificial intelligence'),(9,'Haddop'),
    (9,'Java'),(9,'MapReduce'),(9,'Big Data')
]

In [101]:
# how to group people by similar interest?
# Measure how simmilar people interests are and retrieve the ones with more simmilar interests that are not friends
for user in users:
    user['interest'] = []
    
for user,interest in interests:
    users[user]['interest'].append(interest)

In [129]:
### original functions by @Santos
def similar_interest_metric(user, otheruser):
    '''intercept user and otheruser interest and count how many interests they share'''
    return len(set(user["interest"]).intersection(otheruser['interest']))

def new_friends_similar_interests(user,users):
    '''calculate similar interest between user and other users. Recommend descending sorted user ids only if they share interests'''
    return sorted(((otheruser['id'],similar_interest_metric(user,otheruser))
                  for otheruser in users 
                  if not_the_same(user,otheruser)
                  and not_friends(user,otheruser)
                  and (similar_interest_metric(user,otheruser) > 0)
                  ), key=(lambda x: x[1]), reverse=True)
### this solution do not scalate. Poor performance.

new_friends_similar_interests(users[0], users)

[(9, 2), (5, 1), (8, 1)]

In [175]:
### original functions by @Santos
### creating an user-interest table
interest_hash = dict(
    zip(
        sorted(set(i_name for user,i_name in interests)), # get interest sorted
        range(0,len(interest_names)) # index for each interest
    ) # zip them
) #create hash table

interest_table = np.zeros((len(users),len(interest_names))) ## n x m shaped matrix, where n = users, m = interests

for user,interest in interests: ## fill interest table
    interest_table[user,interest_hash[interest]]=1 

def get_interests(user_interests, interest_names):
    '''return interest_names using masked user_interests array'''
    return [interest for flag,interest in zip(user_interests,interest_names) if flag]

def compute_interest_similarity(user,other_user):
    '''sum cross product of interest array'''
    return sum(user*other_user)

def new_friends_similar_interests(user,users):
    '''calculate similar interest between user and other users. Recommend descending sorted user ids only if they share interests'''
    return sorted((
        (other_user['id'],compute_interest_similarity(interest_table[user['id']],interest_table[other_user['id']]))
                  for other_user in users 
                  if not_the_same(user,other_user)
                  and not_friends(user,other_user)
                  and (compute_interest_similarity(interest_table[user['id']],interest_table[other_user['id']]) > 0)
                  ), key=(lambda x: x[1]), reverse=True)

new_friends_similar_interests(users[0],users)

# this solution is more efficient. indexed table is easier to work with

In [195]:
### functions by book
def data_scientists_who_like(target_interest):
    return [user_id
            for user_id, user_interest in interests 
            if user_interest == target_interest]

from collections import defaultdict

# keys are interest, values are lists of user_ids with that interest
users_ids_by_interest = defaultdict(list) # default dict creates a dictionary where the value is the indicated object type

for user_id,interest in interests:
    users_ids_by_interest[interest].append(user_id) # when a key is provided a default value is fetched. In this case is a empty list

interests_by_user_id = defaultdict(list)

for user_id,interest in interests:
    interests_by_user_id[user_id].append(interest) # when a key is provided a default value is fetched. In this case is a empty list

def most_common_interest_with(user):
    return Counter(interested_user_id # count number of times other_user_id shows
                   for interest in interests_by_user_id[user['id']] # get all interest of the user
                   for interested_user_id in users_ids_by_interest[interest] # get all users with the same interest
                   if interested_user_id!=user['id']) # verify that the original user is not counted

most_common_interest_with(users[0])

'''
This solution create a double dictionary to iterate by each other.
While index_table is a solution, adding new interest requires non-trivial operations.
This solution is simpler. when a new interest is created a new entry in users_ids_by_interest reflects
which users have that interest and for each of those users a new interest is append in interests_by_user_id.
'''

Counter({8: 1, 9: 2, 1: 2, 5: 1})