# Connected Recommenders

In [1]:
import pandas as pd
import pymongo

def _connect_mongo():
    """Open a MongoDB client and return the project database handle.

    Reads four lines from ``credentials.txt`` in the working directory
    (user name, password, host URL, database name), builds a ``mongodb://``
    URI from them and stores the resulting client in the module-level
    ``conn`` so that callers can close it when they are done.

    Returns:
        The ``agile_data_science_group_3`` database of the new client.

    Raises:
        pymongo.errors.ConnectionFailure: reported on stdout and then
            re-raised, so callers never continue with a missing or stale
            ``conn``.
    """
    global conn
    try:
        # Expected URI shape:
        # mongodb://<dbuser>:<dbpassword>@<mlab_url>.mlab.com:57066/<database_name>
        with open("credentials.txt", 'r') as f:
            name, password, url, dbname = f.read().splitlines()
        conn = pymongo.MongoClient("mongodb://{}:{}@{}/{}".format(name, password, url, dbname))

        print ("Connected successfully to", dbname)

    except pymongo.errors.ConnectionFailure as e:
        print ("Could not connect to MongoDB: %s" % e)
        # Falling through here would hit `conn` while it is either undefined
        # (first call) or left over from an earlier call, so propagate instead.
        raise

    print(conn)
    # The database name is fixed here; it is not taken from credentials.txt.
    db = conn["agile_data_science_group_3"]

    return db


def read_collection_as_df(collection_name, no_id=True):
    """Load every document of a MongoDB collection into a DataFrame.

    Args:
        collection_name: name of the collection to read from the database
            returned by ``_connect_mongo``.
        no_id: accepted for backward compatibility but currently ignored;
            no column is dropped from the result whatever its value.

    Returns:
        pandas.DataFrame built from the list of all documents returned by
        an unfiltered ``find()`` on the collection.
    """
    # Connect to MongoDB (this also sets the module-level ``conn`` client).
    db = _connect_mongo()

    try:
        # Exhaust the cursor into a list while the connection is still open.
        documents = list(db[collection_name].find())
    finally:
        # Close the client even when the query raises, so it is not leaked.
        conn.close()

    return pd.DataFrame(documents)

def read_collection_as_dict(collection_name):
    """Merge every document of a MongoDB collection into a single dict.

    All key/value pairs of all documents are copied into one dictionary,
    skipping the ``_id`` field. When several documents share a key, the
    value from the document read last wins.

    Args:
        collection_name: name of the collection to read from the database
            returned by ``_connect_mongo``.

    Returns:
        dict with the merged fields of the collection's documents.
    """
    # Connect to MongoDB (this also sets the module-level ``conn`` client).
    db = _connect_mongo()

    merged = dict()
    try:
        for document in db[collection_name].find():
            for key, value in document.items():
                if key != '_id':
                    merged[key] = value
    finally:
        # Close the client even when reading fails, so it is not leaked.
        conn.close()

    return merged


def read_collection_objid_as_dict(collection_name):
    """
    Read a Mongo collection into a ``{recipe_id: ingredients}`` dict.

    Collection format: {'_id': ObjectId('5a29a690166d3310bc82f132'), 'recipe_id': ObjectId('...'), 'ingredients': ...}

    Args:
        collection_name: name of the collection to read from the database
            returned by ``_connect_mongo``.

    Returns:
        dict mapping each document's ``recipe_id`` value to its
        ``ingredients`` value. If a ``recipe_id`` occurs more than once,
        the document read last wins.

    Raises:
        KeyError: if a document has no ``recipe_id`` or ``ingredients`` field.
    """
    # Connect to MongoDB (this also sets the module-level ``conn`` client).
    db = _connect_mongo()

    try:
        return {d["recipe_id"]: d["ingredients"] for d in db[collection_name].find()}
    finally:
        # Close the client even when reading fails, so it is not leaked.
        conn.close()

### Search Engine

In [2]:
import operator
def search_engine(recipes_input, recipes_dict=read_collection_objid_as_dict("RecIng")):
    """
    Rank recipes by how well they fit the ingredients the user already has.

    Every recipe gets the score
        (number of its ingredients the user has) / (number of its ingredients the user lacks + 1)
    and recipes are returned from highest to lowest score.

    Input:
    - recipes_input: ingredients of the user as a list. Example: ["ingredient1", "ingredient2", ...]
    - recipes_dict: dictionary of recipes. Example: {recipe1: [ingredients], recipe2: [ingredients], ...}
      The default value is read from the "RecIng" Mongo collection once, when the function is defined.

    Output (two lists in the same order):
    - recommendations as list. Example: [recipe134, recipe43234, recipe544, ...]
    - missing ingredients as list of sets. Example: [{}, {ingredient43}, {ingredient32, ingredient45}, ...]
    """
    available = set(recipes_input)

    missing = {}
    score = {}
    for recipe, ingredients in recipes_dict.items():
        needed = set(ingredients)
        lacking = needed - available      # in the recipe, not owned by the user
        shared = available & needed       # owned by the user and used by the recipe

        missing[recipe] = lacking
        score[recipe] = float(len(shared)) / (len(lacking) + 1)

    # Sorting the dict keys by their score keeps ties in the dict's own order.
    ranked = sorted(score, key=score.get, reverse=True)
    lacking_per_recipe = [missing[recipe] for recipe in ranked]

    return ranked, lacking_per_recipe

('Connected successfully to', 'agile_data_science_group_3')
MongoClient(host=['ds233895.mlab.com:33895'], document_class=dict, tz_aware=False, connect=True)


In [3]:
# Ingredients the user has at hand.
grocery = ["egg", "chicken"]

# Rank the stored recipes against the grocery list; the second result holds
# the missing ingredients of each recipe, in the same order as the ranking.
search_list, missings_ingredients = search_engine(recipes_input=grocery)

In [4]:
# Ranked recipe ids: the first value returned by search_engine.
search_list

[ObjectId('59fb0d77eee3642f3c068478'),
 ObjectId('59fb0d77eee3642f3c06881b'),
 ObjectId('59fb0d77eee3642f3c0689cb'),
 ObjectId('59fb0d77eee3642f3c0685e5'),
 ObjectId('59fb0d77eee3642f3c06855c'),
 ObjectId('59fb0d77eee3642f3c068628'),
 ObjectId('59fb0d77eee3642f3c0688f0'),
 ObjectId('59fb0d77eee3642f3c06891d'),
 ObjectId('59fb0d77eee3642f3c068b78'),
 ObjectId('59fb0d77eee3642f3c0685d8'),
 ObjectId('59fb0d77eee3642f3c068464'),
 ObjectId('59fb0d77eee3642f3c0685d3'),
 ObjectId('59fb0d77eee3642f3c0686b1'),
 ObjectId('59fb0d77eee3642f3c068745'),
 ObjectId('59fb0d77eee3642f3c068862'),
 ObjectId('59fb0d77eee3642f3c068a9e'),
 ObjectId('59fb0d77eee3642f3c068c1a'),
 ObjectId('59fb0d77eee3642f3c068d9a'),
 ObjectId('59fb0d77eee3642f3c068d69'),
 ObjectId('59fb0d77eee3642f3c068470'),
 ObjectId('59fb0d77eee3642f3c068586'),
 ObjectId('59fb0d77eee3642f3c06891c'),
 ObjectId('59fb0d77eee3642f3c068a2f'),
 ObjectId('59fb0d77eee3642f3c068cfd'),
 ObjectId('59fb0d77eee3642f3c06877f'),
 ObjectId('59fb0d77eee364

In [5]:
# Sets of missing ingredients: the second value returned by search_engine,
# ordered like the ranked recipe list.
missings_ingredients

[{u'sugar'},
 {u'sugar'},
 {u'butter', u'potatoe', u'stock'},
 {u'breadcrumb',
  u'breast',
  u'butter',
  u'fillet',
  u'flour',
  u'lemon',
  u'parmesan',
  u'sprig',
  u'thyme'},
 {u'cauliflower', u'floret', u'flour', u'milk', u'oil'},
 {u'chocolate', u'cream', u'dark', u'free-range', u'yolk'},
 {u'flour', u'frozen', u'juice', u'roast', u'stock'},
 {u'cherrie', u'cream', u'double', u'strawberrie', u'sugar'},
 {u'bread', u'butter', u'milk', u'serve', u'wholegrain'},
 {u'bag',
  u'bistro',
  u'case',
  u'cream',
  u'double',
  u'leftover',
  u'pastry',
  u'potato',
  u'salad',
  u'savoury',
  u'sweetcorn',
  u'wedge'},
 {u'banana', u'blueberrie', u'pack', u'raspberrie', u'ripe', u'strawberrie'},
 {u'asda',
  u'bag',
  u'dry',
  u'grower\u2019',
  u'hoisin',
  u'noodle',
  u'oil',
  u'pack',
  u'sauce',
  u'stir-fry',
  u'sunday',
  u'vegetable',
  u'weight'},
 {u'courgette', u'flour', u'lengthwise', u'milk', u'polenta', u'stick'},
 {u'flour', u'free-range', u'milk', u'oil', u'pinch', 

In [6]:
# Number of missing ingredients per recommended recipe, in ranking order.
# list(...) makes the counts display under Python 3 as well, where map() is lazy.
list(map(len, missings_ingredients))

[1,
 1,
 3,
 9,
 5,
 5,
 5,
 5,
 5,
 12,
 6,
 13,
 6,
 6,
 6,
 6,
 6,
 13,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 16,
 16,
 16,
 8,
 17,
 17,
 17,
 8,
 8,
 8,
 8,
 8,
 17,
 8,
 8,
 8,
 8,
 8,
 18,
 9,
 9,
 9,
 19,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 20,
 10,
 10,
 10,
 10,
 21,
 10,
 10,
 10,
 21,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 22,
 22,
 22,
 22,
 22,
 22,
 11,
 11,
 11,
 11,
 23,
 11,
 11,
 23,
 23,
 11,
 11,
 11,
 11,
 11,
 11,
 23,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 24,
 24,
 24,
 24,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 25,
 25,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 26,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 27,
 13,
 27,
 13,
 27,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 1

### Best Rated

In [7]:
import numpy as np
import pandas as pd
import csv
import bson
from bson.objectid import ObjectId

In [8]:
def best_rated(data=read_collection_as_df("ratings"), n=10):
    """
    Return the top n recipes by highest mean rating. Ties are broken by the
    lowest standard deviation of the rating.

    Args:
        data: DataFrame with a "recipe_id" and a "rating" column. The default
            is the "ratings" collection, read once when the function is defined.
            The frame is not modified.
        n: maximum number of recipe ids to return.

    Returns:
        list of ObjectId, best rated recipe first.
    """
    # Convert into a new Series rather than overwriting the caller's column.
    ratings = data["rating"].astype(float)

    # Mean and standard deviation of the ratings of every recipe.
    recipe_rating = ratings.groupby(data["recipe_id"]).agg(["mean", "std"])

    # DataFrame.sort no longer exists in pandas; sort_values replaces it.
    recs = recipe_rating.sort_values(["mean", "std"], ascending=[False, True])

    return [ObjectId(recipe_id) for recipe_id in recs.index[:n]]

('Connected successfully to', 'agile_data_science_group_3')
MongoClient(host=['ds233895.mlab.com:33895'], document_class=dict, tz_aware=False, connect=True)


In [9]:
best_rated()



[ObjectId('5a073c312cfaa84d14993958'),
 ObjectId('5a073c312cfaa84d14993a58'),
 ObjectId('5a073c312cfaa84d14994d7d'),
 ObjectId('5a073c312cfaa84d14994fa8'),
 ObjectId('5a073c312cfaa84d149951c8'),
 ObjectId('5a073c312cfaa84d14995366'),
 ObjectId('5a073c312cfaa84d14995758'),
 ObjectId('5a073c312cfaa84d14996063'),
 ObjectId('59fb0d77eee3642f3c068685'),
 ObjectId('5a073c312cfaa84d14993933')]

### Collaborative filtering

Collaborative filtering uses the graphlab library because it is faster than any manual implementation. It uses its own internal data structures in order to avoid sparse matrices and therefore provides fast recommendations.

This library works on Python 2.7; create a new environment if you are working with another version.

In [None]:
# Create a conda environment named gl-env with Python 2.7 and Anaconda 4.0.0.
conda create -n gl-env python=2.7 anaconda=4.0.0
# Activate the gl-env environment.
source activate gl-env

You can install this package using my account. You just need the following pip install:

In [None]:
# NOTE(review): this URL embeds an account e-mail address and a license key in
# plain text; consider supplying them from outside the notebook instead.
pip install --upgrade --no-cache-dir https://get.graphlab.com/GraphLab-Create/2.1/gonzaloespinosaduelo@gmail.com/47BC-178E-7242-F049-6C93-DD34-BC02-7E32/GraphLab-Create-License.tar.gz

In [10]:
import numpy as np
import pandas as pd
import csv
import graphlab as gl
import bson
from bson.objectid import ObjectId

In [11]:
def collaborative_filtering(user_id, data=read_collection_as_df("ratings"), n=10):
    """
    Recommend recipes to a user with GraphLab's item similarity recommender,
    using Pearson similarity on the ratings.

    Args:
        user_id: id of the user to recommend for; it is matched through its
            str() form.
        data: DataFrame with "user_id", "recipe_id" and "rating" columns. The
            default is the "ratings" collection, read once when the function
            is defined. The frame is not modified.
        n: number of recommendations requested from the model.

    Returns:
        list of ObjectId of the recommended recipes. Entries whose string
        form is 'nan' are dropped, so the list can be shorter than n.
    """
    # Keep only user, recipe and rating, with both ids as strings. A new frame
    # is built so that no columns are added to the caller's DataFrame.
    ratings = pd.DataFrame({
        "user_name": data["user_id"].map(str),
        "recipe_name": data["recipe_id"].map(str),
        "rating": data["rating"].astype(float),
    })[["user_name", "recipe_name", "rating"]]

    sf = gl.SFrame(ratings)

    # Named `model` so that it does not shadow this function's own name.
    model = gl.item_similarity_recommender.create(sf,
                                                  user_id='user_name',
                                                  item_id='recipe_name',
                                                  target='rating',
                                                  similarity_type='pearson', verbose=False)

    # Top n collaborative filtering recommendations for this single user.
    recs_itemitem_pearson = model.recommend(users=[str(user_id)], k=n)

    recs = [x for x in list(recs_itemitem_pearson["recipe_name"]) if str(x) != 'nan']

    return [ObjectId(recipe_name) for recipe_name in recs]

('Connected successfully to', 'agile_data_science_group_3')
MongoClient(host=['ds233895.mlab.com:33895'], document_class=dict, tz_aware=False, connect=True)


In [12]:
# Ask for 10 recommendations for one example user and print the recipe ids.
recs = collaborative_filtering(ObjectId("5a1aa2cb2cfaa80d6ff251f7"), n=10)
print(recs)

This non-commercial license of GraphLab Create for academic use is assigned to gonzaloespinosaduelo@gmail.com and will expire on November 08, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1513177137.log


[ObjectId('5a073c312cfaa84d14996063'), ObjectId('5a073c312cfaa84d14994d7d'), ObjectId('5a073c312cfaa84d14993958'), ObjectId('5a073c312cfaa84d1499560f'), ObjectId('5a073c312cfaa84d14996a77'), ObjectId('5a073c312cfaa84d14995758'), ObjectId('5a073c312cfaa84d14996080'), ObjectId('5a073c312cfaa84d149953f0'), ObjectId('5a073c312cfaa84d14995366')]
