In [3]:
# Standard library
import ssl

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Local module holding the connection settings (MONGO_URL is read from it)
import config

# Download the NLTK stop-word corpus used for the German stop-word list.
nltk.download('stopwords')

# Do not truncate rows/columns when a frame is displayed. Because nothing is
# truncated, always preview frames with .head() rather than displaying them whole.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dehkharghanielnaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Connect to MongoDB using the URL kept in the local config module.
mongo_url = config.MONGO_URL

# `ssl_cert_reqs` was removed in PyMongo 4. `tlsAllowInvalidCertificates=True`
# is the documented equivalent of `ssl_cert_reqs=ssl.CERT_NONE` and is accepted
# by PyMongo >= 3.9.
# WARNING(review): this disables TLS certificate verification. Outside local
# experiments, verify the server certificate instead (e.g. with tlsCAFile=...).
client = MongoClient(mongo_url, tlsAllowInvalidCertificates=True)
db = client['vendors_data_db']

# Three different collections (1. Users 2. Products 3. Bookmarks)
collection_users = db.users_col
collection_product = db.products
collection_bookmarks = db.bookmarks_col

In [5]:
# Materialise every document of each collection into its own DataFrame
# (users, products, bookmarks — in that order).
users_df, products_df, bookmarks_df = (
    pd.DataFrame(list(mongo_collection.find()))
    for mongo_collection in (collection_users, collection_product, collection_bookmarks)
)

In [6]:
# User dataframe: preview the first row of the raw users frame.
# NOTE(review): the rendered output of this cell shows the user's name, phone,
# e-mail and password hash; clear the output before sharing or committing.
users_df.head(1)


Unnamed: 0,_id,firstName,lastName,gender,phone,email,password,countryOfOrigin,dateOfBirth
0,6133af8bc4a7e1daa5fd78e0,Lisa,Roach,Female,700.110.0724x387,Lisa.Roach@gmail.com,$2b$10$UXmmmXeFmsH36HwkwEEsFeCgTAdMlrcY2o4PeNK...,Georgia,1995-04-26


In [7]:
# Bookmark dataframe: preview the first row. Each bookmark row carries a
# userId and a productId column (see the output below).
bookmarks_df.head(1)

Unnamed: 0,_id,userId,active,created,productId,modified
0,6135b4cbf810533516c4639f,6133af8dc4a7e1daa5fd7923,True,2021-09-06 08:27:23,6132464b75e744d165bf2b08,


In [8]:
# Products dataframe
# Keep only the first row for every title; before this step the raw frame had
# shape (3967, 26). The original row labels are kept, so they have gaps afterwards.
products_df = products_df.drop_duplicates(subset=['title'])

# Preview must be the last expression of the cell to be displayed.
products_df.head(1)

In [9]:
# Column names of the raw users frame.
users_df.columns

Index(['_id', 'firstName', 'lastName', 'gender', 'phone', 'email', 'password',
       'countryOfOrigin', 'dateOfBirth'],
      dtype='object')

In [10]:
# Column names of the (title-de-duplicated) products frame.
products_df.columns

Index(['_id', 'title', 'brand', 'price', 'quantity', 'discount', 'image',
       'vendor', 'unitPrice', 'oldPrice', 'unitPriceQuantity', 'offerDuration',
       'category', 'subcategory', 'badges', 'thumbnailImgs', 'description',
       'properties', 'ingredients', 'preparationInstruction', 'hints',
       'manufacturer', 'nutritionalValues', 'status', 'created', 'modified'],
      dtype='object')

In [11]:
# Column names of the raw bookmarks frame.
bookmarks_df.columns

Index(['_id', 'userId', 'active', 'created', 'productId', 'modified'], dtype='object')

# Data Cleaning

In [12]:
# Remove the identifying columns that are not needed downstream
users_info = users_df.drop(
    ['firstName', 'lastName', 'phone', 'email', 'password'],
    axis='columns',
)

In [13]:
# Rename the Mongo `_id` column to `user_id` so it can serve as the join key.
# `index=str` additionally applies `str` to every row label.
users_info = users_info.rename(index=str, columns= {'_id':'user_id'})
users_info.head(1)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth
0,6133af8bc4a7e1daa5fd78e0,Female,Georgia,1995-04-26


In [14]:
# Remove the product columns that are not needed downstream
products_info = products_df.drop(
    ['image', 'thumbnailImgs', 'status'],
    axis='columns',
)


In [15]:
# Rename the Mongo `_id` column to `product_id`.
# NOTE: `index=str` also turns the row labels into strings. They still carry the
# gaps left by the title de-duplication, so a label is NOT a row position.
products_info = products_info.rename(index=str, columns={'_id': 'product_id'})

# Number of (rows, columns) left after cleaning.
products_info.shape

(3733, 23)

In [16]:
# Keep only the user/product reference columns of each bookmark
bookmarks_info = bookmarks_df.drop(
    ['_id', 'created', 'modified', 'active'],
    axis='columns',
)

In [17]:
# Rename the reference columns to the snake_case names used by the other frames.
# (`_id` was already dropped from bookmarks_info, so it needs no mapping here.)
# `index=str` additionally applies `str` to every row label.
bookmarks_info = bookmarks_info.rename(
    index=str,
    columns={'userId': 'user_id', 'productId': 'product_id'},
)
bookmarks_info.head(3)

Unnamed: 0,user_id,product_id
0,6133af8dc4a7e1daa5fd7923,6132464b75e744d165bf2b08
1,6133af8dc4a7e1daa5fd7928,6132465275e744d165bf2ca1
2,6133af8ec4a7e1daa5fd794b,6132466275e744d165bf2faa


In [18]:
# The join key must have the same dtype on both sides of the inner join,
# so cast `user_id` to str in the bookmarks frame and in the users frame.
for frame in (bookmarks_info, users_info):
    frame['user_id'] = frame['user_id'].astype(str)

# Merging the Users and Bookmarks DataFrames

In [19]:
# Inner join on user_id: one row per (user, bookmarked product) pair
df = users_info.merge(bookmarks_info, how='inner', on='user_id')
df.head(7)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,product_id
0,6133af8bc4a7e1daa5fd78e1,Female,Russian Federation,2019-03-19,6132462875e744d165bf245d
1,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,6132462d75e744d165bf255f
2,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,6132462375e744d165bf236c
3,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,6132466975e744d165bf30ef
4,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,6132465f75e744d165bf2f25
5,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,6132462c75e744d165bf251e
6,6133af8bc4a7e1daa5fd78e3,Male,French Guiana,1994-04-29,6132463275e744d165bf264f


In [20]:
# Collapse each user's bookmarks into one comma-separated string of product ids
df_grouped_id = (
    df.groupby('user_id')['product_id']
    .agg(', '.join)
    .reset_index()
)

In [21]:
# Final frame of the user/bookmark preparation.
# One row per user_id, with that user's bookmarked product ids joined into a
# single comma-separated string. Only user_id and product_id are present;
# gender, country of origin and date of birth are not carried over.
df_grouped_id.head(3)

Unnamed: 0,user_id,product_id
0,6133af8bc4a7e1daa5fd78e1,6132462875e744d165bf245d
1,6133af8bc4a7e1daa5fd78e2,"6132462d75e744d165bf255f, 6132462375e744d165bf..."
2,6133af8bc4a7e1daa5fd78e3,"6132463275e744d165bf264f, 6132464c75e744d165bf..."


In [None]:
# (rows, columns) of the cleaned products frame.
products_info.shape

# Machine Learning:

In [22]:

# Peek at the ingredient texts of the first 20 products
products_info['ingredients'].head(20)

0     Gemüse 73% (Karotten, Kartoffeln, Erbsen), Was...
1     MILCHZUBEREITUNG (MILCH* 50%, Wasser, ENTRAHMT...
2     Gemüse* 77% (Karotten*, Tomaten*, Kartoffeln* ...
3     Birnenpüree* 64%, Bananenpüree* 28%, Orangensa...
4     Gemüse* 75% (Karotten, Kartoffeln, Erbsen, Gem...
5     Wasser, Gemüse* 37,7% (Karotten 20%, Pastinake...
6     Karotten* 40%, Wasser, Reis* gekocht 13,2%, (W...
7     WEIZENMEHL 65,5%, Zucker, Palmöl, GERSTENMALZE...
8     Früchte* 59% (säurearmer Apfelsaft* aus Apfels...
9     Gemüse* 45% (Kartoffeln* 19%, Karotten*, Kürbi...
10    Gemüse* 57% (Tomaten *28%, Karotten*, Pastinak...
11    Gemüse* 44% (Tomaten*, Karotten*, SELLERIE*, Z...
12                  Äpfel*. *aus biologischer Erzeugung
13    Früchte* 50% (Pfirsiche* 18%, Aprikosen* 12%, ...
14    Gemüse*55% (Kartoffeln* 38%, Karotten* 10%, La...
15    Zutaten: Gemüse* 67% (Kartoffeln* 32%, Karotte...
16    Zutaten: Gefriergetrocknete Mangostücke* (100%...
17    Zutaten: Pürierte Banane (43%), pürierter 

In [23]:
# NLTK's German stop-word list; these words are excluded from the vocabulary
german_stop_words = stopwords.words('german')

# Replace NaN with an empty string so every product has an ingredient text
products_info['ingredients'] = products_info['ingredients'].fillna('')

# Build the TF-IDF vectorizer (German stop words removed), then fit it on the
# ingredient texts and transform them in a single step
tfidf = TfidfVectorizer(stop_words=german_stop_words)
tfidf_matrix = tfidf.fit_transform(products_info['ingredients'])

# Shape of the matrix: (number of products, number of distinct terms)
tfidf_matrix.shape



(3733, 5247)

In [24]:
# Pairwise similarity of every product with every other product.
# With Y omitted, linear_kernel compares the matrix with itself; on the
# L2-normalised rows TfidfVectorizer produces by default, the dot product
# equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix)

In [25]:
# Construct a reverse map from product title to ROW POSITION in products_info.
# Positions (0..n-1) are what cosine_sim and `.iloc` expect. The frame's own
# index labels must not be used here: they are strings and have gaps after
# the title de-duplication, so they do not line up with the matrix rows.
indices = pd.Series(range(len(products_info)), index=products_info['title'])

# Keep the first position for any title that still occurs more than once,
# so that `indices[title]` always yields a single integer.
indices = indices[~indices.index.duplicated(keep='first')]

In [26]:
# Function that takes a product title as input and returns the most similar products
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 3):
    """Return the titles of the products most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact product title as stored in ``products_info['title']``.
    cosine_sim : array-like
        Pairwise similarity matrix whose rows and columns follow the row order
        of ``products_info``. Defaults to the matrix that exists when this
        function is defined.
    top_n : int, default 3
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` product titles, most similar first, carrying the index
        labels of ``products_info``.

    Raises
    ------
    KeyError
        If no product has the given title.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Resolve the title to a row POSITION directly from products_info, so the
    # position always matches the row order of cosine_sim (the frame's index
    # labels are strings with gaps and cannot be used as positions).
    matches = np.flatnonzero((products_info['title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # Similarity of the requested product with every product
    scores = np.asarray(cosine_sim[idx])

    # Rank all products by descending similarity; the stable sort keeps the
    # original row order among ties
    ranking = np.argsort(-scores, kind='stable')

    # Exclude the product itself explicitly: with tied scores it is not
    # guaranteed to be ranked first
    product_indices = ranking[ranking != idx][:top_n]

    # Return the titles of the top_n most similar products
    return products_info['title'].iloc[product_indices]


In [27]:
 
# Example: the products recommended for one title.
get_recommendations('Kinderkeks 180g')

112                  Milchbrei Grieß ab dem 6.Monat 400g
150       Milchbrei Gute Nacht Grieß ab dem 4.Monat 400g
232    Milchbrei Apfel/Banane/Joghurt ab dem 6.Monat ...
Name: title, dtype: object

# Pushing the recommended items into MongoDB

In [None]:
# Target collection for the content-based recommendations
collection = db.content_base_recommendations

for _, row in products_info.iterrows():
    recommended_products = get_recommendations(row['title'])

    # The returned Series carries the index labels of products_info, so the
    # matching ids can be read with one label-based lookup. This replaces a
    # full-frame scan per title and the deprecated positional `Series[0]` access.
    recommended_ids = [
        str(pid)
        for pid in products_info.loc[recommended_products.index, 'product_id']
    ]

    product_id = str(row['product_id'])
    mydict = {'productId': product_id, 'recommendedProducts': recommended_ids}

    # Upsert keyed on productId: re-running this cell refreshes the existing
    # document instead of inserting a duplicate.
    collection.replace_one({'productId': product_id}, mydict, upsert=True)
