In [29]:
import config
import ssl
import nltk
nltk.download('stopwords')

from datetime import datetime, date
from sklearn import preprocessing

from pymongo import MongoClient

import pandas as pd
import numpy as np

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer


# Import linear kernel
from sklearn.metrics.pairwise import linear_kernel




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dehkharghanielnaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Connection string comes from the local (untracked) config module.
mongo_url = config.MONGO_URL

# `ssl_cert_reqs=ssl.CERT_NONE` was removed in PyMongo 4; `tlsAllowInvalidCertificates=True`
# is the equivalent option and is also accepted by PyMongo 3.9+.
# NOTE(security): this disables TLS certificate verification, so the server's identity is
# not checked. Acceptable for local experiments only; supply a CA bundle for anything else.
client = MongoClient(mongo_url, tlsAllowInvalidCertificates=True)
db = client['vendors_data_db']

# Three different collections (1. Users 2. Products 3. Bookmarks)
collection_users = db.users
collection_product = db.products
collection_bookmarks = db.bookmarks

In [31]:
# Materialise every document of each collection into its own DataFrame
# (users, products, bookmarks - fetched in that order).
users_df, products_df, bookmarks_df = (
    pd.DataFrame(list(source_collection.find()))
    for source_collection in (collection_users, collection_product, collection_bookmarks)
)

In [32]:
# User dataframe
users_df.head(1)


Unnamed: 0,_id,username,password,gender,phone,email,country,dateOfBirth,__v
0,613d0a82caf647c9cf94b860,Emil.Glover11,$2b$10$8q4IbzexLp9O6Sa22t1beON84IQw1Cwbkq95lPO...,Male,452.663.9160 x0505,Lydia.Dach64@gmail.com,Namibia,1984-10-19T00:50:53.006Z,0


In [33]:
# Bookmark dataframe
bookmarks_df.head(1)

Unnamed: 0,_id,userId,active,created,productId,modified
0,613dbdbfeea3b8bb491d0ec9,613d0a82caf647c9cf94b860,True,2021-08-10 00:38:38.185,613515dbae5592de90456bc4,NaT


In [34]:
# Products dataframe: keep a single row per product title.
# NOTE: this head(1) is not the last expression of the cell, so nothing is rendered for it.
products_df.head(1)
# Shape before de-duplication, as recorded by the author: (3967, 26)
# NOTE: drop_duplicates is called without ignore_index, so the surviving rows keep their
# original index labels; after this line the labels are no longer guaranteed to be the
# contiguous positions 0..n-1.
products_df = products_df.drop_duplicates(subset=['title'])

In [35]:
products_df.shape

(3733, 26)

In [36]:
users_df.columns

Index(['_id', 'username', 'password', 'gender', 'phone', 'email', 'country',
       'dateOfBirth', '__v'],
      dtype='object')

In [37]:
products_df.columns

Index(['_id', 'title', 'brand', 'price', 'quantity', 'discount', 'image',
       'vendor', 'unitPrice', 'oldPrice', 'unitPriceQuantity', 'offerDuration',
       'category', 'subcategory', 'badges', 'thumbnailImgs', 'description',
       'properties', 'ingredients', 'preparationInstruction', 'hints',
       'manufacturer', 'nutritionalValues', 'status', 'created', 'modified'],
      dtype='object')

In [38]:
bookmarks_df.columns

Index(['_id', 'userId', 'active', 'created', 'productId', 'modified'], dtype='object')

# Data Cleaning

In [39]:
# Strip the credential / contact columns that are not used as user features
users_info = users_df.drop(['username', 'phone', 'email', 'password', '__v'], axis='columns')

In [40]:
# Rename the Mongo '_id' column of the users frame to 'user_id'.
# NOTE: index=str additionally maps every row label through str(), so from here on the
# index of users_info holds string labels rather than integers.
users_info = users_info.rename(index=str, columns= {'_id':'user_id'})
users_info.head(1)

Unnamed: 0,user_id,gender,country,dateOfBirth
0,613d0a82caf647c9cf94b860,Male,Namibia,1984-10-19T00:50:53.006Z


In [41]:
# Convert birth date to age 
def age(born, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    born : str, datetime.date or datetime.datetime
        Date of birth. Strings must follow the ISO-like layout stored in the
        users collection, e.g. ``"1984-10-19T00:50:53.006Z"``.
    today : datetime.date, optional
        Reference date for the calculation. Defaults to ``date.today()``;
        pass an explicit value to make the result reproducible.

    Returns
    -------
    int
        Number of full years between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` is a string that does not match the expected format.
    """
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(born, datetime):
        born = born.date()
    elif not isinstance(born, date):
        born = datetime.strptime(born, "%Y-%m-%dT%H:%M:%S.%f%z").date()

    if today is None:
        today = date.today()

    # Subtract one year when this year's birthday has not been reached yet.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - int(birthday_pending)
  
# Derive an 'age' column (completed years) from each user's 'dateOfBirth' value.
# NOTE: ages are computed relative to the date on which the cell is executed, so the
# rendered outputs of this notebook change over time.
users_info['age'] = users_info['dateOfBirth'].apply(age)
  
users_info.head(3)

Unnamed: 0,user_id,gender,country,dateOfBirth,age
0,613d0a82caf647c9cf94b860,Male,Namibia,1984-10-19T00:50:53.006Z,36
1,613d0a82caf647c9cf94b861,Male,Uzbekistan,1959-10-07T11:52:15.959Z,61
2,613d0a82caf647c9cf94b863,Male,Cuba,1962-04-03T12:44:20.771Z,59


In [42]:
# Keep only users older than 18 (users aged 18 or younger are removed).
# A boolean mask replaces the in-place, label-based drop: it rebinds the name instead of
# mutating the frame across cells and cannot remove unrelated rows that share a label.
users_info = users_info.loc[users_info['age'] > 18].copy()

In [43]:
users_info.shape

(1000, 5)

In [44]:
# Gender coding => Female: 0, Male: 1, Other: 2

# The mapping is scoped to the 'gender' column so that an identical string appearing in
# any other column (e.g. 'country') is left untouched.
users_info = users_info.replace({'gender': {'Female': 0, 'Male': 1, 'Other': 2}})
users_info.head(3)

Unnamed: 0,user_id,gender,country,dateOfBirth,age
0,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36
1,613d0a82caf647c9cf94b861,1,Uzbekistan,1959-10-07T11:52:15.959Z,61
2,613d0a82caf647c9cf94b863,1,Cuba,1962-04-03T12:44:20.771Z,59


In [45]:
# Countries label encoding: store one integer code per country in a new 'nationality' column
# (the original 'country' column is kept).
# NOTE(review): the codes are identifiers, not quantities (e.g. Namibia -> 144 in the output
# below); a model that treats them as ordered numbers would see a spurious ordering -
# confirm how this column is consumed.

label_encoder = preprocessing.LabelEncoder()
users_info['nationality']= label_encoder.fit_transform(users_info['country'])
users_info.head(3)

Unnamed: 0,user_id,gender,country,dateOfBirth,age,nationality
0,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144
1,613d0a82caf647c9cf94b861,1,Uzbekistan,1959-10-07T11:52:15.959Z,61,229
2,613d0a82caf647c9cf94b863,1,Cuba,1962-04-03T12:44:20.771Z,59,52


In [46]:
# Remove the image / status columns that are not needed for the product features
products_info = products_df.drop(['image', 'thumbnailImgs', 'status'], axis='columns')


In [47]:
# Rename the Mongo '_id' column of the products frame to 'product_id'.

# NOTE: index=str also maps every row label through str(), so products_info is indexed by
# string labels from here on; later cells that read products_info.index receive strings.
products_info = products_info.rename(index=str, columns= {'_id':'product_id'})
# Optional filter to a single subcategory (disabled):
#products_info = products_info[products_info['subcategory'] == 'Baby- & Kindernahrung (350)']
# NOTE: head(3) is not the last expression of the cell, so only the shape below is rendered.
products_info.head(3)
products_info.shape

(3733, 23)

In [48]:
# Keep only the user / product references of each bookmark
bookmarks_info = bookmarks_df.drop(['_id', 'created', 'active', 'modified'], axis='columns')

In [49]:
# Rename the bookmark reference columns to the snake_case names used by the other frames.
# The former '_id' -> 'bookmarks_id' entry was removed: '_id' is dropped in the previous
# cell, so that mapping never matched a column.
# NOTE: index=str also converts the row labels to strings.
bookmarks_info = bookmarks_info.rename(index=str, columns={'userId': 'user_id', 'productId': 'product_id'})
bookmarks_info.head(3)

Unnamed: 0,user_id,product_id
0,613d0a82caf647c9cf94b860,613515dbae5592de90456bc4
1,613d0a82caf647c9cf94b860,613515d0ae5592de9045697e
2,613d0a82caf647c9cf94b860,613515d7ae5592de90456af7


In [50]:
# Cast the join key of both frames to plain strings (needed for the inner join to match)
users_info = users_info.astype({'user_id': str})
bookmarks_info = bookmarks_info.astype({'user_id': str})

# Merging the Users and Bookmarks DataFrames

In [51]:
# Inner join on 'user_id': one row per (user, bookmarked product) pair
df = users_info.merge(bookmarks_info, how='inner', on='user_id')
df.head(5)

Unnamed: 0,user_id,gender,country,dateOfBirth,age,nationality,product_id
0,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144,613515dbae5592de90456bc4
1,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144,613515d0ae5592de9045697e
2,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144,613515d7ae5592de90456af7
3,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144,613515d7ae5592de90456ae3
4,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144,613515dfae5592de90456c59


In [52]:
# Collapse to one row per user: keep the user's attributes and concatenate all of their
# bookmarked product ids into a single comma-separated string.
df_grouped_id = (
    df.groupby('user_id')
      .agg(
          gender=('gender', 'first'),
          age=('age', 'first'),
          nationality=('nationality', 'first'),
          product_id=('product_id', ', '.join),
      )
      .reset_index()
)

In [53]:
# Final DF for using in ML
# Here having all unique users with their personal information and bookmarked items
df_grouped_id.head(3)

Unnamed: 0,user_id,gender,age,nationality,product_id
0,613d0a82caf647c9cf94b860,1,36,144,"613515dbae5592de90456bc4, 613515d0ae5592de9045..."
1,613d0a82caf647c9cf94b861,1,61,229,"613515dcae5592de90456bea, 613515e0ae5592de9045..."
2,613d0a82caf647c9cf94b862,1,50,234,"613515dbae5592de90456bbd, 613515d8ae5592de9045..."


In [54]:
df_grouped_id.shape

(1000, 5)

In [55]:
products_info.shape

(3733, 23)

# Machine Learning:

In [56]:

products_info.head(20)['ingredients']

0     Gemüse 73% (Karotten, Kartoffeln, Erbsen), Was...
1     MILCHZUBEREITUNG (MILCH* 50%, Wasser, ENTRAHMT...
2     Gemüse* 77% (Karotten*, Tomaten*, Kartoffeln* ...
3     Birnenpüree* 64%, Bananenpüree* 28%, Orangensa...
4     Gemüse* 75% (Karotten, Kartoffeln, Erbsen, Gem...
5     Wasser, Gemüse* 37,7% (Karotten 20%, Pastinake...
6     Karotten* 40%, Wasser, Reis* gekocht 13,2%, (W...
7     WEIZENMEHL 65,5%, Zucker, Palmöl, GERSTENMALZE...
8     Früchte* 59% (säurearmer Apfelsaft* aus Apfels...
9     Gemüse* 45% (Kartoffeln* 19%, Karotten*, Kürbi...
10    Gemüse* 57% (Tomaten *28%, Karotten*, Pastinak...
11    Gemüse* 44% (Tomaten*, Karotten*, SELLERIE*, Z...
12                  Äpfel*. *aus biologischer Erzeugung
13    Früchte* 50% (Pfirsiche* 18%, Aprikosen* 12%, ...
14    Gemüse*55% (Kartoffeln* 38%, Karotten* 10%, La...
15    Zutaten: Gemüse* 67% (Kartoffeln* 32%, Karotte...
16    Zutaten: Gefriergetrocknete Mangostücke* (100%...
17    Zutaten: Pürierte Banane (43%), pürierter 

In [57]:
# Replace missing ingredient lists with an empty string so the vectorizer only sees text
products_info['ingredients'] = products_info['ingredients'].fillna('')

# German stop-word list from NLTK
german_stop_words = stopwords.words('german')

# TF-IDF vectorizer that ignores the German stop words
tfidf = TfidfVectorizer(stop_words=german_stop_words)

# Learn the vocabulary and build the TF-IDF matrix (one row per product)
tfidf_matrix = tfidf.fit_transform(products_info['ingredients'])

# Display the matrix dimensions
tfidf_matrix.shape



(3733, 5247)

In [58]:
# Pairwise dot products between all product rows of the TF-IDF matrix. These equal cosine
# similarities provided the rows are L2-normalised (presumably the vectorizer default - confirm).
cosine_sim = linear_kernel(tfidf_matrix)

In [59]:
# Construct a reverse map from product title to the product's index label in products_info.
# NOTE: the stored values are index *labels* (taken from products_info.index), not row
# positions; the two only coincide when the index is the contiguous range 0..n-1.
# The trailing drop_duplicates() operates on the Series values (the labels), not on titles.
indices = pd.Series(products_info.index, index=products_info['title']).drop_duplicates()
# print(indices.head(5))

In [60]:
# Function that takes in product title as input and outputs most similar product
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 3):
    """Return the titles of the products most similar to the given product.

    Parameters
    ----------
    title : str
        Title of the query product; must be a key of the module-level ``indices`` map.
    cosine_sim : array-like, optional
        Square pairwise similarity matrix whose rows/columns follow the row
        order of ``products_info``. Defaults to the matrix computed above.
    top_n : int, optional
        Number of recommendations to return (default 3, the original behaviour).

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar products, most similar first,
        indexed by their ``products_info`` row labels.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # `indices` stores index labels, while the similarity matrix is addressed by row
    # position. Translate the label explicitly instead of casting it to int: after
    # de-duplication the labels are no longer contiguous, so label != position.
    label = indices[title]
    idx = products_info.index.get_loc(label)

    # Similarity of the query product to every product
    sim_scores = np.asarray(cosine_sim[idx], dtype=float)

    # Positions sorted by descending similarity; the stable sort keeps ties in row order
    ranked = np.argsort(-sim_scores, kind='stable')

    # Exclude the query product itself explicitly rather than assuming it sorts first
    # (that assumption fails on ties and for products with an all-zero vector).
    product_indices = ranked[ranked != idx][:top_n]

    # Return the titles of the most similar products
    return products_info['title'].iloc[product_indices]


In [61]:
 
get_recommendations('Kinderkeks 180g')

112                  Milchbrei Grieß ab dem 6.Monat 400g
150       Milchbrei Gute Nacht Grieß ab dem 4.Monat 400g
232    Milchbrei Apfel/Banane/Joghurt ab dem 6.Monat ...
Name: title, dtype: object

# Pushing the recommended items into MongoDB

In [None]:
# Pushing in MongoDB: one document per product holding the ids of its recommended products.
# NOTE: re-running this cell inserts the documents again; clear the collection first if
# duplicates are not wanted.
collection = db.content_base_recommendations

recommendation_docs = []
for _, row in products_info.iterrows():
    recommended_titles = get_recommendations(row['title'])

    # The returned Series keeps the row labels of products_info, so the product ids can be
    # looked up by label directly instead of re-scanning the frame by title (and without
    # relying on integer-key positional access on a string-labelled index).
    recommended_ids = products_info.loc[recommended_titles.index, 'product_id']

    recommendation_docs.append({
        'productId': str(row['product_id']),
        'recommendedProducts': [str(product_id) for product_id in recommended_ids],
    })

# A single bulk insert instead of one round trip per product; insert_many rejects an
# empty list, hence the guard.
if recommendation_docs:
    collection.insert_many(recommendation_docs)

