In [28]:
import config
import ssl

from pymongo import MongoClient
from dateutil import parser

import pandas as pd
import numpy as np
from pandas import DataFrame

from datetime import datetime, date

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Import label encoder 
from sklearn import preprocessing


# Render DataFrames without truncating rows or columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Importing 3 Collections (Users - Bookmarks - Products)

In [29]:
# The connection string is read from the local `config` module rather than written in the notebook.
mongo_url = config.MONGO_URL
# NOTE(review): ssl.CERT_NONE presumably turns off server-certificate verification for this
# connection — confirm this is intended outside local development.
client = MongoClient(mongo_url, ssl_cert_reqs=ssl.CERT_NONE)
db = client['vendors_data_db']

# Three different collections (1. Users 2. Products 3. Bookmarks)
collection_users = db.users
collection_product = db.products
collection_bookmarks = db.bookmarks

In [30]:
# Read every document of each collection and materialise it as a DataFrame
# (order: users, products, bookmarks).
users_df, products_df, bookmarks_df = [
    pd.DataFrame(list(collection.find()))
    for collection in (collection_users, collection_product, collection_bookmarks)
]

In [31]:
# User dataframe
# NOTE(review): the rendered row exposes the password hash, phone and email columns —
# consider clearing this output (or previewing non-sensitive columns only) before sharing.
users_df.head(1)

Unnamed: 0,_id,username,password,gender,phone,email,country,dateOfBirth,__v
0,613d0a82caf647c9cf94b860,Emil.Glover11,$2b$10$8q4IbzexLp9O6Sa22t1beON84IQw1Cwbkq95lPO...,Male,452.663.9160 x0505,Lydia.Dach64@gmail.com,Namibia,1984-10-19T00:50:53.006Z,0


In [32]:
# Bookmark dataframe
bookmarks_df.head(1)

Unnamed: 0,_id,userId,active,created,productId,modified
0,613dbdbfeea3b8bb491d0ec9,613d0a82caf647c9cf94b860,True,2021-08-10 00:38:38.185,613515dbae5592de90456bc4,NaT


In [33]:
# Products dataframe
products_df.head(1)

Unnamed: 0,_id,title,brand,price,quantity,discount,image,vendor,unitPrice,oldPrice,unitPriceQuantity,offerDuration,category,subcategory,badges,thumbnailImgs,description,properties,ingredients,preparationInstruction,hints,manufacturer,nutritionalValues,status,created,modified
0,613515cdae5592de9045691b,Feines Gemüse-Allerlei 190g,BEBIVITA,0.65,190g Glas,0.0,https://media.kaufland.com/images/PPIM/AP_Prod...,Kaufland,0.65,0.0,"100 g = 0,35 €",,Baby & Kind,Baby- & Kindernahrung (350),,[https://media.kaufland.com/images/PPIM/AP_Pro...,"Beste Zutaten aus der Natur - Karotten, Kartof...",Altersempfehlung:ab 6. MonatFüllmenge:190.0Fre...,"Gemüse 73% (Karotten, Kartoffeln, Erbsen), Was...",,Bei Flüssigkeitsabsatz bitte umrühren. Benötig...,Bebivita GmbH,,True,1630594239201,2021-09-03 18:00:51.834


In [34]:
users_df.columns

Index(['_id', 'username', 'password', 'gender', 'phone', 'email', 'country',
       'dateOfBirth', '__v'],
      dtype='object')

In [35]:
products_df.columns

Index(['_id', 'title', 'brand', 'price', 'quantity', 'discount', 'image',
       'vendor', 'unitPrice', 'oldPrice', 'unitPriceQuantity', 'offerDuration',
       'category', 'subcategory', 'badges', 'thumbnailImgs', 'description',
       'properties', 'ingredients', 'preparationInstruction', 'hints',
       'manufacturer', 'nutritionalValues', 'status', 'created', 'modified'],
      dtype='object')

In [36]:
bookmarks_df.columns

Index(['_id', 'userId', 'active', 'created', 'productId', 'modified'], dtype='object')

# Data Cleaning and Data Preprocessing

In [37]:
# Remove the account/contact columns; only _id, gender, country and dateOfBirth are kept
users_info = users_df.drop(['username', 'phone', 'email', 'password', '__v'], axis=1)

In [38]:
# Rename the Mongo `_id` column to `user_id` and convert the row labels to strings
users_info = (
    users_info
    .rename(columns={'_id': 'user_id'})
    .rename(index=str)
)
users_info.head(1)

Unnamed: 0,user_id,gender,country,dateOfBirth
0,613d0a82caf647c9cf94b860,Male,Namibia,1984-10-19T00:50:53.006Z


In [39]:
# Convert birth date to age 
def age(born, today=None):
    """Return the age in completed years for a birth date.

    Parameters
    ----------
    born : str, datetime.datetime or datetime.date
        Birth date. A string must match ``%Y-%m-%dT%H:%M:%S.%f%z``
        (e.g. ``"1984-10-19T00:50:53.006Z"``); ``datetime``/``date`` objects
        are used as they are.
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to
        ``date.today()``; pass a fixed date to get reproducible results.

    Returns
    -------
    int
        Number of full years between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` is a string that does not match the expected format.
    """
    # datetime is a subclass of date, so it has to be checked first.
    if isinstance(born, datetime):
        born = born.date()
    elif not isinstance(born, date):
        born = datetime.strptime(born, "%Y-%m-%dT%H:%M:%S.%f%z").date()

    if today is None:
        today = date.today()

    # True (== 1) while this year's birthday is still ahead of the reference date.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending
  
# Derive an `age` column by running `age` on every dateOfBirth value
users_info['age'] = users_info['dateOfBirth'].map(age)

users_info.head(10)

Unnamed: 0,user_id,gender,country,dateOfBirth,age
0,613d0a82caf647c9cf94b860,Male,Namibia,1984-10-19T00:50:53.006Z,36
1,613d0a82caf647c9cf94b861,Male,Uzbekistan,1959-10-07T11:52:15.959Z,61
2,613d0a82caf647c9cf94b863,Male,Cuba,1962-04-03T12:44:20.771Z,59
3,613d0a82caf647c9cf94b862,Male,"Virgin Islands, U.S.",1971-08-02T13:10:46.242Z,50
4,613d0a82caf647c9cf94b864,Other,Niue,1996-04-04T15:23:29.943Z,25
5,613d0a82caf647c9cf94b866,Female,United States Minor Outlying Islands,1971-05-16T04:19:52.214Z,50
6,613d0a82caf647c9cf94b865,Male,Saint Vincent and the Grenadines,1968-05-12T08:46:07.223Z,53
7,613d0a82caf647c9cf94b867,Female,Denmark,1971-08-08T10:18:53.031Z,50
8,613d0a82caf647c9cf94b868,Male,Tanzania,1966-07-05T12:23:02.330Z,55
9,613d0a82caf647c9cf94b86a,Male,Saint Martin,1995-11-03T18:15:09.392Z,25


In [40]:
# Remove every user whose age is 18 or below
underage_index = users_info.index[users_info.age <= 18]
users_info = users_info.drop(index=underage_index)

In [41]:
users_info.head(3)

Unnamed: 0,user_id,gender,country,dateOfBirth,age
0,613d0a82caf647c9cf94b860,Male,Namibia,1984-10-19T00:50:53.006Z,36
1,613d0a82caf647c9cf94b861,Male,Uzbekistan,1959-10-07T11:52:15.959Z,61
2,613d0a82caf647c9cf94b863,Male,Cuba,1962-04-03T12:44:20.771Z,59


In [42]:
# Gender coding => Female: 0, Male: 1, Other: 2
# Only the `gender` column is recoded. A frame-wide replace would also rewrite any
# other column (e.g. `country`) whose value happens to equal one of these strings.
GENDER_CODES = {'Female': 0, 'Male': 1, 'Other': 2}
users_info = users_info.assign(gender=users_info['gender'].replace(GENDER_CODES))
users_info.head(3)

Unnamed: 0,user_id,gender,country,dateOfBirth,age
0,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36
1,613d0a82caf647c9cf94b861,1,Uzbekistan,1959-10-07T11:52:15.959Z,61
2,613d0a82caf647c9cf94b863,1,Cuba,1962-04-03T12:44:20.771Z,59


In [43]:
# Encode each country name as an integer label, stored in a new `nationality` column

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(users_info['country'])
users_info['nationality'] = label_encoder.transform(users_info['country'])
users_info.head(3)

Unnamed: 0,user_id,gender,country,dateOfBirth,age,nationality
0,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144
1,613d0a82caf647c9cf94b861,1,Uzbekistan,1959-10-07T11:52:15.959Z,61,229
2,613d0a82caf647c9cf94b863,1,Cuba,1962-04-03T12:44:20.771Z,59,52


In [44]:
# Remove the image, thumbnail and status columns from the products frame
products_info = products_df.drop(['image', 'thumbnailImgs', 'status'], axis=1)


In [45]:
# Rename the Mongo `_id` column to `product_id` and convert the row labels to strings

products_info = (
    products_info
    .rename(columns={'_id': 'product_id'})
    .rename(index=str)
)
products_info.head(1)

Unnamed: 0,product_id,title,brand,price,quantity,discount,vendor,unitPrice,oldPrice,unitPriceQuantity,offerDuration,category,subcategory,badges,description,properties,ingredients,preparationInstruction,hints,manufacturer,nutritionalValues,created,modified
0,613515cdae5592de9045691b,Feines Gemüse-Allerlei 190g,BEBIVITA,0.65,190g Glas,0.0,Kaufland,0.65,0.0,"100 g = 0,35 €",,Baby & Kind,Baby- & Kindernahrung (350),,"Beste Zutaten aus der Natur - Karotten, Kartof...",Altersempfehlung:ab 6. MonatFüllmenge:190.0Fre...,"Gemüse 73% (Karotten, Kartoffeln, Erbsen), Was...",,Bei Flüssigkeitsabsatz bitte umrühren. Benötig...,Bebivita GmbH,,1630594239201,2021-09-03 18:00:51.834


In [46]:
# Keep userId, productId and modified; drop the bookmark's own _id, created and active columns
# NOTE(review): once `active` is dropped, inactive bookmarks can no longer be told apart
# from active ones downstream — confirm that is intended.
bookmarks_info = bookmarks_df.drop(['_id', 'created', 'active'], axis=1)

In [47]:
# Align the bookmark key names with the other frames (user_id / product_id)
bookmark_column_names = {'userId': 'user_id', 'productId': 'product_id'}
bookmarks_info = bookmarks_info.rename(columns=bookmark_column_names).rename(index=str)
bookmarks_info.head(3)

Unnamed: 0,user_id,product_id,modified
0,613d0a82caf647c9cf94b860,613515dbae5592de90456bc4,NaT
1,613d0a82caf647c9cf94b860,613515d0ae5592de9045697e,NaT
2,613d0a82caf647c9cf94b860,613515d7ae5592de90456af7,NaT


In [48]:
# Cast the join key to str in both frames so the values compare equal in the merge
for _frame in (bookmarks_info, users_info):
    _frame['user_id'] = _frame['user_id'].astype(str)



# Complete DataFrame (Inner Join):

In [49]:
# Inner join: one row per (user, bookmarked product) pair, matched on user_id
df = users_info.merge(bookmarks_info, how='inner', on='user_id')
df.head(3)

Unnamed: 0,user_id,gender,country,dateOfBirth,age,nationality,product_id,modified
0,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144,613515dbae5592de90456bc4,NaT
1,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144,613515d0ae5592de9045697e,NaT
2,613d0a82caf647c9cf94b860,1,Namibia,1984-10-19T00:50:53.006Z,36,144,613515d7ae5592de90456af7,NaT


In [50]:
# Number of distinct users present after the join
df['user_id'].unique().size

1000

In [51]:
# Collapse to one row per user: keep the first gender/age/nationality value and
# join all of the user's bookmarked product ids into a single ', '-separated string

per_user_aggregations = {
    'gender': 'first',
    'age': 'first',
    'nationality': 'first',
    'product_id': ', '.join,
}
df_grouped_id = df.groupby('user_id').agg(per_user_aggregations).reset_index()

In [52]:
df_grouped_id.shape

(1000, 5)

In [53]:
# Final DF for using in ML
# Here having all unique users with their personal information and bookmarked items
df_grouped_id.head(3)

Unnamed: 0,user_id,gender,age,nationality,product_id
0,613d0a82caf647c9cf94b860,1,36,144,"613515dbae5592de90456bc4, 613515d0ae5592de9045..."
1,613d0a82caf647c9cf94b861,1,61,229,"613515dcae5592de90456bea, 613515e0ae5592de9045..."
2,613d0a82caf647c9cf94b862,1,50,234,"613515dbae5592de90456bbd, 613515d8ae5592de9045..."


In [54]:
df_grouped_id['product_id'][0]

'613515dbae5592de90456bc4, 613515d0ae5592de9045697e, 613515d7ae5592de90456af7, 613515d7ae5592de90456ae3, 613515dfae5592de90456c59, 613515ddae5592de90456c28, 613515deae5592de90456c54, 613515dcae5592de90456bd9, 613515e2ae5592de90456d21, 613515dbae5592de90456bd0, 613515dfae5592de90456c81, 613515d0ae5592de9045699f, 613515cdae5592de9045694e, 613515e2ae5592de90456d02, 613515dcae5592de90456bf6, 613515ceae5592de9045695c, 613515d7ae5592de90456af9, 613515d2ae5592de90456a02, 613515dfae5592de90456c65, 613515dfae5592de90456c79, 613515dfae5592de90456c8a, 613515d1ae5592de904569cb, 613515d5ae5592de90456a91, 613515e4ae5592de90456d7a, 613515d3ae5592de90456a30, 613515d5ae5592de90456a68, 613515d8ae5592de90456b01, 613515d3ae5592de90456a1c, 613515d5ae5592de90456a8f, 613515cdae5592de90456929'

# Machine Learning - Decision Tree

In [None]:
# Splitting the dataset -> Training and Testing

# Fixed seed so the train/test split and the fitted tree are identical on every run.
RANDOM_STATE = 42

# X = Predictors (gender, age, nationality)
X = df_grouped_id.drop(columns=['user_id', 'product_id'])
# y = Target: each user's ', '-joined string of bookmarked product ids.
# NOTE(review): the whole joined string is one class label, so users with different
# bookmark sets never share a class — confirm this is the intended target.
y = df_grouped_id['product_id'].astype('str')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

model = DecisionTreeClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions


In [56]:

# score = accuracy_score(y_test, predictions)
# score

In [57]:
# cm = confusion_matrix(y_test, predictions)

In [58]:
# cm

In [59]:

# def accuracy(confusion_matrix):
#     diagonal_sum = confusion_matrix.trace()
#     sum_of_all_elements = confusion_matrix.sum()
#     return diagonal_sum / sum_of_all_elements 

In [60]:
#accuracy(cm)


In [61]:
# Persist the fitted tree to disk, next to the notebook.
# NOTE(review): `joblib` is imported from `sklearn.externals`, which newer scikit-learn
# releases may no longer ship — confirm the pinned version or import `joblib` directly.
joblib.dump(model, 'V3_product-recommender.joblib')


['V3_product-recommender.joblib']

In [None]:
# Reload the persisted model from disk and predict on the held-out split again.
model_saved = joblib.load('V3_product-recommender.joblib')

# Rebinds `predictions`, replacing the value computed with the in-memory model.
predictions = model_saved.predict(X_test)
predictions

In [63]:
# Create tree: export the fitted decision tree to a Graphviz DOT file
tree.export_graphviz(
    model,
    out_file='V3_product-recommender.dot',
    # Take the names from the training frame so they follow the fitted column order.
    feature_names=list(X.columns),
    # model.classes_ holds exactly the labels seen during fit. sorted(y.unique()) also
    # contains labels that only occur in the test split, which misaligns the class
    # names shown on the nodes.
    class_names=[str(label) for label in model.classes_],
    label='all',
    rounded=True,
    filled=True,
)