In [350]:
import config
import ssl

from pymongo import MongoClient

import pandas as pd
import numpy as np
from pandas import DataFrame

from datetime import datetime, date

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Import label encoder 
from sklearn import preprocessing


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Importing 3 Collections (Users - Bookmarks - Products)

In [351]:
# The connection string is read from the `config` module rather than being
# written into the notebook.
mongo_url = config.MONGO_URL

# `ssl_cert_reqs` was removed in PyMongo 4; `tlsAllowInvalidCertificates`
# (available since PyMongo 3.9) is its replacement and has the same effect.
# NOTE(review): this still switches off TLS certificate verification, exactly
# as the original `ssl.CERT_NONE` did. That is tolerable for a local
# experiment only -- point the client at a trusted CA bundle before this is
# run against real user data.
client = MongoClient(mongo_url, tlsAllowInvalidCertificates=True)
db = client['vendors_data_db']

# Three different collections (1. Users 2. Products 3. Bookmarks)
collection_users = db.users_col
collection_product = db.products
collection_bookmarks = db.bookmarks_col

In [352]:
# Materialise every document of each collection into its own DataFrame
def _collection_to_frame(collection):
    """Fetch all documents of `collection` and return them as a DataFrame."""
    documents = list(collection.find())
    return pd.DataFrame(documents)


users_df = _collection_to_frame(collection_users)
products_df = _collection_to_frame(collection_product)
bookmarks_df = _collection_to_frame(collection_bookmarks)

In [353]:
# User dataframe
# NOTE(review): the rendered row includes the user's name, phone number,
# e-mail address and password hash -- clear or redact this output before the
# notebook is shared or committed.
users_df.head(1)

Unnamed: 0,_id,firstName,lastName,gender,phone,email,password,countryOfOrigin,dateOfBirth
0,6133af8bc4a7e1daa5fd78e0,Lisa,Roach,Female,700.110.0724x387,Lisa.Roach@gmail.com,$2b$10$UXmmmXeFmsH36HwkwEEsFeCgTAdMlrcY2o4PeNK...,Georgia,1995-04-26


In [354]:
# Bookmark dataframe
bookmarks_df.head(1)

Unnamed: 0,_id,userId,active,created,productId,modified
0,6135b4cbf810533516c4639f,6133af8dc4a7e1daa5fd7923,True,2021-09-06 08:27:23,6132464b75e744d165bf2b08,


In [355]:
# Products dataframe
products_df.head(1)

Unnamed: 0,_id,title,brand,price,quantity,discount,image,vendor,unitPrice,oldPrice,unitPriceQuantity,offerDuration,category,subcategory,badges,thumbnailImgs,description,properties,ingredients,preparationInstruction,hints,manufacturer,nutritionalValues,status,created,modified
0,613515cdae5592de9045691b,Feines Gemüse-Allerlei 190g,BEBIVITA,0.65,190g Glas,0.0,https://media.kaufland.com/images/PPIM/AP_Prod...,Kaufland,0.65,0.0,"100 g = 0,35 €",,Baby & Kind,Baby- & Kindernahrung (350),,[https://media.kaufland.com/images/PPIM/AP_Pro...,"Beste Zutaten aus der Natur - Karotten, Kartof...",Altersempfehlung:ab 6. MonatFüllmenge:190.0Fre...,"Gemüse 73% (Karotten, Kartoffeln, Erbsen), Was...",,Bei Flüssigkeitsabsatz bitte umrühren. Benötig...,Bebivita GmbH,,True,1630594239201,2021-09-03 18:00:51.834


In [356]:
users_df.columns

Index(['_id', 'firstName', 'lastName', 'gender', 'phone', 'email', 'password',
       'countryOfOrigin', 'dateOfBirth'],
      dtype='object')

In [357]:
products_df.columns

Index(['_id', 'title', 'brand', 'price', 'quantity', 'discount', 'image',
       'vendor', 'unitPrice', 'oldPrice', 'unitPriceQuantity', 'offerDuration',
       'category', 'subcategory', 'badges', 'thumbnailImgs', 'description',
       'properties', 'ingredients', 'preparationInstruction', 'hints',
       'manufacturer', 'nutritionalValues', 'status', 'created', 'modified'],
      dtype='object')

In [358]:
bookmarks_df.columns

Index(['_id', 'userId', 'active', 'created', 'productId', 'modified'], dtype='object')

# Data Cleaning and Data Preprocessing

In [359]:
# Strip the personally identifying fields; none of them is used as a feature
USER_PII_COLUMNS = ['firstName', 'lastName', 'phone', 'email', 'password']
users_info = users_df.drop(USER_PII_COLUMNS, axis=1)

In [360]:
# Renaming in user's df: `_id` becomes `user_id`, the key used for the join
# with the bookmarks further down.
# `index=str` additionally converts the row labels to strings.
users_info = users_info.rename(index=str, columns= {'_id':'user_id'})
users_info.head(1)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth
0,6133af8bc4a7e1daa5fd78e0,Female,Georgia,1995-04-26


In [361]:
# Convert birth date to age 
def age(born):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    born : str, datetime.date or datetime.datetime
        Date of birth. Strings must be formatted as ``YYYY-MM-DD``.
        ``date`` / ``datetime`` objects (including subclasses such as
        ``pandas.Timestamp``) are accepted as they are, so the function also
        works when the column has already been decoded to real dates.

    Returns
    -------
    int
        Number of full years between `born` and today's local date.

    Raises
    ------
    ValueError
        If `born` is a string that does not match ``%Y-%m-%d``.
    TypeError
        If `born` is neither a string nor a date-like object.
    """
    if isinstance(born, datetime):
        # datetime is a subclass of date, so this branch has to come first
        born = born.date()
    elif not isinstance(born, date):
        born = datetime.strptime(born, "%Y-%m-%d").date()

    today = date.today()
    # bool is an int: subtract 1 while this year's birthday is still ahead
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending
  
# Derive every user's age from the date-of-birth column
birth_dates = users_info['dateOfBirth']
users_info = users_info.assign(age=birth_dates.apply(age))

users_info.head(10)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age
0,6133af8bc4a7e1daa5fd78e0,Female,Georgia,1995-04-26,26
1,6133af8bc4a7e1daa5fd78e1,Female,Russian Federation,2019-03-19,2
2,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,20
3,6133af8bc4a7e1daa5fd78e3,Male,French Guiana,1994-04-29,27
4,6133af8bc4a7e1daa5fd78e4,Female,Gambia,1978-01-21,43
5,6133af8bc4a7e1daa5fd78e5,Female,Saint Vincent and the Grenadines,1992-04-07,29
6,6133af8bc4a7e1daa5fd78e6,Female,Palau,2000-12-05,20
7,6133af8bc4a7e1daa5fd78e7,Male,Mauritius,2012-02-25,9
8,6133af8bc4a7e1daa5fd78e8,Male,Armenia,2012-02-23,9
9,6133af8bc4a7e1daa5fd78e9,Female,Anguilla,2018-11-27,2


In [362]:
# Keep only users older than 18: every row whose age is <= 18 is removed
is_minor = users_info['age'] <= 18
users_info = users_info.loc[~is_minor].copy()

In [363]:
users_info.head(3)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age
0,6133af8bc4a7e1daa5fd78e0,Female,Georgia,1995-04-26,26
2,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,20
3,6133af8bc4a7e1daa5fd78e3,Male,French Guiana,1994-04-29,27


In [364]:
# Gender coding => Female: 0 and Male: 1
# The recoding is applied to the `gender` column only: a frame-wide
# `replace` would also rewrite any other cell that happens to hold the
# literal 'Female' or 'Male'.
GENDER_CODES = {'Female': 0, 'Male': 1}

users_info = users_info.assign(gender=users_info['gender'].replace(GENDER_CODES))
users_info.head(3)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age
0,6133af8bc4a7e1daa5fd78e0,0,Georgia,1995-04-26,26
2,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20
3,6133af8bc4a7e1daa5fd78e3,1,French Guiana,1994-04-29,27


In [365]:
# Countries Label Encoding 
# Each distinct country name is mapped to an integer code that is stored in a
# new `nationality` column; `countryOfOrigin` itself is left in place.
# NOTE(review): the encoder is fitted on the already age-filtered users and is
# not saved next to the model further down -- the same `label_encoder` will be
# needed to encode countries at prediction time; confirm how that is handled.

label_encoder = preprocessing.LabelEncoder()
users_info['nationality']= label_encoder.fit_transform(users_info['countryOfOrigin']) 
users_info.head(3)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age,nationality
0,6133af8bc4a7e1daa5fd78e0,0,Georgia,1995-04-26,26,30
2,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33
3,6133af8bc4a7e1daa5fd78e3,1,French Guiana,1994-04-29,27,27


In [366]:
# Drop the product columns that are not needed for the analysis
PRODUCT_UNUSED_COLUMNS = ['image', 'thumbnailImgs', 'status']
products_info = products_df.drop(PRODUCT_UNUSED_COLUMNS, axis=1)


In [367]:
# Renaming in product's df: `_id` becomes `product_id`; `index=str` also
# converts the row labels to strings.

products_info = products_info.rename(index=str, columns= {'_id':'product_id'})
products_info.head(1)

Unnamed: 0,product_id,title,brand,price,quantity,discount,vendor,unitPrice,oldPrice,unitPriceQuantity,offerDuration,category,subcategory,badges,description,properties,ingredients,preparationInstruction,hints,manufacturer,nutritionalValues,created,modified
0,613515cdae5592de9045691b,Feines Gemüse-Allerlei 190g,BEBIVITA,0.65,190g Glas,0.0,Kaufland,0.65,0.0,"100 g = 0,35 €",,Baby & Kind,Baby- & Kindernahrung (350),,"Beste Zutaten aus der Natur - Karotten, Kartof...",Altersempfehlung:ab 6. MonatFüllmenge:190.0Fre...,"Gemüse 73% (Karotten, Kartoffeln, Erbsen), Was...",,Bei Flüssigkeitsabsatz bitte umrühren. Benötig...,Bebivita GmbH,,1630594239201,2021-09-03 18:00:51.834


In [368]:
# Keep just the user/product link and the `active` flag from the bookmarks
BOOKMARK_UNUSED_COLUMNS = ['_id', 'created', 'modified']
bookmarks_info = bookmarks_df.drop(BOOKMARK_UNUSED_COLUMNS, axis=1)

In [369]:
# Renaming in bookmark's df: `userId` -> `user_id` and `productId` ->
# `product_id`, so the key columns carry the same names as in the other frames.
bookmarks_info = bookmarks_info.rename(index=str, columns= {'userId': 'user_id', 'productId' : 'product_id'})
bookmarks_info.head(3)

Unnamed: 0,user_id,active,product_id
0,6133af8dc4a7e1daa5fd7923,True,6132464b75e744d165bf2b08
1,6133af8dc4a7e1daa5fd7928,True,6132465275e744d165bf2ca1
2,6133af8ec4a7e1daa5fd794b,True,6132466275e744d165bf2faa


In [370]:
# Cast the join key to str in both frames so the inner join below compares
# values of the same type
for _frame in (bookmarks_info, users_info):
    _frame['user_id'] = _frame['user_id'].astype(str)



# Complete DataFrame (Inner Join):

In [371]:
# Inner Join: one row per bookmark of every remaining user
# NOTE(review): the `active` flag is carried along but never filtered on, so
# deactivated bookmarks (if any exist) are treated like active ones -- confirm.
df = users_info.merge(bookmarks_info, how='inner', on='user_id')
df.head(7)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age,nationality,active,product_id
0,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,True,6132462d75e744d165bf255f
1,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,True,6132462375e744d165bf236c
2,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,True,6132466975e744d165bf30ef
3,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,True,6132465f75e744d165bf2f25
4,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,True,6132462c75e744d165bf251e
5,6133af8bc4a7e1daa5fd78e3,1,French Guiana,1994-04-29,27,27,True,6132463275e744d165bf264f
6,6133af8bc4a7e1daa5fd78e3,1,French Guiana,1994-04-29,27,27,True,6132464c75e744d165bf2b79


In [372]:
# Number of distinct users left after the join
df['user_id'].nunique(dropna=False)

128

In [373]:
# Collapse to one row per user: gender, nationality and age take the first
# value found for that user, and all bookmarked product ids are concatenated
# into a single ', '-separated string.

df_grouped_id = (
    df.groupby('user_id')
      .agg(
          gender=('gender', 'first'),
          nationality=('nationality', 'first'),
          age=('age', 'first'),
          product_id=('product_id', ', '.join),
      )
      .reset_index()
)

In [374]:
df_grouped_id.shape

(128, 5)

In [375]:
# Final DF for using in ML
# Here having all unique users with their personal information and bookmarked items
df_grouped_id.head(3)

Unnamed: 0,user_id,gender,nationality,age,product_id
0,6133af8bc4a7e1daa5fd78e2,1,33,20,"6132462d75e744d165bf255f, 6132462375e744d165bf..."
1,6133af8bc4a7e1daa5fd78e3,1,27,27,"6132463275e744d165bf264f, 6132464c75e744d165bf..."
2,6133af8bc4a7e1daa5fd78e4,0,29,43,"6132462b75e744d165bf24f3, 6132466b75e744d165bf..."


In [376]:
df_grouped_id['product_id'][0]

'6132462d75e744d165bf255f, 6132462375e744d165bf236c, 6132466975e744d165bf30ef, 6132465f75e744d165bf2f25, 6132462c75e744d165bf251e'

# Machine Learning - Decision Tree

In [377]:
# Splitting the dataset -> Training and Testing
# A fixed seed makes the split, the fitted tree and therefore the predictions
# identical on every Restart & Run All.
SEED = 42

# X = Predictors (gender, nationality, age)
X = df_grouped_id.drop(columns=['user_id', 'product_id'])

# y = Target: the ', '-joined string of every product id a user bookmarked.
# NOTE(review): each distinct string is its own class, so the classifier can
# only ever return a bookmark list it saw verbatim during training. A
# multi-label target (one indicator column per product) would fit a
# recommender better, but it would change the shape of `predictions` and of
# the persisted model, so it is left for a follow-up.
y = df_grouped_id['product_id']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

model = DecisionTreeClassifier(random_state=SEED)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions


array(['6132464e75e744d165bf2bd7, 6132463475e744d165bf26cf, 6132465f75e744d165bf2f38',
       '6132465975e744d165bf2e01, 6132463675e744d165bf272b',
       '6132466c75e744d165bf31a9, 6132463775e744d165bf2763, 6132464275e744d165bf2974, 6132463a75e744d165bf27ed',
       '6132466275e744d165bf2fcc, 6132465075e744d165bf2c15',
       '6132466975e744d165bf30ff, 6132466775e744d165bf3086, 6132465275e744d165bf2cb1',
       '6132462475e744d165bf2397, 6132464d75e744d165bf2bc1, 6132462875e744d165bf2453, 6132463375e744d165bf2688',
       '6132464e75e744d165bf2bd7, 6132463475e744d165bf26cf, 6132465f75e744d165bf2f38',
       '6132465775e744d165bf2d99', '6132463475e744d165bf26ad',
       '6132465775e744d165bf2da2, 6132466975e744d165bf30ec',
       '6132463275e744d165bf2655',
       '6132464875e744d165bf2aaf, 6132464975e744d165bf2aed, 6132465575e744d165bf2d16',
       '6132464875e744d165bf2aa7, 6132464b75e744d165bf2b0e',
       '6132466b75e744d165bf316c, 6132463075e744d165bf2612, 6132464975e744d165bf2af0

In [378]:
# print(len(X)) -> 128
# print(len(y)) -> 128
# print(len(X_train)) -> 102
# print(len(y_train)) -> 102
# print(len(y_test)) -> 26
# print(len(X_test)) -> 26



In [342]:

# score = accuracy_score(y_test, predictions)
# score

In [379]:
# cm = confusion_matrix(y_test, predictions)

In [380]:
# cm

In [381]:

# def accuracy(confusion_matrix):
#     diagonal_sum = confusion_matrix.trace()
#     sum_of_all_elements = confusion_matrix.sum()
#     return diagonal_sum / sum_of_all_elements 

In [382]:
#accuracy(cm)


In [383]:
# Persist the fitted tree to disk.
# NOTE(review): `joblib` is imported from `sklearn.externals` at the top of
# this notebook; that alias is deprecated and absent from newer scikit-learn
# releases, where the import has to be a plain `import joblib` -- confirm
# against the installed version.
joblib.dump(model, 'V1_product-recommender.joblib')


['V1_product-recommender.joblib']

In [384]:
# Round-trip check: reload the persisted model and predict on the same test
# split (this rebinds `predictions`).
# NOTE(review): joblib files are pickle-based -- only load files you created.
model_saved = joblib.load('V1_product-recommender.joblib')

predictions = model_saved.predict(X_test)
predictions

array(['6132464e75e744d165bf2bd7, 6132463475e744d165bf26cf, 6132465f75e744d165bf2f38',
       '6132465975e744d165bf2e01, 6132463675e744d165bf272b',
       '6132466c75e744d165bf31a9, 6132463775e744d165bf2763, 6132464275e744d165bf2974, 6132463a75e744d165bf27ed',
       '6132466275e744d165bf2fcc, 6132465075e744d165bf2c15',
       '6132466975e744d165bf30ff, 6132466775e744d165bf3086, 6132465275e744d165bf2cb1',
       '6132462475e744d165bf2397, 6132464d75e744d165bf2bc1, 6132462875e744d165bf2453, 6132463375e744d165bf2688',
       '6132464e75e744d165bf2bd7, 6132463475e744d165bf26cf, 6132465f75e744d165bf2f38',
       '6132465775e744d165bf2d99', '6132463475e744d165bf26ad',
       '6132465775e744d165bf2da2, 6132466975e744d165bf30ec',
       '6132463275e744d165bf2655',
       '6132464875e744d165bf2aaf, 6132464975e744d165bf2aed, 6132465575e744d165bf2d16',
       '6132464875e744d165bf2aa7, 6132464b75e744d165bf2b0e',
       '6132466b75e744d165bf316c, 6132463075e744d165bf2612, 6132464975e744d165bf2af0

In [386]:
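# Create tree
# tree.export_graphviz(model, 
# out_file='V1_product-recommender.dot', 
# feature_names=list(X.columns),  # was ['gender', 'age', 'nationality'], which does not match X's column order (gender, nationality, age)
# class_names=list(model.classes_),  # was sorted(y.unique()), which also contains classes that only occur in the test split
# label='all', 
# rounded = True, 
# filled= True)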