In [74]:
import config
import ssl

from pymongo import MongoClient

import pandas as pd
import numpy as np
from pandas import DataFrame

from datetime import datetime, date

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn import tree
from sklearn.metrics import confusion_matrix
# Import label encoder 
from sklearn import preprocessing


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Importing 3 Collections (Users - Bookmarks - Products)

In [75]:
mongo_url = config.MONGO_URL
client = MongoClient(mongo_url, ssl_cert_reqs=ssl.CERT_NONE)
db = client['vendors_data_db']

# Three different collections (1. Users 2. Products 3. Bookmarks)
collection_users = db.users_col
collection_product = db.product_col
collection_bookmarks = db.bookmarks_col

In [76]:
# changing the collections to Dataframes
users_df = pd.DataFrame(list(collection_users.find()))
products_df = pd.DataFrame(list(collection_product.find()))
bookmarks_df = pd.DataFrame(list(collection_bookmarks.find()))

In [77]:
# User dataframe
users_df.head(1)

Unnamed: 0,_id,firstName,lastName,gender,phone,email,password,countryOfOrigin,dateOfBirth
0,6133af8bc4a7e1daa5fd78e0,Lisa,Roach,Female,700.110.0724x387,Lisa.Roach@gmail.com,$2b$10$UXmmmXeFmsH36HwkwEEsFeCgTAdMlrcY2o4PeNK...,Georgia,1995-04-26


In [78]:
# Bookmark dataframe
bookmarks_df.head(1)

Unnamed: 0,_id,userId,active,created,productId,modified
0,6135b4cbf810533516c4639f,6133af8dc4a7e1daa5fd7923,True,2021-09-06 08:27:23,6132464b75e744d165bf2b08,


In [79]:
# Products dataframe
products_df.head(1)

Unnamed: 0,_id,hash_id,title,vendor,image,brand,content,price_per_unit,current_price,discount,old_price,offer_duration,category,subcategory,badges,thumbnail_imgs,description,properties,ingredients,preparation_inst,hints,manufacturer,nutritional_vals,status,insert_dt,processed_dt,update_dt
0,6132462375e744d165bf232c,240807383748605598,Gemüse und Hühnchen mit Reis 190g,Kaufland,https://media.kaufland.com/images/PPIM/AP_Prod...,BEBIVITA,190g Glas,"100 g = 0,45 €",0.85,0.0,0.0,,Baby & Kind,Baby- & Kindernahrung (350),,[https://media.kaufland.com/images/PPIM/AP_Pro...,Babynahrung mit den besten Zutaten aus der Nat...,Altersempfehlung:nach dem 4. MonatFüllmenge:19...,"Gemüse 52% (Karotten, Tomaten, Erbsen), Reis g...",Geöffnetes Glas oder benötigte Teilmenge im Wa...,Nicht erwärmten Rest verschlossen im Kühlschra...,Bebivita GmbH,,True,1630594239201,2021-09-03 17:58:27.017,2021-09-03 18:00:51.834


In [80]:
users_df.columns

Index(['_id', 'firstName', 'lastName', 'gender', 'phone', 'email', 'password',
       'countryOfOrigin', 'dateOfBirth'],
      dtype='object')

In [81]:
products_df.columns

Index(['_id', 'hash_id', 'title', 'vendor', 'image', 'brand', 'content',
       'price_per_unit', 'current_price', 'discount', 'old_price',
       'offer_duration', 'category', 'subcategory', 'badges', 'thumbnail_imgs',
       'description', 'properties', 'ingredients', 'preparation_inst', 'hints',
       'manufacturer', 'nutritional_vals', 'status', 'insert_dt',
       'processed_dt', 'update_dt'],
      dtype='object')

In [82]:
bookmarks_df.columns

Index(['_id', 'userId', 'active', 'created', 'productId', 'modified'], dtype='object')

# Data Cleaning and Data Preprocessing

In [83]:
# Dropping unnessary columns in users
users_info = users_df.drop(columns= ['firstName', 'lastName', 'phone' ,'email', 'password'])

In [84]:
# Renaming in user's df
users_info = users_info.rename(index=str, columns= {'_id':'user_id'})
users_info.head(1)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth
0,6133af8bc4a7e1daa5fd78e0,Female,Georgia,1995-04-26


In [85]:
# Convert birth date to age 
def age(born):
    born = datetime.strptime(born, "%Y-%m-%d").date()
    today = date.today()
    return today.year - born.year - ((today.month, 
                                      today.day) < (born.month, 
                                                    born.day))
  
users_info['age'] = users_info['dateOfBirth'].apply(age)
  
users_info.head(10)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age
0,6133af8bc4a7e1daa5fd78e0,Female,Georgia,1995-04-26,26
1,6133af8bc4a7e1daa5fd78e1,Female,Russian Federation,2019-03-19,2
2,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,20
3,6133af8bc4a7e1daa5fd78e3,Male,French Guiana,1994-04-29,27
4,6133af8bc4a7e1daa5fd78e4,Female,Gambia,1978-01-21,43
5,6133af8bc4a7e1daa5fd78e5,Female,Saint Vincent and the Grenadines,1992-04-07,29
6,6133af8bc4a7e1daa5fd78e6,Female,Palau,2000-12-05,20
7,6133af8bc4a7e1daa5fd78e7,Male,Mauritius,2012-02-25,9
8,6133af8bc4a7e1daa5fd78e8,Male,Armenia,2012-02-23,9
9,6133af8bc4a7e1daa5fd78e9,Female,Anguilla,2018-11-27,2


In [86]:
users_info.shape

(200, 5)

In [87]:
users_info.drop(users_info[ (users_info.age <= 18)].index, axis=0, inplace=True)


In [88]:
users_info.shape

(136, 5)

In [89]:
users_info.head(10)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age
0,6133af8bc4a7e1daa5fd78e0,Female,Georgia,1995-04-26,26
2,6133af8bc4a7e1daa5fd78e2,Male,Greece,2001-03-15,20
3,6133af8bc4a7e1daa5fd78e3,Male,French Guiana,1994-04-29,27
4,6133af8bc4a7e1daa5fd78e4,Female,Gambia,1978-01-21,43
5,6133af8bc4a7e1daa5fd78e5,Female,Saint Vincent and the Grenadines,1992-04-07,29
6,6133af8bc4a7e1daa5fd78e6,Female,Palau,2000-12-05,20
10,6133af8cc4a7e1daa5fd78ea,Male,Brazil,1972-02-04,49
11,6133af8cc4a7e1daa5fd78eb,Male,United States Virgin Islands,2002-04-17,19
12,6133af8cc4a7e1daa5fd78ec,Male,Netherlands Antilles,1991-11-15,29
13,6133af8cc4a7e1daa5fd78ed,Female,British Indian Ocean Territory (Chagos Archipe...,1981-07-10,40


In [90]:
# Gender coding => Female: 0 and Male: 1

users_info = users_info.replace(['Female', 'Male'], [0, 1])
users_info.head(10)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age
0,6133af8bc4a7e1daa5fd78e0,0,Georgia,1995-04-26,26
2,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20
3,6133af8bc4a7e1daa5fd78e3,1,French Guiana,1994-04-29,27
4,6133af8bc4a7e1daa5fd78e4,0,Gambia,1978-01-21,43
5,6133af8bc4a7e1daa5fd78e5,0,Saint Vincent and the Grenadines,1992-04-07,29
6,6133af8bc4a7e1daa5fd78e6,0,Palau,2000-12-05,20
10,6133af8cc4a7e1daa5fd78ea,1,Brazil,1972-02-04,49
11,6133af8cc4a7e1daa5fd78eb,1,United States Virgin Islands,2002-04-17,19
12,6133af8cc4a7e1daa5fd78ec,1,Netherlands Antilles,1991-11-15,29
13,6133af8cc4a7e1daa5fd78ed,0,British Indian Ocean Territory (Chagos Archipe...,1981-07-10,40


In [91]:
# Countries Label Encoding 

label_encoder = preprocessing.LabelEncoder()
users_info['nationality']= label_encoder.fit_transform(users_info['countryOfOrigin']) 
users_info.head(10)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age,nationality
0,6133af8bc4a7e1daa5fd78e0,0,Georgia,1995-04-26,26,30
2,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33
3,6133af8bc4a7e1daa5fd78e3,1,French Guiana,1994-04-29,27,27
4,6133af8bc4a7e1daa5fd78e4,0,Gambia,1978-01-21,43,29
5,6133af8bc4a7e1daa5fd78e5,0,Saint Vincent and the Grenadines,1992-04-07,29,80
6,6133af8bc4a7e1daa5fd78e6,0,Palau,2000-12-05,20,71
10,6133af8cc4a7e1daa5fd78ea,1,Brazil,1972-02-04,49,13
11,6133af8cc4a7e1daa5fd78eb,1,United States Virgin Islands,2002-04-17,19,97
12,6133af8cc4a7e1daa5fd78ec,1,Netherlands Antilles,1991-11-15,29,62
13,6133af8cc4a7e1daa5fd78ed,0,British Indian Ocean Territory (Chagos Archipe...,1981-07-10,40,14


In [92]:
# Dropping unnessary columns in products
products_info = products_df.drop(columns= ['image', 'thumbnail_imgs', 'status', 'insert_dt'])


In [93]:
# Renaming in product's df

products_info = products_info.rename(index=str, columns= {'_id':'product_id'})
products_info.head(1)

Unnamed: 0,product_id,hash_id,title,vendor,brand,content,price_per_unit,current_price,discount,old_price,offer_duration,category,subcategory,badges,description,properties,ingredients,preparation_inst,hints,manufacturer,nutritional_vals,processed_dt,update_dt
0,6132462375e744d165bf232c,240807383748605598,Gemüse und Hühnchen mit Reis 190g,Kaufland,BEBIVITA,190g Glas,"100 g = 0,45 €",0.85,0.0,0.0,,Baby & Kind,Baby- & Kindernahrung (350),,Babynahrung mit den besten Zutaten aus der Nat...,Altersempfehlung:nach dem 4. MonatFüllmenge:19...,"Gemüse 52% (Karotten, Tomaten, Erbsen), Reis g...",Geöffnetes Glas oder benötigte Teilmenge im Wa...,Nicht erwärmten Rest verschlossen im Kühlschra...,Bebivita GmbH,,2021-09-03 17:58:27.017,2021-09-03 18:00:51.834


In [94]:
# Dropping unnessary columns in bookmarks

bookmarks_info = bookmarks_df.drop(columns= ['created', 'modified'])

In [95]:
# Renaming in bookmark's df
bookmarks_info = bookmarks_info.rename(index=str, columns= {'_id':'bookmarks_id', 'userId': 'user_id', 'productId' : 'product_id'})
bookmarks_info.head(3)

Unnamed: 0,bookmarks_id,user_id,active,product_id
0,6135b4cbf810533516c4639f,6133af8dc4a7e1daa5fd7923,True,6132464b75e744d165bf2b08
1,6135b4cbf810533516c463a0,6133af8dc4a7e1daa5fd7928,True,6132465275e744d165bf2ca1
2,6135b4cbf810533516c463a1,6133af8ec4a7e1daa5fd794b,True,6132466275e744d165bf2faa


In [96]:
bookmarks_info.active.value_counts()


True    600
Name: active, dtype: int64

In [97]:
# Changing the datatypes (Important for inner Join)
bookmarks_info['user_id'] = bookmarks_info['user_id'].astype(str)
users_info['user_id'] = users_info['user_id'].astype(str)



# Complete DataFrame (Inner Join):

In [98]:
# Inner Join
df = pd.merge(users_info, bookmarks_info, on= 'user_id')
df.head(7)

Unnamed: 0,user_id,gender,countryOfOrigin,dateOfBirth,age,nationality,bookmarks_id,active,product_id
0,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,6135b4cbf810533516c463bd,True,6132462d75e744d165bf255f
1,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,6135b4cbf810533516c463c7,True,6132462375e744d165bf236c
2,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,6135b4d4f810533516c46567,True,6132466975e744d165bf30ef
3,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,6135b4d5f810533516c465b1,True,6132465f75e744d165bf2f25
4,6133af8bc4a7e1daa5fd78e2,1,Greece,2001-03-15,20,33,6135b4d7f810533516c465f1,True,6132462c75e744d165bf251e
5,6133af8bc4a7e1daa5fd78e3,1,French Guiana,1994-04-29,27,27,6135b4cbf810533516c463b0,True,6132463275e744d165bf264f
6,6133af8bc4a7e1daa5fd78e3,1,French Guiana,1994-04-29,27,27,6135b4cff810533516c4647d,True,6132464c75e744d165bf2b79


In [99]:
user_id_list = []
user_bookmarked_list = []

for _ , row in df.iterrows():
    if row['user_id'] not in user_id_list:
        user_id_list.append(row['user_id'])
        
print(len(user_id_list))

128


In [100]:
len(df.user_id.unique())

128

In [101]:
df_grouped_id = df.groupby('user_id').agg({
    'gender':'first', 
    'nationality' : 'first',
    'age':'first',
    'product_id': ', '.join, 
     }).reset_index()
                             

In [102]:
df_grouped_id.shape

(128, 5)

In [103]:
df_grouped_id.head(10)

Unnamed: 0,user_id,gender,nationality,age,product_id
0,6133af8bc4a7e1daa5fd78e2,1,33,20,"6132462d75e744d165bf255f, 6132462375e744d165bf..."
1,6133af8bc4a7e1daa5fd78e3,1,27,27,"6132463275e744d165bf264f, 6132464c75e744d165bf..."
2,6133af8bc4a7e1daa5fd78e4,0,29,43,"6132462b75e744d165bf24f3, 6132466b75e744d165bf..."
3,6133af8bc4a7e1daa5fd78e5,0,80,29,"6132465775e744d165bf2da3, 6132466775e744d165bf..."
4,6133af8bc4a7e1daa5fd78e6,0,71,20,"6132462a75e744d165bf24be, 6132464275e744d165bf..."
5,6133af8cc4a7e1daa5fd78ea,1,13,49,"6132465275e744d165bf2cb3, 6132466f75e744d165bf..."
6,6133af8cc4a7e1daa5fd78eb,1,97,19,"6132466775e744d165bf3086, 6132465975e744d165bf..."
7,6133af8cc4a7e1daa5fd78ec,1,62,29,"6132463e75e744d165bf28d8, 6132466a75e744d165bf..."
8,6133af8cc4a7e1daa5fd78ed,0,14,40,"6132465075e744d165bf2c3d, 6132465d75e744d165bf..."
9,6133af8cc4a7e1daa5fd78ee,0,53,40,"6132463775e744d165bf2738, 6132466075e744d165bf..."


# Machine Learning_without Grouping

In [104]:
X = df.drop(columns=['user_id', 'countryOfOrigin', 'dateOfBirth', 'active', 'product_id', 'bookmarks_id'])
y = df['product_id']
y = y.astype('str')


model = DecisionTreeClassifier()
model.fit(X, y)
predictions = model.predict([[1, 20, 45],[0, 2, 45]])
predictions

array(['6132462e75e744d165bf258f', '6132462975e744d165bf2483'],
      dtype=object)

In [105]:
# Spliting the dataset -> Training and Testing

X = df.drop(columns=['user_id', 'countryOfOrigin', 'dateOfBirth','active' ,'product_id', 'bookmarks_id' ])
y = df['product_id']
y = y.astype('str')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


model = DecisionTreeClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(predictions)

# score = accuracy_score(y_test, predictions)
# score


['6132463875e744d165bf277e' '6132463b75e744d165bf282f'
 '6132463475e744d165bf269c' '6132462375e744d165bf2337'
 '6132462975e744d165bf24a4' '6132463775e744d165bf2763'
 '6132463175e744d165bf2622' '6132465875e744d165bf2db4'
 '6132463775e744d165bf2741' '6132463b75e744d165bf2810'
 '6132464c75e744d165bf2b50' '6132462d75e744d165bf255f'
 '6132462775e744d165bf242f' '6132464c75e744d165bf2b83'
 '6132463a75e744d165bf27f7' '6132464f75e744d165bf2c09'
 '6132465275e744d165bf2ccb' '6132464875e744d165bf2aaf'
 '6132464275e744d165bf2978' '6132464875e744d165bf2aa7'
 '6132463175e744d165bf2642' '6132463175e744d165bf262e'
 '6132462f75e744d165bf25b9' '6132464c75e744d165bf2b79'
 '6132464675e744d165bf2a44' '6132463b75e744d165bf2810'
 '6132464775e744d165bf2a79' '6132463475e744d165bf26b3'
 '6132464b75e744d165bf2b28' '6132462375e744d165bf2337'
 '6132464875e744d165bf2abc' '6132462775e744d165bf242f'
 '6132465775e744d165bf2da3' '6132462375e744d165bf234d'
 '6132462375e744d165bf2335' '6132463175e744d165bf2642'
 '61324637

In [106]:
# confusion_matrix(y_test, predictions)

In [107]:
# Save ML 

# X = df.drop(columns=['user_id', 'countryOfOrigin', 'dateOfBirth','active' ,'product_id' ])
# y = df['product_id']
# y = y.astype('str')
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


# model = DecisionTreeClassifier()
# model.fit(X_train, y_train)

# Store our trained model in a file
# joblib.dump(model, 'product-recommender.joblib')

# predictions = model.predict(X_test)
# predictions

# score = accuracy_score(y_test, predictions)
# score

In [108]:
# load the trained model and make predictions
# model_saved = joblib.load('product-recommender.joblib')

# predictions = model_saved.predict(X_test)
# #predictions

In [109]:
# Creat tree
# tree.export_graphviz(model, 
# out_file='product-recommender.dot', 
# feature_names=['gender', 'age', 'nationality'], 
# class_names = sorted(y.unique()), 
# label='all', 
# rounded = True, 
# filled= True)

# Machine Learning - With Grouped User ID

In [110]:
df_grouped_id.head(10)

Unnamed: 0,user_id,gender,nationality,age,product_id
0,6133af8bc4a7e1daa5fd78e2,1,33,20,"6132462d75e744d165bf255f, 6132462375e744d165bf..."
1,6133af8bc4a7e1daa5fd78e3,1,27,27,"6132463275e744d165bf264f, 6132464c75e744d165bf..."
2,6133af8bc4a7e1daa5fd78e4,0,29,43,"6132462b75e744d165bf24f3, 6132466b75e744d165bf..."
3,6133af8bc4a7e1daa5fd78e5,0,80,29,"6132465775e744d165bf2da3, 6132466775e744d165bf..."
4,6133af8bc4a7e1daa5fd78e6,0,71,20,"6132462a75e744d165bf24be, 6132464275e744d165bf..."
5,6133af8cc4a7e1daa5fd78ea,1,13,49,"6132465275e744d165bf2cb3, 6132466f75e744d165bf..."
6,6133af8cc4a7e1daa5fd78eb,1,97,19,"6132466775e744d165bf3086, 6132465975e744d165bf..."
7,6133af8cc4a7e1daa5fd78ec,1,62,29,"6132463e75e744d165bf28d8, 6132466a75e744d165bf..."
8,6133af8cc4a7e1daa5fd78ed,0,14,40,"6132465075e744d165bf2c3d, 6132465d75e744d165bf..."
9,6133af8cc4a7e1daa5fd78ee,0,53,40,"6132463775e744d165bf2738, 6132466075e744d165bf..."


In [112]:
X = df_grouped_id.drop(columns=['user_id', 'product_id'])
y = df_grouped_id['product_id']
y = y.astype('str')


model = DecisionTreeClassifier()
model.fit(X, y)
predictions = model.predict([[1, 20, 45],[0, 2, 45]])
predictions

array(['6132463e75e744d165bf28d3, 6132467075e744d165bf3277, 6132463475e744d165bf26b4',
       '6132465a75e744d165bf2e40, 6132463b75e744d165bf2814, 6132466a75e744d165bf315f, 6132463475e744d165bf2697, 6132465c75e744d165bf2e84'],
      dtype=object)

In [113]:
# Spliting the dataset -> Training and Testing

X = df_grouped_id.drop(columns=['user_id', 'product_id' ])
y = df_grouped_id['product_id']
y = y.astype('str')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


model = DecisionTreeClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions


array(['6132466d75e744d165bf31e7, 6132464c75e744d165bf2b83, 6132466175e744d165bf2f6d',
       '6132466175e744d165bf2f84, 6132463275e744d165bf264f',
       '6132466575e744d165bf3046, 6132465575e744d165bf2d01',
       '6132465975e744d165bf2e01, 6132463675e744d165bf272b',
       '6132464775e744d165bf2a79',
       '6132466d75e744d165bf31d2, 6132462975e744d165bf2483, 6132466d75e744d165bf31d0, 6132466a75e744d165bf3135, 6132463875e744d165bf277e',
       '6132464875e744d165bf2ab9, 6132465175e744d165bf2c6f, 6132464d75e744d165bf2bbf, 6132465675e744d165bf2d4a, 6132464375e744d165bf2995',
       '6132464875e744d165bf2aa7, 6132464b75e744d165bf2b0e',
       '6132464875e744d165bf2aa7, 6132464b75e744d165bf2b0e',
       '6132466575e744d165bf305d, 6132463e75e744d165bf28ce, 6132463475e744d165bf269c, 6132464b75e744d165bf2b14, 6132464375e744d165bf2992',
       '6132464875e744d165bf2abc, 6132465575e744d165bf2d18, 6132466175e744d165bf2f53, 6132466575e744d165bf304f',
       '6132466d75e744d165bf31e7, 6132464c7

In [114]:
joblib.dump(model, 'product-recommender.joblib')


['product-recommender.joblib']

In [115]:
model_saved = joblib.load('product-recommender.joblib')

predictions = model_saved.predict(X_test)
#predictions

In [116]:
# Creat tree
tree.export_graphviz(model, 
out_file='product-recommender.dot', 
feature_names=['gender', 'age', 'nationality'], 
class_names = sorted(y.unique()), 
label='all', 
rounded = True, 
filled= True)