### Environment Setup

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

In [4]:
# Loading data, creating dataframe
full_dataframe = pd.read_csv("amazon_co-ecommerce_sample.csv")
df = full_dataframe[['uniq_id', 'product_name', 'manufacturer', 'description', 'product_information', 'product_description', 'amazon_category_and_sub_category', 'customers_who_bought_this_item_also_bought', 'items_customers_buy_after_viewing_this_item']]
# df.head()

### Pre-Processing

* Dropping unneeded features
* Removing empty data
* Removing duplicates
* Remove embedded special characters
* Correct/Remove mispelt words
* Remove common words
* Tokenize by white space
* Stemming

In [5]:
# Drop rows with missing data:
df = df.dropna(how='any',axis=0) 
df.reset_index()
# df.shape

Unnamed: 0,index,uniq_id,product_name,manufacturer,description,product_information,product_description,amazon_category_and_sub_category,customers_who_bought_this_item_also_bought,items_customers_buy_after_viewing_this_item
0,0,eac7efa5dbd3d667f26eb3d3ab504464,Hornby 2014 Catalogue,Hornby,Product Description Hornby 2014 Catalogue Box ...,Technical Details Item Weight640 g Product Dim...,Product Description Hornby 2014 Catalogue Box ...,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Hornby-R8150-Catalogue...,http://www.amazon.co.uk/Hornby-R8150-Catalogue...
1,1,b17540ef7e86e461d37f3ae58b7b72ac,FunkyBuys® Large Christmas Holiday Express Fes...,FunkyBuys,Size Name:Large FunkyBuys® Large Christmas Hol...,Technical Details Manufacturer recommended age...,Size Name:Large FunkyBuys® Large Christmas Hol...,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Christmas-Holiday-Expr...,http://www.amazon.co.uk/Christmas-Holiday-Expr...
2,2,348f344247b0c1a935b1223072ef9d8a,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,ccf,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...,Technical Details Manufacturer recommended age...,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Classic-Train-Lights-B...,http://www.amazon.co.uk/Train-With-Tracks-Batt...
3,4,e33a9adeed5f36840ccc227db4682a36,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hornby,Product Description Hornby RailRoad 0-4-0 Gild...,Technical Details Item Weight159 g Product Dim...,Product Description Hornby RailRoad 0-4-0 Gild...,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Hornby-R6367-RailRoad-...,http://www.amazon.co.uk/Hornby-R2672-RailRoad-...
4,5,cb34f0a84102c1ebc3ef6892d7444d36,20pcs Model Garden Light Double Heads Lamppost...,Generic,These delicate model garden lights are mainly ...,Technical Details Manufacturer recommended age...,These delicate model garden lights are mainly ...,Hobbies > Model Trains & Railway Sets > Lighti...,http://www.amazon.co.uk/Single-Head-Garden-Lig...,http://www.amazon.co.uk/Single-Head-Garden-Lig...
...,...,...,...,...,...,...,...,...,...,...
5952,9985,df6b6fa9e9d4d0994ac3fa67cd3ded71,Master Replicas - Clone Trooper Helmet Scaled ...,Master Replicas,s the Clone Wars raged toward its tumultuous c...,Technical Details Item Weight1.2 Kg Product Di...,s the Clone Wars raged toward its tumultuous c...,Hobbies > Collectible Figures & Memorabilia > ...,http://www.amazon.co.uk/X-Wing-Pilot-Helmet-Sc...,http://www.amazon.co.uk/Master-Replicas-X-Wing...
5953,9987,6b9c92678116a53b8d5a656b64cbcabb,Marauder's map wallscroll - Harry Potter,GGS,This wallscroll will help you to find your way...,Technical Details Item Weight159 g Product Dim...,This wallscroll will help you to find your way...,Characters & Brands > Harry Potter > Toys,http://www.amazon.co.uk/Harry-Potter-Golden-Sn...,http://www.amazon.co.uk/Harry-Potter-Wizarding...
5954,9991,cf75a470360f08eaac9e4d9882999cee,Iron Maiden 8-Inch Eddie 2 Mintutes To Midnigh...,IronMan,Product Description Straight from the cover of...,Technical Details Item Weight181 g Product Dim...,Product Description Straight from the cover of...,Hobbies > Collectible Figures & Memorabilia > ...,http://www.amazon.co.uk/Star-Images-Maiden-Clo...,http://www.amazon.co.uk/Maiden-8-Inch-Eddie-Cl...
5955,9993,fa13bf1bd4a3a98b990a4ee64dcf6eca,Star Wars The Clone Wars CW01 Captain Rex 3.75...,Star Wars,Play the Galactic Battle Game with your favori...,Technical Details Item Weight68 g Product Dime...,Play the Galactic Battle Game with your favori...,Characters & Brands > Star Wars > Toys,http://www.amazon.co.uk/Star-Wars-Clone-Action...,http://www.amazon.co.uk/Star-Wars-Clone-Action...


In [6]:
# Drop smaller categories with less than 20 items
df['cleaned_category'] = df['amazon_category_and_sub_category'].map(lambda x: x.split(">", 1)[0])
df = df.groupby(['cleaned_category']).filter(lambda x : len(x)>20)
df.groupby(['cleaned_category']).count()['product_name']

cleaned_category
Arts & Crafts                 383
Baby & Toddler Toys            88
Characters & Brands           682
Die-Cast & Toy Vehicles       730
Dolls & Accessories           174
Fancy Dress                   395
Figures & Playsets            823
Games                         759
Hobbies                       784
Jigsaws & Puzzles             211
Party Supplies                332
Pretend Play                   31
Puppets & Puppet Theatres     221
Sports Toys & Outdoor         283
Name: product_name, dtype: int64

In [7]:
# Aggregating text columns for mining (note: assumes equal weights)
df['details'] = df['product_name'] + " " + df['description'] + " " + df['product_description'] + " " + df['product_information']

In [8]:
# Text mining: tokenize the key words

from preprocessor import *
from sklearn.utils import resample

df = resample(df, n_samples=100)  # Trimming data set because my CPU is dying

df['cleaned_data'] = df['details'].map(lambda s: preprocess(s))
# df['cleaned_data']


In [9]:
# # Finding the most frequent words
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# transformed_data_count = vectorizer.fit_transform(df['cleaned_data'])
# temp = list(zip(vectorizer.get_feature_names_out(), np.ravel(transformed_data_count.sum(axis=0))))
# sorted(temp, key=lambda x: x[1])[::-1]


In [94]:
# Getting the TF-IDF and appending it to the main dataframe
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
transformed_data = vectorizer.fit_transform(df['cleaned_data'])

# Finding the most popular words according to TF-IDF
temp = list(zip(vectorizer.get_feature_names_out(), np.ravel(transformed_data.sum(axis=0))))
sorted(temp, key=lambda x: x[1])


[('rex', 0.044555511143031315),
 ('badger', 0.051924211496563656),
 ('fabric', 0.051924211496563656),
 ('cupboard', 0.05471726566177081),
 ('dri', 0.05471726566177081),
 ('thunderbird', 0.06104918400372137),
 ('anywher', 0.06292530745924312),
 ('choic', 0.06292530745924312),
 ('deliv', 0.06292530745924312),
 ('flyer', 0.06292530745924312),
 ('mind', 0.06292530745924312),
 ('neat', 0.06292530745924312),
 ('object', 0.06292530745924312),
 ('portabl', 0.06292530745924312),
 ('session', 0.06292530745924312),
 ('stunt', 0.06292530745924312),
 ('amazingli', 0.06316708863581254),
 ('astonish', 0.06316708863581254),
 ('bishop', 0.06316708863581254),
 ('blood', 0.06316708863581254),
 ('born', 0.06316708863581254),
 ('boxwood', 0.06316708863581254),
 ('collector', 0.06316708863581254),
 ('copyright', 0.06316708863581254),
 ('difficult', 0.06316708863581254),
 ('facial', 0.06316708863581254),
 ('fit', 0.06316708863581254),
 ('flow', 0.06316708863581254),
 ('hallmark', 0.06316708863581254),
 ('ind

### K-Means Model

In [11]:
# Load the IDF-TF into a dataframe where each feature is a word
from sklearn.preprocessing import scale

df_transformed_data = pd.DataFrame(transformed_data.toarray(), columns=vectorizer.get_feature_names_out())
df_idf = df.join([df_transformed_data]).fillna(0)
df_idf = df_idf.reset_index()
df_idf.drop(['index'], axis=1)

Unnamed: 0,uniq_id,product_name,manufacturer,description,product_information,product_description,amazon_category_and_sub_category,customers_who_bought_this_item_also_bought,items_customers_buy_after_viewing_this_item,cleaned_category,...,worn,would,wrap,wrist,ye,year,yellow,young,zero,zombi
0,7e2aa2b4596a39ba852449718413d7cc,Hornby Gauge Western Express Digital Train Set...,Hornby,Western Express Digital Train Set with eLink a...,Technical Details Item Weight2.3 Kg Product Di...,Western Express Digital Train Set with eLink a...,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Hornby-Western-Master-...,http://www.amazon.co.uk/Hornby-Western-Master-...,Hobbies,...,0.0,0.028776,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,c3b2f6ec9cf6250c960c26ee8ad33509,Power Trains Freight Industrial (Pack of 4),Power Trains,Product Description 4 collectible cars that ar...,Technical Details Item Weight458 g Product Dim...,Product Description 4 collectible cars that ar...,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Power-Trains-Auto-Load...,http://www.amazon.co.uk/Power-Trains-Auto-Load...,Hobbies,...,0.0,0.006585,0.0,0.125851,0.0,0.0,0.0,0.0,0.0,0.0
2,142f98f4a806c7c5d7bdfc3f2417e865,Hornby '00' Gauge R4526 RailRoad Night Mail Op...,Hornby,Product Description RailRoad Night Mail Operat...,Technical Details Item Weight240 g Product Dim...,Product Description RailRoad Night Mail Operat...,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Hornby-R4524-RailRoad-...,http://www.amazon.co.uk/Hornby-R8221-Gauge-Tra...,Hobbies,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,28c216024887617838117d5ebf57765f,Thomas and Friends Take-n-play(TM) Die-cast Sp...,Fisher Price,SPENCER HEAVY HAUL (Take n Play) Prodotto sped...,Technical Details Item Weight440 g Product Dim...,SPENCER HEAVY HAUL (Take n Play) Prodotto sped...,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Thomas-and-Friends-Tak...,http://www.amazon.co.uk/Thomas-Friends-Take-n-...,Hobbies,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,f1786d361c84a0a7919ff24534a9d8a2,40 dice - 12mm 6 sided spot dice MIXED COLOURS,The Dice Place,Great value! A pack of 40 dice ideal for use i...,Technical Details Product Dimensions1.2 x 1.2 ...,Great value! A pack of 40 dice ideal for use i...,Games > Dice & Dice Games,http://www.amazon.co.uk/BigCherry-Dice-Round-C...,http://www.amazon.co.uk/BigCherry-Dice-Round-C...,Games,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,246e92bedfc3336dfe1cc524193b8526,Melissa & Doug Shape Sequence Sorting Set,Melissa & Doug,"Product Description Recognizing colour, shape ...",Technical Details Item Weight5 g Product Dimen...,"Product Description Recognizing colour, shape ...",Games > Educational Games,http://www.amazon.co.uk/Melissa-and-Doug-Geome...,http://www.amazon.co.uk/Melissa-Doug-Pound-Rol...,Games,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
96,f6c1b598ed38c2af7b9e045778988ffc,Breaking Bad Bobblehead Walter White - Heisenberg,Merchandise 24/7,Breaking Bad Bobblehead Walter White - Heisenb...,Technical Details Item Weight240 g Product Dim...,Breaking Bad Bobblehead Walter White - Heisenb...,Hobbies > Collectible Figures & Memorabilia > ...,http://www.amazon.co.uk/Breaking-Bad-inch-Pink...,http://www.amazon.co.uk/Breaking-Bad-inch-Pink...,Hobbies,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
97,99cf3a94877f9f6bd25ece43a3113cfc,My Little Pony POP Pinkie Pie Figure,My Little Pony,Product Description Design a pony your way wit...,Technical Details Item Weight100 g Product Dim...,Product Description Design a pony your way wit...,Hobbies > Collectible Figures & Memorabilia > ...,http://www.amazon.co.uk/My-Little-Pony-Rarity-...,http://www.amazon.co.uk/My-Little-Pony-Princes...,Hobbies,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
98,c8eeabf3d6892531c3170f0ff4d649b7,HAMA BEADS FROG SINGLE PEGBOARD NO.225,Hama Beads,"Midi Pegboards, size 11,5x12x0,5 cm, 10 pcs",Technical Details Item Weight240 g Product Dim...,"Midi Pegboards, size 11,5x12x0,5 cm, 10 pcs",Arts & Crafts > Children's Craft Kits > Bead A...,http://www.amazon.co.uk/BEADS-DRAGON-SINGLE-PE...,http://www.amazon.co.uk/Hama-Beads-10-000-Buck...,Arts & Crafts,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Assigning each row to a cluster
from sklearn.cluster import KMeans

km = KMeans(n_clusters=14, random_state=1, max_iter=100, init='random', n_init=10)
model = km.fit(transformed_data)
labels = model.predict(transformed_data)

df_idf['cluster'] = labels

array([11,  2, 10,  9,  4, 13,  8,  8,  5,  0,  9,  5,  7,  8,  6,  8, 13,
        0,  7, 10,  8,  8,  9, 10,  0,  5,  3,  0,  7, 12,  1,  0,  7, 12,
        2,  3,  0,  6,  6,  7,  0,  0,  0, 10,  0,  0,  5,  8,  6,  8, 10,
        8,  8,  7,  8,  8,  7,  4,  0,  8,  8,  7, 12, 11,  6,  8,  0, 11,
        1,  0,  8,  3,  0,  7, 11,  8,  9,  8,  7, 12,  7,  8,  7,  8,  8,
        5,  8,  8, 10,  6,  7,  7,  8,  8,  9,  7,  1,  8,  7, 13])

In [13]:
# Printing the contents of each cluster
clusters = {}
n = 0
for item in labels:
    if item in clusters:
        clusters[item].append(df_idf['product_name'][n])
    else:
        clusters[item] = [df_idf['product_name'][n]]
    n += 1

for item in clusters:
    print("Cluster ", item)
    for i in clusters[item]:
        print(i)

Cluster  11
Hornby Gauge Western Express Digital Train Set with eLink and TTS Loco Train Set
Music Superstars ~ 3 inch Scale ~ Kiss Band Set
Vollter Parrot AR Drone 2.0 RC Quadcopter Mounting Mount Tools Kit Original Electronic Accessory
Barbie Sisters Wave Ride
Cluster  2
Power Trains Freight Industrial (Pack of 4)
Rabbit Ride On Shape Sorter Dexterity Cart
Cluster  10
Hornby '00' Gauge R4526 RailRoad Night Mail Operating Mail - GWR Chocolate and Cream Coach
Hot Wheels 17 " Foil Helium Balloon
TAMIYA Military Kit 1:48 32571 Russian Heavy Tank JS-2 Model 1
Black and White Check Flag Race Bunting 10 metres
Andalusian Limited Edition Red Sandal Wood Chess Set 4.25 Inch King- Inspired by Staunton TM
Playmobil Pirate Dinghy
Cluster  9
Thomas and Friends Take-n-play(TM) Die-cast Spencer And The Heavy Load
Bestway 120 x 72 x 22-inches Deluxe Family Pool
Hot Wheels Star Wars Character Car 2-Pack, C-3PO and R2-D2
22 Silicone Reborn Baby Girl Realistic Newborn Baby Doll for Toddlers Xmas Gift
T

In [101]:
user_input = "test apple fun robot cards games toy machine robot large white walter breaking bobblehead"
user_input_cleaned = pd.DataFrame([preprocess(user_input)], columns=["cleaned_data"])
user_input_vectorized = vectorizer.transform(user_input_cleaned["cleaned_data"])

In [102]:
from sklearn.neighbors import KNeighborsRegressor

X_train = transformed_data
y_train = df_idf['cluster']

regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(user_input_vectorized)
y_pred


array([4.4])

### Model Evaluation

Compare against the actual recommendations from Amazon, is the data part of the same cluster? What is the manhattan distance?

In [None]:
# Crawling: Extracting product titles from the URLs of suggested items
from crawler import *

In [None]:
# url_to_product_name(df['items_customers_buy_after_viewing_this_item'][0])

# df = full_dataframe[['product_name', 'manufacturer', 'description', 'product_information', 'product_description', 'amazon_category_and_sub_category', 'customers_who_bought_this_item_also_bought', 'items_customers_buy_after_viewing_this_item']][:10]

# Pre-Processing: Extracting the product names from the URL data
# for i in range(1):
#     if len(df['items_customers_buy_after_viewing_this_item'][i]) > 0:
#         df['items_customers_buy_after_viewing_this_item'][i] = url_to_product_name(df['items_customers_buy_after_viewing_this_item'][i])

# df['items_customers_buy_after_viewing_this_item']

