### Environment Setup

In [None]:
# Load IPython's autoreload extension so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# Read the raw product sample, then narrow it to the columns this notebook uses.
full_dataframe = pd.read_csv("amazon_co-ecommerce_sample.csv")

selected_columns = [
    'uniq_id',
    'product_name',
    'manufacturer',
    'description',
    'product_information',
    'product_description',
    'amazon_category_and_sub_category',
    'customers_who_bought_this_item_also_bought',
    'items_customers_buy_after_viewing_this_item',
]
df = full_dataframe[selected_columns]
# df.head()

### Pre-Processing

* Dropping unneeded features
* Removing empty data
* Removing duplicates
* Remove embedded special characters
* Correct/remove misspelt words
* Remove common words
* Tokenize by white space
* Stemming

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# reset_index() returns a new frame rather than modifying df, so the result has
# to be assigned; drop=True discards the old labels instead of re-inserting
# them as an extra 'index' column.
df = df.dropna(how='any', axis=0).reset_index(drop=True)
# df.shape

In [None]:
# Keep only categories that contain more than 20 products (20 or fewer are dropped).
# The top-level category is the text before the first ">"; it is stripped so that
# "Hobbies " (from "Hobbies > ...") and a bare "Hobbies" count as the same category.
# assign() builds a new frame instead of writing a column into a frame derived
# from another one, which can trigger pandas' SettingWithCopyWarning.
df = df.assign(
    cleaned_category=(
        df['amazon_category_and_sub_category']
        .str.split(">", n=1)
        .str[0]
        .str.strip()
    )
)
df = df.groupby('cleaned_category').filter(lambda group: len(group) > 20)

# Number of products left in each remaining category.
df.groupby('cleaned_category')['product_name'].count()

In [None]:
# Build one text field per product for mining: the name followed by the three
# descriptive columns, separated by single spaces (all columns weighted equally).
trailing_text_columns = ['description', 'product_description', 'product_information']
df['details'] = df['product_name'].str.cat(
    [df[column] for column in trailing_text_columns],
    sep=" ",
)

In [None]:
# Text mining: tokenize the key words

# NOTE(review): wildcard import kept so every name it exports stays available;
# only `preprocess` is used in the visible cells -- consider importing it explicitly.
from preprocessor import *
from sklearn.utils import resample

# Work on a small subset to keep preprocessing cheap.
# resample() defaults to replace=True and random_state=None, which would put
# duplicate products in the subset and give a different subset on every run.
# Sample without replacement, with a fixed seed, and never ask for more rows
# than exist (replace=False raises if n_samples exceeds the number of rows).
SAMPLE_SIZE = 100
df = resample(
    df,
    replace=False,
    n_samples=min(SAMPLE_SIZE, len(df)),
    random_state=1,
).reset_index(drop=True)  # row position == index label from here on

# presumably preprocess() returns the cleaned text as a single string, which is
# what the vectorizer in the next cell consumes -- TODO confirm in preprocessor.py
df = df.assign(cleaned_data=df['details'].map(preprocess))
# df['cleaned_data']


In [None]:
# # Finding the most frequent words
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# transformed_data_count = vectorizer.fit_transform(df['cleaned_data'])
# temp = list(zip(vectorizer.get_feature_names_out(), np.ravel(transformed_data_count.sum(axis=0))))
# sorted(temp, key=lambda x: x[1])[::-1]


In [None]:
# Fit TF-IDF on the cleaned text; `vectorizer` and `transformed_data` are reused
# by the clustering cells below.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
transformed_data = vectorizer.fit_transform(df['cleaned_data'])

# Rank every vocabulary word by its TF-IDF weight summed over all documents,
# highest first, and display only the top of the ranking rather than the whole
# vocabulary.
TOP_N_WORDS = 25
word_scores = sorted(
    zip(vectorizer.get_feature_names_out(), np.ravel(transformed_data.sum(axis=0))),
    key=lambda pair: pair[1],
    reverse=True,
)
word_scores[:TOP_N_WORDS]


### K-Means Model

In [None]:
# Load the TF-IDF matrix into a dataframe where each feature is a word
from sklearn.preprocessing import scale  # NOTE(review): not used in the visible cells

df_transformed_data = pd.DataFrame(
    transformed_data.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Row i of the TF-IDF matrix corresponds to row i of df by position.
assert len(df_transformed_data) == len(df), "TF-IDF matrix and df are out of sync"

# DataFrame.join aligns on index labels, and df_transformed_data is labelled
# 0..n-1. df may still carry labels left over from earlier dropping/filtering/
# sampling, so its index is reset first; otherwise products are paired with the
# wrong TF-IDF rows (or with none). drop=True also avoids a stray 'index' column.
# rsuffix renames a word feature that happens to share a name with an existing
# df column (e.g. 'details') instead of failing or clobbering it.
df_idf = df.reset_index(drop=True).join(df_transformed_data, rsuffix='_tfidf')
df_idf.head()

In [None]:
# Cluster the TF-IDF vectors with K-Means and record each row's cluster id
from sklearn.cluster import KMeans

kmeans_settings = dict(
    n_clusters=14,
    init='random',
    max_iter=100,
    random_state=1,
)
km = KMeans(**kmeans_settings)

# Fit on the TF-IDF matrix, then ask the fitted model for every row's cluster.
model = km.fit(transformed_data)
labels = model.predict(transformed_data)

df_idf['cluster'] = labels

In [None]:
# Collect the product names of each cluster (clusters in order of first
# appearance, products in row order), then print them cluster by cluster.
clusters = {}
for cluster_id, product_name in zip(labels, df_idf['product_name']):
    clusters.setdefault(cluster_id, []).append(product_name)

for cluster_id, product_names in clusters.items():
    print("Cluster ", cluster_id)
    for product_name in product_names:
        print(product_name)

### Model Evaluation

Compare the clusters against the actual recommendations from Amazon: are the recommended items part of the same cluster? What is the Manhattan distance?

In [None]:
# Crawling: Extracting product titles from the URLs of suggested items
from crawler import *

In [None]:
# url_to_product_name(df['items_customers_buy_after_viewing_this_item'][0])

# df = full_dataframe[['product_name', 'manufacturer', 'description', 'product_information', 'product_description', 'amazon_category_and_sub_category', 'customers_who_bought_this_item_also_bought', 'items_customers_buy_after_viewing_this_item']][:10]

# Pre-Processing: Extracting the product names from the URL data
# for i in range(1):
#     if len(df['items_customers_buy_after_viewing_this_item'][i]) > 0:
#         df['items_customers_buy_after_viewing_this_item'][i] = url_to_product_name(df['items_customers_buy_after_viewing_this_item'][i])

# df['items_customers_buy_after_viewing_this_item']

