In [334]:
# Import Libraries

import re
import pickle
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources this notebook relies on: tokenizer models
# ('punkt_tab'), WordNet data ('wordnet', 'omw-1.4') and the English
# POS-tagger model ('averaged_perceptron_tagger_eng').
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [335]:
# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('data_clean.csv')

## Preprocessing

In [336]:
# Remove the boilerplate prefix 'Customers find these' from customer_say.
# regex=False makes the match literal regardless of the installed pandas
# version's default for `regex`.
data['customer_say'] = data['customer_say'].str.replace('Customers find these', '', regex=False)

# Replace the placeholder 'No summary available.' with 'None'.
# regex=False keeps the trailing '.' a literal dot instead of a wildcard.
data['customer_say'] = data['customer_say'].str.replace('No summary available.', 'None', regex=False)

# Combine customer_say and individual_review into the 'summary' column.
# Missing values are treated as empty text: otherwise a single missing field
# turns the whole summary into NaN, which later breaks .apply(textCleaning).
data['summary'] = data['customer_say'].fillna('') + ' ' + data['individual_review'].fillna('')

In [337]:
# cleaning
# lowercase, remove digits and punctuation, strip leading/trailing whitespace
def textCleaning(text):
    """Normalise a piece of raw text before tokenization.

    Steps, in order: lowercase, delete runs of digits, delete every character
    that is neither a word character nor whitespace (underscores count as
    word characters and are kept), then strip leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text. Non-string input (e.g. ``None`` or the float ``NaN`` pandas
        uses for missing values) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, or ``''`` for non-string input.
    """
    # Guard against missing values: float NaN / None have no .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

In [338]:
# clean summary column: overwrite 'summary' with the output of textCleaning for every row
data['summary'] = data['summary'].apply(textCleaning)

In [339]:
# lemmatize
def get_wordnet_pos(tag):
    """Map a POS tag to a single-letter WordNet POS code.

    Tags starting with 'J', 'V', 'N' or 'R' map to 'a', 'v', 'n' and 'r'
    respectively; every other tag falls back to 'n'.
    """
    prefix_to_pos = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    return 'n'

#lemmatization with pos_tag
def lemmatize(text):
    """Tokenize ``text``, POS-tag the tokens and return their lemmas as a list.

    Each token is lemmatized with the WordNet POS code that
    ``get_wordnet_pos`` derives from the token's tag.
    """
    wnl = WordNetLemmatizer()
    tagged = pos_tag(word_tokenize(text))
    lemmas = []
    for word, tag in tagged:
        lemmas.append(wnl.lemmatize(word, get_wordnet_pos(tag)))
    return lemmas

In [340]:
# Build `processed` from `data` without the raw text columns (they are already
# merged into 'summary'). drop() returns a new frame, so `data` is left untouched
# and re-running this cell is safe.
processed = data.drop(columns=['customer_say', 'individual_review'])

# Convert price to float using the first number found in each cell.
# The pattern also accepts properly grouped thousands separators ("1,299.99");
# the plain r'(\d+\.?\d*)' pattern would stop at the comma and yield 1.0.
# NOTE(review): assumes ',' is a thousands separator, never a decimal comma -
# confirm against the data. Cells without any number become NaN.
processed['price'] = (
    processed['price']
    .astype(str)
    .str.extract(r'(\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+\.?\d*)', expand=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Convert rating to float using the first number found in each cell
# (cells without a number become NaN).
processed['rating'] = (
    processed['rating']
    .astype(str)
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype(float)
)

In [341]:
# Normalise the brand column: any value containing a keyword on the left is
# replaced by the canonical brand name on the right (checked in this order).
brand_keywords = [
    ('Adidas', 'Adidas'),
    ('Nike', 'Nike'),
    ('PUMA', 'Puma'),
    ('New Balance', 'New Balance'),
    ('Reebok', 'Reebok'),
]
for keyword, canonical in brand_keywords:
    has_keyword = processed['brand'].str.contains(keyword, na=False)
    processed.loc[has_keyword, 'brand'] = canonical

In [342]:
# Lemmatize every summary; 'token' holds the list of lemmas for each row.
processed['token'] = processed['summary'].map(lemmatize)

# Join each lemma list back into one space-separated string ('lemmatized').
processed['lemmatized'] = processed['token'].map(' '.join)

In [343]:
# use the 'lemmatized' text column as the corpus the TF-IDF vectorizer is fitted on
corpus = processed['lemmatized']

## Modeling

In [344]:
# Build a TF-IDF vectorizer (using scikit-learn's built-in English stop-word
# list) and fit it on the lemmatized corpus. tfidf_matrix is the transformed
# corpus: one row per corpus entry, in corpus order.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

In [345]:
# Display the fitted vectorizer (notebook repr) to check its configuration.
tfidf_vectorizer

In [346]:
# get new sentence vector with tfidf model
def get_sentence_vector(sentence):
    """Return the TF-IDF vector of ``sentence`` under the fitted ``tfidf_vectorizer``.

    The sentence goes through the same preprocessing as the corpus the
    vectorizer was fitted on: ``textCleaning`` (lowercase, digits and
    punctuation removed) followed by ``lemmatize``. Without the cleaning step,
    capitalised or punctuated free-text queries may be tagged and lemmatized
    differently from the corpus and then fail to match its vocabulary.
    Cleaning text that is already clean leaves it unchanged.

    Parameters
    ----------
    sentence : str
        Free text to vectorize.

    Returns
    -------
    The result of ``tfidf_vectorizer.transform`` for the single processed
    sentence.
    """
    cleaned = textCleaning(sentence)
    tokens = lemmatize(cleaned)
    return tfidf_vectorizer.transform([' '.join(tokens)])

In [347]:
# Store one TF-IDF row vector per summary.
# tfidf_matrix already holds the vector of every row of the corpus (which is
# processed['lemmatized'], in the same order), so its rows are reused here.
# Calling get_sentence_vector on the already-lemmatized text would lemmatize it
# a second time, giving document vectors that are inconsistent with the text
# the vectorizer was fitted on and with query vectors (lemmatized only once).
assert tfidf_matrix.shape[0] == len(processed), "tfidf_matrix is out of sync with processed"

# Fill an object array element by element so each cell holds one sparse row;
# the i:i+1 slice keeps every row two-dimensional (1 x vocabulary size).
row_vectors = np.empty(tfidf_matrix.shape[0], dtype=object)
for row_idx in range(tfidf_matrix.shape[0]):
    row_vectors[row_idx] = tfidf_matrix[row_idx:row_idx + 1]
processed['sentence_vector'] = row_vectors

In [348]:
# Columns needed at inference time: product metadata plus the TF-IDF vector.
inference = processed.loc[:, ['product', 'rating', 'price', 'brand', 'gender', 'sentence_vector']]

In [362]:
# Same selection as `inference` but without the 'sentence_vector' column;
# this is the frame written to 'inference.csv' at the end of the notebook.
df_inference = processed[['product', 'rating', 'price', 'brand', 'gender']]

In [349]:
# Show the distinct values of the brand column after the brand clean-up.
inference.brand.unique()

array(['Adidas', 'Nike', 'New Balance', 'Puma', 'Reebok'], dtype=object)

In [350]:
# Example query used to exercise the filtering and ranking cells below.
try_inference = {
    # free-text description; vectorized with get_sentence_vector
    'input' :'shoes with high stability and good support',
    # list of accepted brand values (matched with isin)
    'brand' : ['Adidas', 'Puma'],
    # exact value the 'gender' column must equal
    'gender' : 'Women',
    # inclusive price bounds (price >= price_lower and price <= price_upper)
    'price_upper' : 100,
    'price_lower' : 0
}

In [351]:
# Keep only the rows that satisfy every constraint in 'try_inference':
# brand in the accepted list, matching gender, price within the inclusive bounds.
brand_ok = inference['brand'].isin(try_inference['brand'])
gender_ok = inference['gender'] == try_inference['gender']
above_lower = inference['price'] >= try_inference['price_lower']
below_upper = inference['price'] <= try_inference['price_upper']
filtered = inference[brand_ok & gender_ok & above_lower & below_upper].reset_index(drop=True)

# Vectorize the free-text query with the fitted TF-IDF model.
input_vector = get_sentence_vector(try_inference['input'])

In [352]:
# Score every candidate row: cosine similarity between its stored TF-IDF
# vector and the query vector (the single entry of the returned matrix).
def _similarity_to_query(doc_vector):
    return cosine_similarity(doc_vector, input_vector)[0][0]

filtered['similarity'] = filtered['sentence_vector'].map(_similarity_to_query)

In [353]:
# get the 10 rows with the highest similarity, best match first
# (fewer than 10 rows if the filter left fewer candidates)
top_10 = filtered.sort_values(by='similarity', ascending=False).head(10)

In [354]:
# Display the ranked candidates, including their raw sparse vectors and similarity scores.
top_10

Unnamed: 0,product,rating,price,brand,gender,sentence_vector,similarity
35,adidas Women's Crazychaos 2000,4.6,60.0,Adidas,Women,"(0, 8)\t0.19391173055157976\n (0, 183)\t0.1...",0.128516
19,PUMA Womens Prowl Slip-on Shine Wn'sCross Trainer,4.7,51.0,Puma,Women,"(0, 66)\t0.16962972980171534\n (0, 106)\t0....",0.116938
54,adidas Womens Cloudfoam Pure SportswearSneaker,4.3,40.0,Adidas,Women,"(0, 106)\t0.10137149064961777\n (0, 111)\t0...",0.111237
61,adidas Womens X_PLR Path,4.4,60.0,Adidas,Women,"(0, 360)\t0.14321936982557104\n (0, 397)\t0...",0.110624
25,PUMA Womens Twitch RunnerSneaker,4.4,30.0,Puma,Women,"(0, 103)\t0.0958763903706855\n (0, 106)\t0....",0.102828
44,adidas Women's Lite Racer 4.0,4.6,39.0,Adidas,Women,"(0, 106)\t0.10834307208320354\n (0, 111)\t0...",0.10011
9,PUMA Women's Riaze Prowl Cross Trainer,4.4,50.0,Puma,Women,"(0, 9)\t0.1522107327840787\n (0, 17)\t0.137...",0.098279
38,adidas Women's Eq21 Run,4.4,46.0,Adidas,Women,"(0, 17)\t0.2790196833192166\n (0, 39)\t0.21...",0.097259
8,PUMA Women's Pulse Pro,4.4,54.0,Puma,Women,"(0, 169)\t0.3216532699542198\n (0, 360)\t0....",0.09725
40,adidas Women's Kantai,4.7,80.0,Adidas,Women,"(0, 40)\t0.279156632048325\n (0, 83)\t0.103...",0.089491


In [355]:
# Fetch the full rows of `data` for the products in top_10, ordered by their
# similarity rank (best match first). A plain isin() filter returns the rows
# in `data` order, which discards the ranking computed for top_10.
rank_by_product = {}
for position, product_name in enumerate(top_10['product']):
    # setdefault keeps the best (first) rank if a product name repeats in top_10
    rank_by_product.setdefault(product_name, position)

result = (
    data[data['product'].isin(list(rank_by_product))]
    # stable sort: rows sharing a product name keep their original relative order
    .sort_values(by='product', key=lambda names: names.map(rank_by_product), kind='stable')
    .reset_index(drop=True)
)

In [364]:
# save df_inference (product, rating, price, brand, gender - no sentence vectors)
# to 'inference.csv' in the working directory, without the index column
df_inference.to_csv('inference.csv', index=False)