In [553]:
import json
from collections import defaultdict
import gzip
import pandas as pd
from lxml import html,etree
import numpy as np
import ipywidgets as widgets
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer

# set stopwords vocabulary
nltk.download('stopwords')

# set tokenizer
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [554]:
### load the meta data
data = []
with gzip.open('Dataset/meta_Software.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print("Total number of items in the dataset: ", len(data))

# first row of the list
print(data[0])

Total number of items in the dataset:  26790
{'category': [], 'tech1': '', 'description': [], 'fit': '', 'title': 'HOLT PHYSICS LESSON PRESENTATION CD-ROM QUICK CONCEPTS', 'also_buy': [], 'tech2': '', 'brand': 'HOLT. RINEHART AND WINSTON', 'feature': [], 'rank': '25,550 in Software (', 'also_view': [], 'main_cat': 'Software', 'similar_item': '', 'date': '</div>', 'price': '.a-box-inner{background-color:#fff}#alohaBuyBoxWidget .selected{background-color:#fffbf3;border-color:#e77600;box-shadow:0 0 3px rgba(228,121,17,.5)}#alohaBuyBoxWidget .contract-not-available{color:gray}#aloha-cart-popover .aloha-cart{height:auto;overflow:hidden}#aloha-cart-popover #aloha-cartInfo{float:left}#aloha-cart-popover #aloha-cart-details{float:right;margin-top:1em}#aloha-cart-popover .deviceContainer{width:160px;float:left;padding-right:10px;border-right:1px solid #ddd}#aloha-cart-popover li:last-child{border-right:0}#aloha-cart-popover .aloha-device-title{height:3em;overflow:hidden}#aloha-cart-popover .alo

In [555]:
# convert list into pandas dataframe

df = pd.DataFrame.from_dict(data)

print(len(df))
print(df)


pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_rows', 20 )

26790
                                                                                                                                                                                                                                                                                                          category  \
0                                                                                                                                                                                                                                                                                                               []   
1                                                                                                                                                                                                                                                                                                               []   
2                                                               

In [556]:
### load the for "Software" category:

data2 = []
with gzip.open('Dataset/meta_Software.json.gz') as f:
    for l in f:
        data2.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print("Total number of items in the dataset: ", len(data2))


Total number of items in the dataset:  26790


In [557]:
# convert list into pandas dataframe

df2 = pd.DataFrame.from_dict(data2)

# Features of the dataset
df2.columns

# Features are slightly different than 


Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')

In [558]:
# df2.head()

In [559]:
# Some products have multiple descriptions containing information of for instance the author.
# Perhaps only the first description string should be analyzed for our purpose? This seems to be primarily for the product.
# For now i have just merged all descriptions into a single string for analysis.

# Drop rows with no description (empty list)
df2 = df2[df2['description'].map(lambda d: len(d)) > 0]


In [560]:

df2.description = df2.description.apply(lambda x: [string for string in x if string != ""])
df2.description = df2.description.apply(lambda x: " ".join(x))
pd.set_option('display.max_rows', 20)


In [561]:
# A lot of the descriptions (and other features) contain HTML.
# The function parses and "translates" into plain text descriptions more suitable for analysis

def strip_html(s):
    stop_words = set(stopwords.words('english'))
    stemmer= PorterStemmer()
    if not s or s.isspace(): 
        return ''
    try:
        # remove html tags 
        strr = str(html.fromstring(s).text_content())
        # remove URLs
        strr = re.sub(r"(https|http|href)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b", ' ', strr)
        # remove html hidden carachters 
        strr = strr.replace('\n', ' ').replace('\t', ' ').replace("&nbsp", ' ').replace('\r', ' ')
        # remove punctuation
        strr = re.sub(r'[^\w\s]', ' ', strr)
        # remove numbers
        strr = re.sub(r'\d+', '', strr)
        # lowercase
        strr = strr.lower()
        # remove stop words
        tokens = nltk.word_tokenize(strr)
        strr = [i for i in tokens if not i in stop_words]
        # stemming
        strr = [stemmer.stem(word) for word in strr]
        strr = ' '.join(strr)
        return strr
        # return str(html.fromstring(s).text_content(s))
    except etree.ParserError: # I am not able to find out why the error occur so i continued by catching the exception. Seem to happen on some empty description strings 
        return ''

f = open("descriptionHTMLbefore.txt", "w")
for i in range(5000):
    # print(df_asin_description.iloc[i].description)
    # print(type(df_asin_description.iloc[i].description))
    
    f.write(df2.iloc[i].description)
    # print("similarity: ", df_asin_description.iloc[i].similarity)
    # print("asin: ", df_asin_description.iloc[i].asin)
    # print("")
f.close()

# download stopwords
# nltk.download('stopwords')


print(df2.description.iloc[0:2])
df2.description = df2.description.apply(lambda x: strip_html(x))
print(df2.description.iloc[0:2])

f = open("descriptionHTMLafter.txt", "w")
for i in range(5000):
    # print(df_asin_description.iloc[i].description)
    # print(type(df_asin_description.iloc[i].description))
    
    f.write(df2.iloc[i].description)
    # print("similarity: ", df_asin_description.iloc[i].similarity)
    # print("asin: ", df_asin_description.iloc[i].asin)
    # print("")
f.close()





1    <b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids a...
2    <b>Connect is the only integrated learning system that empowers students by continuously adapting to deliver precisely what they need, when they need it, how they need it, so that your class time is more engaging and effective.</b><br /> Kelly Cowan just celebrated her 20th anniversary at Miami ...
Name: description, dtype: object
1    latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...
2    connect integr learn system empow student conti

In [562]:
# input = "yewfvn8934fwejnchlvu8h34;vjcek.bi;/kpofu90[q89y7o2g4uioeprwhttps://dsjkvby8ft7ogy3jkn2puvg4nchlvu8h34;vjcek.bi;/kpofsd"
# print(strip_html(input))

In [563]:
# # stop_words = set(stopwords.words('english'))
# # print(stop_words)
# input = "she likes to go to take coffee every morning while coming back home"
# stemmer= PorterStemmer()
# print(stemmer.stem(input))
# print(stemmer.stem("every"))

# tokens = nltk.word_tokenize(input)
# # strr = [i for i in tokens if not i in stop_words]
# strr = [stemmer.stem(word) for word in tokens]  
# print(strr)

### Does any product contain different descriptions?  

In [564]:
# Counting occurence of unique "asin"
asin_count = df2['asin'].value_counts()
# print(asin_count)
asin_more_than_once = asin_count[asin_count > 1].index
# print(asin_more_than_once)
# Step 2: Filter df2 to keep rows where 'asin' is in asin_more_than_once
filtered_df = df2[df2['asin'].isin(asin_more_than_once)]
filtered_df = filtered_df[["asin","description"]].sort_values(by="asin")


In [565]:
# Visual confirmation of duplicates 
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
6028,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
878,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
6029,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
879,B00000J0GM,magellan gp datasend cd rom data manag softwar transfer point interest data gp point use datasend cd rom magellan gp gp global posit system receiv find north american point interest use product first find inform need datasend cd use pc laptop cd rom drive among point interest avail cd museum amu...
...,...,...
6025,B0005MYHMU,discov histori klondik gold rush juli unit state throe depress peopl begin lose hope north came astonish news gold rush ten thousand peopl caught gold fever set remot yukon area northwest canada join rushth yukon trail begin wharf seattl washington jump point thousand stamped your smart persist ...
6027,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...
11178,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...
6026,B0005MYJ0A,cd softwar teach geographi africa engag enjoy comput learn environ


In [566]:
# If "asin" and "description" match -> drop
filtered_df.drop_duplicates(inplace=True)

# How many unique "asin" ?
len(filtered_df.asin.unique())
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
878,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
879,B00000J0GM,magellan gp datasend cd rom data manag softwar transfer point interest data gp point use datasend cd rom magellan gp gp global posit system receiv find north american point interest use product first find inform need datasend cd use pc laptop cd rom drive among point interest avail cd museum amu...
881,B00000JIXG,academ version microsoft offic microsoft offic establish posit effici suit applic document creation commun busi inform analysi mani function busi platform evolv paper web microsoft offic extend desktop product web streamlin way work make easier share access analyz inform get better result offic ...
880,B00000JIXM,profession upgrad includ microsoft word word processor microsoft excel spreadsheet microsoft publish desktop publish microsoft access databas manag microsoft powerpoint present graphic mani featur microsoft offic establish posit effici suit applic document creation commun busi inform analysi man...
...,...,...
6024,B0004N37GW,protect child inappropri emailproduct informationth softwar never young plug toth wire world e commun kidmail safeti perfect way letchildren young year old join onlin convers anim desktop theme featur everyth dinosaur rocket shipskidmail safeti allow youngster dress email theme iconsbackground i...
11174,B0004O05WK,rollercoast tycoon time twister expans pack
11176,B0005MYHMU,discov histori klondik gold rush juli unit state throe depress peopl begin lose hope north came astonish news gold rush ten thousand peopl caught gold fever set remot yukon area northwest canada join rushth yukon trail begin wharf seattl washington jump point thousand stamped your smart persist ...
6027,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...


Removing the duplicates

In [567]:
df_asin_description = df2[["asin","description"]].copy()
df_asin_description.drop_duplicates(inplace=True)
# print(len(df_asin_description))
df_asin_description

Unnamed: 0,asin,description
1,0071480935,latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...
2,007329506X,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...
4,0073525758,anatomi physiolog reveal cat ultim onlin interact cat dissect experi state art program use cat photo combin layer techniqu allow student peel away layer cat reveal structur beneath surfac anatomi physiolog reveal cat also offer anim histolog radiolog imag audio pronunci comprehens quizz use part...
5,0077340701,john coburn grew hawaiian island seventh sixteen children receiv associ art degre windward commun colleg graduat honor receiv bachelor degre educ univers hawaii lure busi world five year return first love accept teach posit high school mathemat recogn teacher year soon afterward decis made seek ...
6,0077369823,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...
...,...,...
26785,B01HEFZJC2,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...
26786,B01HEFZKEE,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...
26787,B01HF3G4BS,mac internet secur x contain two best sell secur product protect mac malwar network attack intego virusbarri x intego netbarri x togeth ensur mac protect adwar malwar stranger unknown applic tri get design specif mac provid around clock protect detect divers array threat make sure mac given best...
26788,B01HF41TKI,versacheck x quickbook dna secur compliant us canadian bank requir compat versacheck versaink versaton


Creating shingles

In [568]:
def shingle(aString, q, delimiter=' '):
    """
    Input:
        - aString (str): string to split into shingles
        - q (int)
        - delimiter (str): string of the delimiter to consider to split the input string (default: space)
    Return: list of unique shingles
    """
    all_shingles = []
    if delimiter != '':
        words_list = aString.split(delimiter)
    else:
        words_list = aString
    for i in range (len(words_list)-q+1):
        all_shingles.append(delimiter.join(words_list[i:i+q]))
    return list(set(all_shingles))

In [569]:
# df_asin_description
# Apply shingles to the df_asin_description
df_asin_description["shingles"] = df_asin_description["description"].apply(lambda x: shingle(x, 2))
# df_asin_description

Computing similarity with Jaqquard approach

In [570]:
def similarity(intersection_set, union_set):
    return len(intersection_set)/len(union_set)


In [571]:
input = "Latin Rhythms for Kids is an engaging educational resource designed to facilitate Spanish language learning for children aged four to eight. The program utilizes a combination of two proven methods for language development: imitation and singing. The DVD comprises 16 music videos featuring children participating in enjoyable activities, such as zoo visits and exploring grandma's closet. Each video showcases original songs with authentic Latin rhythms, encouraging kids to sing along. The lyrics, consisting of 300 Spanish words, are displayed on screen as subtitles, providing a visual aid for language absorption. The DVD aims to make language learning entertaining and effective, leveraging the natural affinity children have for singing and imitation. The content is designed to capture children's attention, fostering a fun and immersive environment for language acquisition. Additionally, the program incorporates Connect, an integrated learning system that adapts to students' needs, making the learning process engaging and efficient.Latin Rhythms for Kids is a valuable tool for parents and educators seeking an interactive and enjoyable approach to introducing Spanish language skills to young children. Through a combination of music, visuals, and adaptability, the program aims to create a positive language learning experience."
input = strip_html(input)
input_user = shingle(input, 2)  
# print(input_user)
# print(df_asin_description.shingles.iloc[0])
intersection_jaqquard = set(input_user).intersection(set(df_asin_description.shingles.iloc[0]))
union_jaqquard = set(input_user).union(set(df_asin_description.shingles.iloc[0]))
# print(intersection_jaqquard)
# print(union_jaqquard)
sim = similarity(intersection_jaqquard, union_jaqquard)
print(sim)




0.0903954802259887


In [572]:
# df_asin_description
df_asin_description["similarity"] = df_asin_description["shingles"].apply(lambda x: similarity(set(input_user).intersection(set(x)), set(input_user).union(set(x))))
df_asin_description


Unnamed: 0,asin,description,shingles,similarity
1,0071480935,latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...,"[step learn, contain music, engag fun, get kid, watch learn, learn spanish, along kid, skill imit, music video, along children, spanish sing, combin two, two time, take giant, use develop, watch listen, spanish combin, method kid, develop languag, giant step, alway use, featur kid, zoo compar, i...",0.090395
2,007329506X,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...,"[effect kelli, student fell, pursu undergradu, continu adapt, situat exclaim, microbiolog class, hear nurs, middletown open, especi pre, deliv precis, nurs dental, encount everyday, hygienist encount, need need, kelli cowan, admiss campu, univers groningen, alli health, engag effect, campu ohio,...",0.015957
4,0073525758,anatomi physiolog reveal cat ultim onlin interact cat dissect experi state art program use cat photo combin layer techniqu allow student peel away layer cat reveal structur beneath surfac anatomi physiolog reveal cat also offer anim histolog radiolog imag audio pronunci comprehens quizz use part...,"[human anatomi, north texa, book jacki, combin layer, audio pronunci, quizz use, also offer, histolog radiolog, experi state, photo combin, onlin interact, alon combin, cat ultim, cours anatomi, ultim onlin, cat reveal, imag audio, offer anim, structur beneath, cat also, semest undergradu, part ...",0.000000
5,0077340701,john coburn grew hawaiian island seventh sixteen children receiv associ art degre windward commun colleg graduat honor receiv bachelor degre educ univers hawaii lure busi world five year return first love accept teach posit high school mathemat recogn teacher year soon afterward decis made seek ...,"[nomin outstand, mathemat educ, year mathemat, megsl made, teacher year, world five, america teacher, fifteen year, oklahoma last, year teach, professor tenur, colleg graduat, come write, associ art, floriss valley, windward commun, coburn grew, athlet game, teach posit, seek master, serv make, ...",0.008368
6,0077369823,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...,"[effect kelli, student fell, pursu undergradu, continu adapt, situat exclaim, microbiolog class, hear nurs, middletown open, especi pre, deliv precis, nurs dental, encount everyday, hygienist encount, need need, kelli cowan, admiss campu, univers groningen, alli health, engag effect, campu ohio,...",0.015957
...,...,...,...,...
26785,B01HEFZJC2,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...,"[perfect occas, detail real, engag parti, avoid collis, cloth clean, silver necklac, plate design, collis avoid, design look, alkali corros, friend warm, real silver, corros substanc, long last, avoid surfac, fabul detail, etc pleas, style cloth, date wed, collect perfect, gift girl, girl lover,...",0.000000
26786,B01HEFZKEE,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...,"[perfect occas, detail real, engag parti, avoid collis, cloth clean, silver necklac, plate design, collis avoid, design look, alkali corros, friend warm, real silver, corros substanc, long last, avoid surfac, fabul detail, etc pleas, style cloth, date wed, collect perfect, gift girl, girl lover,...",0.000000
26787,B01HF3G4BS,mac internet secur x contain two best sell secur product protect mac malwar network attack intego virusbarri x intego netbarri x togeth ensur mac protect adwar malwar stranger unknown applic tri get design specif mac provid around clock protect detect divers array threat make sure mac given best...,"[threat make, two best, x macintosh, sell secur, internet secur, duoram none, x togeth, make sure, mac os, possibl secur, time mac, gbvideo card, sure mac, mac maverick, network attack, os sierra, given best, os x, mac malwar, get design, none specifiedhard, card none, none specifiedsupport, spe...",0.000000
26788,B01HF41TKI,versacheck x quickbook dna secur compliant us canadian bank requir compat versacheck versaink versaton,"[versacheck versaink, versaink versaton, versacheck x, us canadian, bank requir, x quickbook, quickbook dna, compliant us, requir compat, dna secur, secur compliant, compat versacheck, canadian bank]",0.000000


Dataframe sorted by similarity

In [580]:

df_asin_description.sort_values(by="similarity", ascending=False, inplace=True)
df_asin_description.columns
# df_asin_description[:11].to_csv('10RecommendedItems.csv', index=False)

Index(['asin', 'description', 'shingles', 'similarity'], dtype='object')

In [574]:
# df_asin_description
f = open("demofile2.txt", "a")
for i in range(50):
    # print(df_asin_description.iloc[i].description)
    # print(type(df_asin_description.iloc[i].description))
    
    f.write(df_asin_description.iloc[i].description)
    # print("similarity: ", df_asin_description.iloc[i].similarity)
    # print("asin: ", df_asin_description.iloc[i].asin)
    # print("")
f.close()
print(df_asin_description.similarity.iloc[0:5000])

1        0.090395
22       0.021127
41       0.020548
39       0.018868
1993     0.018605
           ...   
17202    0.000000
26721    0.000000
17203    0.000000
17204    0.000000
17205    0.000000
Name: similarity, Length: 5000, dtype: float64


In [575]:
widgets.Text(value='', disabled=False)

Text(value='')

In [576]:
# Merge description to reviews data using 'asin'

# merged_df = df.merge(df2[['asin', 'description']], on='asin', how='left')