In [409]:
import json
from collections import defaultdict
import gzip
import pandas as pd
from lxml import html,etree
import numpy as np
import ipywidgets as widgets
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# set stopwords vocabulary
nltk.download('stopwords')

# set tokenizer
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [410]:
### load the meta data
data = []
with gzip.open('Dataset/meta_Software.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print("Total number of items in the dataset: ", len(data))

# first row of the list
print(data[0])

Total number of items in the dataset:  26790
{'category': [], 'tech1': '', 'description': [], 'fit': '', 'title': 'HOLT PHYSICS LESSON PRESENTATION CD-ROM QUICK CONCEPTS', 'also_buy': [], 'tech2': '', 'brand': 'HOLT. RINEHART AND WINSTON', 'feature': [], 'rank': '25,550 in Software (', 'also_view': [], 'main_cat': 'Software', 'similar_item': '', 'date': '</div>', 'price': '.a-box-inner{background-color:#fff}#alohaBuyBoxWidget .selected{background-color:#fffbf3;border-color:#e77600;box-shadow:0 0 3px rgba(228,121,17,.5)}#alohaBuyBoxWidget .contract-not-available{color:gray}#aloha-cart-popover .aloha-cart{height:auto;overflow:hidden}#aloha-cart-popover #aloha-cartInfo{float:left}#aloha-cart-popover #aloha-cart-details{float:right;margin-top:1em}#aloha-cart-popover .deviceContainer{width:160px;float:left;padding-right:10px;border-right:1px solid #ddd}#aloha-cart-popover li:last-child{border-right:0}#aloha-cart-popover .aloha-device-title{height:3em;overflow:hidden}#aloha-cart-popover .alo

In [411]:
# convert list into pandas dataframe

df = pd.DataFrame.from_dict(data)

print(len(df))
print(df)


pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_rows', 20 )

26790
                                                                                                                                                                                                                                                                                                          category  \
0                                                                                                                                                                                                                                                                                                               []   
1                                                                                                                                                                                                                                                                                                               []   
2                                                               

In [412]:
### load the for "Software" category:

data2 = []
with gzip.open('Dataset/meta_Software.json.gz') as f:
    for l in f:
        data2.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print("Total number of items in the dataset: ", len(data2))


Total number of items in the dataset:  26790


In [413]:
# convert list into pandas dataframe

df2 = pd.DataFrame.from_dict(data2)

# Features of the dataset
df2.columns

# Features are slightly different than 


Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')

In [414]:
# df2.head()

In [415]:
# Some products have multiple descriptions containing information of for instance the author.
# Perhaps only the first description string should be analyzed for our purpose? This seems to be primarily for the product.
# For now i have just merged all descriptions into a single string for analysis.

#df2.description = df2.description.apply(lambda x: "".join(x))

# Drop rows with no description (empty list)
df2 = df2[df2['description'].map(lambda d: len(d)) > 0]



# df2.description


In [416]:

df2.description = df2.description.apply(lambda x: [string for string in x if string != ""])


In [417]:
# for i in range(3):

#     print(df2.iloc[i].description[0])

In [418]:
df2.description = df2.description.apply(lambda x: " ".join(x))


In [419]:
df2.loc[1:4]
pd.set_option('display.max_rows', 20)

In [420]:
# df2.loc[1:4]

In [421]:
# A lot of the descriptions (and other features) contain HTML.
# The function parses and "translates" into plain text descriptions more suitable for analysis

def strip_html(s):
    stop_words = set(stopwords.words('english'))
    
    if not s or s.isspace(): 
        return ''
    try:
        # remove html tags 
        strr = str(html.fromstring(s).text_content())
        # remove html hidden carachters 
        strr = strr.replace('\n', ' ').replace('\t', ' ').replace("&nbsp", ' ').replace('\r', ' ')
        # remove punctuation
        strr = re.sub(r'[^\w\s]', ' ', strr)
        # remove numbers
        strr = re.sub(r'\d+', '', strr)
        # lowercase
        strr = strr.lower()
        # remove stop words
        tokens = nltk.word_tokenize(strr)
        strr = [i for i in tokens if not i in stop_words]
        strr = ' '.join(strr)
        return strr
        # return str(html.fromstring(s).text_content(s))
    except etree.ParserError: # I am not able to find out why the error occur so i continued by catching the exception. Seem to happen on some empty description strings 
        return ''

f = open("descriptionHTMLbefore.txt", "w")
for i in range(5000):
    # print(df_asin_description.iloc[i].description)
    # print(type(df_asin_description.iloc[i].description))
    
    f.write(df2.iloc[i].description)
    # print("similarity: ", df_asin_description.iloc[i].similarity)
    # print("asin: ", df_asin_description.iloc[i].asin)
    # print("")
f.close()

# download stopwords
# nltk.download('stopwords')


print(df2.description.iloc[0:2])
df2.description = df2.description.apply(lambda x: strip_html(x))
print(df2.description.iloc[0:2])

f = open("descriptionHTMLafter.txt", "w")
for i in range(5000):
    # print(df_asin_description.iloc[i].description)
    # print(type(df_asin_description.iloc[i].description))
    
    f.write(df2.iloc[i].description)
    # print("similarity: ", df_asin_description.iloc[i].similarity)
    # print("asin: ", df_asin_description.iloc[i].asin)
    # print("")
f.close()





1    <b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids a...
2    <b>Connect is the only integrated learning system that empowers students by continuously adapting to deliver precisely what they need, when they need it, how they need it, so that your class time is more engaging and effective.</b><br /> Kelly Cowan just celebrated her 20th anniversary at Miami ...
Name: description, dtype: object
1    latin rhythms get kids singing spanish sing watch learn spanish helps kids ages four eight take giant step learning spanish combining two time honored methods kids always used develop language skills imitating kids singing along simple melodies charming dvd contains music videos featuring kids e...
2    connect integrated learning system empowers stu

In [422]:
# stop_words = set(stopwords.words('english'))
# print(stop_words)

In [423]:
# len(np.where(df2.asin.duplicated())[0])

### Does any product contain different descriptions?  

In [424]:
# Counting occurence of unique "asin"
asin_count = df2['asin'].value_counts()
# print(asin_count)
asin_more_than_once = asin_count[asin_count > 1].index
# print(asin_more_than_once)
# Step 2: Filter df2 to keep rows where 'asin' is in asin_more_than_once
filtered_df = df2[df2['asin'].isin(asin_more_than_once)]
filtered_df = filtered_df[["asin","description"]].sort_values(by="asin")


In [425]:
# Visual confirmation of duplicates 
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set computer game shares game features world famous set game computer version offers ten players opportunity play together one computer solitaire computer object game identify sets three cards card unique four features number symbol diamond squiggle oval shading solid striped open color red gree...
6028,B00000IV94,set computer game shares game features world famous set game computer version offers ten players opportunity play together one computer solitaire computer object game identify sets three cards card unique four features number symbol diamond squiggle oval shading solid striped open color red gree...
878,B00000IWIZ,star wars episode gungan frontier first ever star wars ecological simulation game designed players older taking place episode time period players must transform barren moon naboo thriving balanced ecosystem help gungans build new underwater bubble city mantaris amphibious transport player contro...
6029,B00000IWIZ,star wars episode gungan frontier first ever star wars ecological simulation game designed players older taking place episode time period players must transform barren moon naboo thriving balanced ecosystem help gungans build new underwater bubble city mantaris amphibious transport player contro...
879,B00000J0GM,magellan gps datasend cd rom data management software transfers points interest data gps point use datasend cd rom magellan gps gps global positioning system receiver find north american points interest use product first find information need datasend cd using pc laptop cd rom drive among points...
...,...,...
6025,B0005MYHMU,discover history klondike gold rush july united states throes depression people beginning lose hope north came astonishing news gold rush tens thousands people caught gold fever set remote yukon area northwest canada join rushthe yukon trail begins wharf seattle washington jumping point thousand...
6027,B0005MYI4W,makers oregon trail series storybook weaver deluxe packed thousands story starting images stimulate creativity write english spanish hours creative possibilities text speech feature lets hear story read aloud educational benefits improve writing storytelling skills illustrate stories hundreds im...
11178,B0005MYI4W,makers oregon trail series storybook weaver deluxe packed thousands story starting images stimulate creativity write english spanish hours creative possibilities text speech feature lets hear story read aloud educational benefits improve writing storytelling skills illustrate stories hundreds im...
6026,B0005MYJ0A,cd software teaches geography africa engaging enjoyable computer learning environment


In [426]:
# If "asin" and "description" match -> drop
filtered_df.drop_duplicates(inplace=True)

# How many unique "asin" ?
len(filtered_df.asin.unique())
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set computer game shares game features world famous set game computer version offers ten players opportunity play together one computer solitaire computer object game identify sets three cards card unique four features number symbol diamond squiggle oval shading solid striped open color red gree...
878,B00000IWIZ,star wars episode gungan frontier first ever star wars ecological simulation game designed players older taking place episode time period players must transform barren moon naboo thriving balanced ecosystem help gungans build new underwater bubble city mantaris amphibious transport player contro...
879,B00000J0GM,magellan gps datasend cd rom data management software transfers points interest data gps point use datasend cd rom magellan gps gps global positioning system receiver find north american points interest use product first find information need datasend cd using pc laptop cd rom drive among points...
881,B00000JIXG,academic version microsoft office microsoft office established position efficient suite applications document creation communication business information analysis many functions business platform evolved paper web microsoft office extends desktop productivity web streamlining way work making eas...
880,B00000JIXM,professional upgrade includes microsoft word word processor microsoft excel spreadsheets microsoft publisher desktop publishing microsoft access database management microsoft powerpoint presentation graphics many features microsoft office established position efficient suite applications documen...
...,...,...
6024,B0004N37GW,protect child inappropriate emailproduct informationthe software never young plug tothe wired world e communication kidmail safety perfect way letchildren young years old join online conversation animated desktop themes featuring everything dinosaurs rocket shipskidmail safety allows youngsters ...
11174,B0004O05WK,rollercoaster tycoon time twister expansion pack
11176,B0005MYHMU,discover history klondike gold rush july united states throes depression people beginning lose hope north came astonishing news gold rush tens thousands people caught gold fever set remote yukon area northwest canada join rushthe yukon trail begins wharf seattle washington jumping point thousand...
6027,B0005MYI4W,makers oregon trail series storybook weaver deluxe packed thousands story starting images stimulate creativity write english spanish hours creative possibilities text speech feature lets hear story read aloud educational benefits improve writing storytelling skills illustrate stories hundreds im...


Removing the duplicates

In [427]:
df_asin_description = df2[["asin","description"]].copy()
df_asin_description.drop_duplicates(inplace=True)
# print(len(df_asin_description))
df_asin_description

Unnamed: 0,asin,description
1,0071480935,latin rhythms get kids singing spanish sing watch learn spanish helps kids ages four eight take giant step learning spanish combining two time honored methods kids always used develop language skills imitating kids singing along simple melodies charming dvd contains music videos featuring kids e...
2,007329506X,connect integrated learning system empowers students continuously adapting deliver precisely need need need class time engaging effective kelly cowan celebrated th anniversary miami university middletown open admissions campus ohio received ph university louisville later worked university maryla...
4,0073525758,anatomy physiology revealed cat ultimate online interactive cat dissection experience state art program uses cat photos combined layering technique allows student peel away layers cat reveal structures beneath surface anatomy physiology revealed cat also offers animations histologic radiologic i...
5,0077340701,john coburn grew hawaiian islands seventh sixteen children received associate arts degree windward community college graduated honors received bachelors degree education university hawaii lured business world five years returned first love accepting teaching position high school mathematics reco...
6,0077369823,connect integrated learning system empowers students continuously adapting deliver precisely need need need class time engaging effective kelly cowan celebrated th anniversary miami university middletown open admissions campus ohio received ph university louisville later worked university maryla...
...,...,...
26785,B01HEFZJC2,features beautiful fabulously detailed real silver necklace truly elegant breathtaking design looks gorgeous amazing white gold plated design durable long lasting easy match suitable style clothes great detail good personal jewelry collection perfect occasions anniversary engagement party meetin...
26786,B01HEFZKEE,features beautiful fabulously detailed real silver necklace truly elegant breathtaking design looks gorgeous amazing white gold plated design durable long lasting easy match suitable style clothes great detail good personal jewelry collection perfect occasions anniversary engagement party meetin...
26787,B01HF3G4BS,mac internet security x contains two best selling security products protect mac malware network attacks intego virusbarrier x intego netbarrier x together ensure mac protected adware malware strangers unknown applications trying get designed specifically mac provide around clock protection detec...
26788,B01HF41TKI,versacheck x quickbooks dna secure compliant us canadian bank requirements compatible versacheck versaink versatoner


Creating shingles

In [428]:
def shingle(aString, q, delimiter=' '):
    """
    Input:
        - aString (str): string to split into shingles
        - q (int)
        - delimiter (str): string of the delimiter to consider to split the input string (default: space)
    Return: list of unique shingles
    """
    all_shingles = []
    if delimiter != '':
        words_list = aString.split(delimiter)
    else:
        words_list = aString
    for i in range (len(words_list)-q+1):
        all_shingles.append(delimiter.join(words_list[i:i+q]))
    return list(set(all_shingles))

In [429]:
# df_asin_description
# Apply shingles to the df_asin_description
df_asin_description["shingles"] = df_asin_description["description"].apply(lambda x: shingle(x, 2))
# df_asin_description

Computing similarity with Jaqquard approach

In [430]:
def similarity(intersection_set, union_set):
    return len(intersection_set)/len(union_set)


In [431]:
input = "Latin Rhythms for Kids is an engaging educational resource designed to facilitate Spanish language learning for children aged four to eight. The program utilizes a combination of two proven methods for language development: imitation and singing. The DVD comprises 16 music videos featuring children participating in enjoyable activities, such as zoo visits and exploring grandma's closet. Each video showcases original songs with authentic Latin rhythms, encouraging kids to sing along. The lyrics, consisting of 300 Spanish words, are displayed on screen as subtitles, providing a visual aid for language absorption. The DVD aims to make language learning entertaining and effective, leveraging the natural affinity children have for singing and imitation. The content is designed to capture children's attention, fostering a fun and immersive environment for language acquisition. Additionally, the program incorporates Connect, an integrated learning system that adapts to students' needs, making the learning process engaging and efficient.Latin Rhythms for Kids is a valuable tool for parents and educators seeking an interactive and enjoyable approach to introducing Spanish language skills to young children. Through a combination of music, visuals, and adaptability, the program aims to create a positive language learning experience."
input_user = shingle(input, 2)  
# print(input_user)
# print(df_asin_description.shingles.iloc[0])
intersection_jaqquard = set(input_user).intersection(set(df_asin_description.shingles.iloc[0]))
union_jaqquard = set(input_user).union(set(df_asin_description.shingles.iloc[0]))
# print(intersection_jaqquard)
# print(union_jaqquard)
sim = similarity(intersection_jaqquard, union_jaqquard)
print(sim)




0.012048192771084338


In [432]:
# df_asin_description
df_asin_description["similarity"] = df_asin_description["shingles"].apply(lambda x: similarity(set(input_user).intersection(set(x)), set(input_user).union(set(x))))
df_asin_description


Unnamed: 0,asin,description,shingles,similarity
1,0071480935,latin rhythms get kids singing spanish sing watch learn spanish helps kids ages four eight take giant step learning spanish combining two time honored methods kids always used develop language skills imitating kids singing along simple melodies charming dvd contains music videos featuring kids e...,"[along simple, kids absorb, watch learn, always used, latin rhythms, learn spanish, activities visiting, sizes grandmas, comparing clothing, animals zoo, original song, get kids, along children, spanish sing, two time, kids always, take giant, imitating kids, visiting animals, music videos, watc...",0.012048
2,007329506X,connect integrated learning system empowers students continuously adapting deliver precisely need need need class time engaging effective kelly cowan celebrated th anniversary miami university middletown open admissions campus ohio received ph university louisville later worked university maryla...,"[students continuously, university louisville, engaging effective, integrated learning, hygiene made, allied health, university maryland, love microbiology, dental hygiene, situations exclaim, received ph, open admissions, health students, everyday situations, university groningen, need need, en...",0.008197
4,0073525758,anatomy physiology revealed cat ultimate online interactive cat dissection experience state art program uses cat photos combined layering technique allows student peel away layers cat reveal structures beneath surface anatomy physiology revealed cat also offers animations histologic radiologic i...,"[chair department, physiology human, revealed cat, also offers, science grayson, program uses, north texas, cat available, cat reveal, combined mcgraw, quizzing used, uses cat, pronunciations comprehensive, surface anatomy, ultimate online, jackie butler, cat also, experience state, two semester...",0.000000
5,0077340701,john coburn grew hawaiian islands seventh sixteen children received associate arts degree windward community college graduated honors received bachelors degree education university hawaii lured business world five years returned first love accepting teaching position high school mathematics reco...,"[arts degree, community college, graduated honors, megsl made, made numerous, teacher year, years later, world five, variety topics, hawaii lured, position high, nominations whos, oklahoma last, college graduated, mathematics educators, professor tenure, hawaiian islands, coburn grew, beautiful ...",0.000000
6,0077369823,connect integrated learning system empowers students continuously adapting deliver precisely need need need class time engaging effective kelly cowan celebrated th anniversary miami university middletown open admissions campus ohio received ph university louisville later worked university maryla...,"[students continuously, university louisville, engaging effective, integrated learning, hygiene made, allied health, university maryland, love microbiology, dental hygiene, situations exclaim, received ph, open admissions, health students, everyday situations, university groningen, need need, en...",0.008197
...,...,...,...,...
26785,B01HEFZJC2,features beautiful fabulously detailed real silver necklace truly elegant breathtaking design looks gorgeous amazing white gold plated design durable long lasting easy match suitable style clothes great detail good personal jewelry collection perfect occasions anniversary engagement party meetin...,"[long lasting, warm tips, cloth clean, party meeting, friend warm, fabulously detailed, real silver, lasting easy, silver necklace, wearing sweating, suitable style, sweating lot, corrosive substances, clothes great, gift girl, substances please, dating wedding, amazing white, avoid collision, a...",0.000000
26786,B01HEFZKEE,features beautiful fabulously detailed real silver necklace truly elegant breathtaking design looks gorgeous amazing white gold plated design durable long lasting easy match suitable style clothes great detail good personal jewelry collection perfect occasions anniversary engagement party meetin...,"[long lasting, warm tips, cloth clean, party meeting, friend warm, fabulously detailed, real silver, lasting easy, silver necklace, wearing sweating, suitable style, sweating lot, corrosive substances, clothes great, gift girl, substances please, dating wedding, amazing white, avoid collision, a...",0.000000
26787,B01HF3G4BS,mac internet security x contains two best selling security products protect mac malware network attacks intego virusbarrier x intego netbarrier x together ensure mac protected adware malware strangers unknown applications trying get designed specifically mac provide around clock protection detec...,"[two best, x macintosh, requirements processor, provide around, duoram none, make sure, mac os, get designed, yosemite mac, contains two, gbvideo card, specifiedsupported os, sure mac, os sierra, mac provide, mac protected, mac mavericks, given best, ensure mac, requirements mac, security x, os ...",0.000000
26788,B01HF41TKI,versacheck x quickbooks dna secure compliant us canadian bank requirements compatible versacheck versaink versatoner,"[secure compliant, versacheck versaink, dna secure, versacheck x, us canadian, compliant us, bank requirements, quickbooks dna, compatible versacheck, x quickbooks, requirements compatible, versaink versatoner, canadian bank]",0.000000


Dataframe sorted by similarity

In [433]:

df_asin_description.sort_values(by="similarity", ascending=False, inplace=True)
df_asin_description

Unnamed: 0,asin,description,shingles,similarity
1,0071480935,latin rhythms get kids singing spanish sing watch learn spanish helps kids ages four eight take giant step learning spanish combining two time honored methods kids always used develop language skills imitating kids singing along simple melodies charming dvd contains music videos featuring kids e...,"[along simple, kids absorb, watch learn, always used, latin rhythms, learn spanish, activities visiting, sizes grandmas, comparing clothing, animals zoo, original song, get kids, along children, spanish sing, two time, kids always, take giant, imitating kids, visiting animals, music videos, watc...",0.012048
14107,B000IVFTIE,meetings mito holiday hakone kobe kyoto join conversation instant immersion japanese dynamic audio language learning system compact discs,"[hakone kobe, kobe kyoto, learning system, conversation instant, join conversation, language learning, kyoto join, instant immersion, dynamic audio, mito holiday, compact discs, immersion japanese, meetings mito, audio language, holiday hakone, system compact, japanese dynamic]",0.010471
4623,B0000AZVVX,award winning learning system improved visual interactive tool endorsed professional home classroom use four cd american sign language learning system includes,"[award winning, system includes, endorsed professional, professional home, visual interactive, language learning, tool endorsed, use four, learning system, improved visual, winning learning, american sign, cd american, classroom use, system improved, four cd, home classroom, sign language, inter...",0.010363
22,0077510585,connect integrated learning system empowers students continuously adapting deliver precisely need need need class time engaging effective linda sherwood member department microbiology montana state university,"[students continuously, member department, department microbiology, microbiology montana, effective linda, engaging effective, integrated learning, need need, empowers students, deliver precisely, time engaging, system empowers, connect integrated, precisely need, montana state, learning system,...",0.010152
15873,B0011DO1KU,learn language minutes day communicate ease enjoy traveling fun use dynamic language learning system interactive games user friendly skill building exercises bring speed allowing join conversation,"[fun use, user friendly, language learning, bring speed, friendly skill, exercises bring, enjoy traveling, games user, allowing join, ease enjoy, building exercises, communicate ease, speed allowing, learning system, learn language, system interactive, dynamic language, day communicate, skill bu...",0.010050
...,...,...,...,...
12780,B000BP77LE,master spelling skills st nd rdgradesproduct informationnab book thieves join blasternaut galactic commander andspot exciting adventure trio vacationing planet islandiawhen suddenly plans interrupted mumblers planet nonsense aretaking books planet treasury help blasterpals explore theislands nav...,"[space x, ii mhz, help blasterpals, x mb, mb os, higher mb, planet treasury, treasury help, words complete, unscramble letters, cd rom, make words, mac os, windows compatible, free hard, spelling skills, mumbler taken, use phonics, compatible sound, windows xp, rom drive, word editor, hard disk,...",0.000000
12781,B000BP8ZHE,world order great todd rundgren cd choose may make alterations changes music,"[great todd, todd rundgren, alterations changes, may make, order great, choose may, rundgren cd, cd choose, make alterations, world order, changes music]",0.000000
12782,B000BPAAS6,mastering quickbooks premier contractor edition,"[contractor edition, mastering quickbooks, premier contractor, quickbooks premier]",0.000000
12783,B000BP8ALK,punch home design architectural series combines top landscape design interior design software one powerful package punch home design architectural series combines power master landscape pro version strength interior design suite version bring versatile home landscape design software ever one sin...,"[drop functions, show creation, designs snap, design software, controls deliver, built home, ever one, precision interior, combines top, range powerful, fingertips beginners, texture color, program beginner, drag drop, power master, design interior, limited imagination, multiple shadows, top not...",0.000000


In [434]:
# df_asin_description
f = open("demofile2.txt", "a")
for i in range(50):
    # print(df_asin_description.iloc[i].description)
    # print(type(df_asin_description.iloc[i].description))
    
    f.write(df_asin_description.iloc[i].description)
    # print("similarity: ", df_asin_description.iloc[i].similarity)
    # print("asin: ", df_asin_description.iloc[i].asin)
    # print("")
f.close()
print(df_asin_description.similarity.iloc[0:5000])

1        0.012048
14107    0.010471
4623     0.010363
22       0.010152
15873    0.010050
           ...   
18462    0.000000
18461    0.000000
18459    0.000000
18487    0.000000
18458    0.000000
Name: similarity, Length: 5000, dtype: float64


In [435]:
widgets.Text(value='', disabled=False)

Text(value='')

In [436]:
# Merge description to reviews data using 'asin'

# merged_df = df.merge(df2[['asin', 'description']], on='asin', how='left')