# Similar Items System
A program that reads the dataset, preprocesses the data, and outputs the most similar items based on a user's description of a product.

In [45]:
import json
from collections import defaultdict
import gzip
import pandas as pd
from lxml import html,etree
import numpy as np
import ipywidgets as widgets
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import os

# Fetch the NLTK stop-word lists; preprocess_data reads the English list from them.
nltk.download('stopwords')

# Fetch the Punkt tokenizer models needed by nltk.word_tokenize.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [46]:
# Read the gzipped metadata file: every line holds one JSON document,
# which is parsed and collected into the `data` list.
with gzip.open('Dataset/meta_Software.json.gz') as metadata_file:
    data = [json.loads(raw_line.strip()) for raw_line in metadata_file]

# with gzip.open('Dataset/meta_CDs_and_Vinyl.json.gz') as f:
#     for l in f:
#         data.append(json.loads(l.strip()))

print("Total number of items in the dataset: ", len(data))

Total number of items in the dataset:  26790


In [47]:
# Display options: show up to 300 characters per cell and at most 20 rows.
pd.set_option('display.max_rows', 20 )
pd.set_option('display.max_colwidth', 300)

# Turn the list of parsed metadata records into a pandas dataframe.
df2 = pd.DataFrame.from_dict(data)

# List the columns available in the dataset.
print("Columns of the dataset: ", df2.columns)

# show dataframe with columns and rows
# df.head()
# df2.info()


Columns of the dataset:  Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')


### Preprocessing the data

- Remove empty descriptions
- Remove HTML tags
- Remove URLs
- Remove HTML hidden characters
- Remove punctuation
- Remove numbers
- Transform every word into lowercase
- Remove stop words
- Perform stemming 

In [48]:
# Keep only the rows whose description list contains at least one entry.
has_description = df2['description'].map(len).gt(0)
df2 = df2[has_description]
df2.description
# df2.head()


1        [, <b>Latin rhythms that will get your kids singing in Spanish</b>, <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other ki...
2        [<b>Connect is the only integrated learning system that empowers students by continuously adapting to deliver precisely what they need, when they need it, how they need it, so that your class time is more engaging and effective.</b><br />, Kelly Cowan just celebrated her 20th anniversary at Miam...
4        [<i>Anatomy &amp; Physiology Revealed Cat</i> is the ultimate online interactive cat dissection experience. This state-of-the-art program uses cat photos combined with a layering technique that allows the student to peel away layers of the cat to reveal structures beneath the surface. <i>Anatomy...
5        [John Coburn grew up in the Hawaiian Islands, the seventh of six

In [49]:
# Every description is a list of strings: discard the empty strings and
# merge the remaining pieces into one space-separated string.
df2.description = df2.description.apply(
    lambda parts: " ".join(part for part in parts if part != "")
)
df2.iloc[0].description


'<b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids and singing along with simple melodies. This charming DVD contains 16 music videos featuring kids engaged in fun activities, from visiting animals at the zoo to comparing clothing sizes in grandmas closet. Each video features an original song of authentic Latin rhythms that gets kids singing along with the children on screen. As they watch, listen, and sing along, kids absorb 300 Spanish words, each of which is shouted out in a song and displayed as a subtitle on screen.'

In [50]:
# A lot of the descriptions (and other features) contain HTML.
# The function parses and "translates" into plain text descriptions more suitable for analysis.

def _text_tools():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on first use.

    Neither the English stop-word set nor the Porter stemmer depends on the text
    being cleaned, so they are created once and cached on this function instead
    of being rebuilt for every description.
    """
    tools = getattr(_text_tools, "_cached", None)
    if tools is None:
        tools = (frozenset(stopwords.words('english')), PorterStemmer())
        _text_tools._cached = tools
    return tools


def preprocess_data(s):
    """Turn a raw (possibly HTML) description into a normalised string of stems.

    Steps, in order: strip HTML markup, remove URLs, replace newlines/tabs/
    carriage returns and literal "&nbsp" with spaces, replace punctuation with
    spaces, delete digits, lowercase, tokenize, drop English stop words and
    apply Porter stemming.

    Parameters
    ----------
    s : str or None
        Raw description text.

    Returns
    -------
    str
        The surviving stems joined by single spaces. ``''`` is returned when
        ``s`` is empty, ``None`` or whitespace only, or when lxml raises
        ``etree.ParserError`` while parsing it.
    """
    # Cheap guard first: nothing to clean, so no NLTK resource is needed.
    if not s or s.isspace():
        return ''

    stop_words, stemmer = _text_tools()

    # Only the HTML parse can raise ParserError, so only it sits in the try.
    try:
        # remove html tags
        # NOTE(review): text_content() concatenates text nodes without adding
        # whitespace, so words separated only by a tag (e.g. "a<br />b") come
        # out joined — confirm whether that is intended.
        text = str(html.fromstring(s).text_content())
    except etree.ParserError:
        return ''

    # remove URLs
    text = re.sub(r"(https|http|href)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b", ' ', text)
    # remove html hidden characters
    text = text.replace('\n', ' ').replace('\t', ' ').replace("&nbsp", ' ').replace('\r', ' ')
    # remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # remove numbers (deleted, not replaced by a space: "20th" becomes "th")
    text = re.sub(r'\d+', '', text)
    # lowercase
    text = text.lower()
    # tokenize, drop stop words, then stem what is left
    tokens = nltk.word_tokenize(text)
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

# f = open("descriptionHTMLbefore.txt", "w")
# for i in range(5000):
#     f.write(df2.iloc[i].description)
# f.close()

# Show two descriptions, clean every description, then show the same two again.
print("Example of description before preprocessing: ")
print(df2["description"].iloc[:2])

df2["description"] = df2["description"].apply(preprocess_data)

print()
print("Example of description after preprocessing: ")
print(df2["description"].iloc[:2])

# f = open("descriptionHTMLafter.txt", "w")
# for i in range(5000):
#     f.write(df2.iloc[i].description)
# f.close()





Example of description before preprocessing: 
1    <b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids a...
2    <b>Connect is the only integrated learning system that empowers students by continuously adapting to deliver precisely what they need, when they need it, how they need it, so that your class time is more engaging and effective.</b><br /> Kelly Cowan just celebrated her 20th anniversary at Miami ...
Name: description, dtype: object

Example of description after preprocessing: 
1    latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fu

In [51]:
# input = "yewfvn8934fwejnchlvu8h34;vjcek.bi;/kpofu90[q89y7o2g4uioeprwhttps://dsjkvby8ft7ogy3jkn2puvg4nchlvu8h34;vjcek.bi;/kpofsd"
# print(preprocess_data(input))

In [52]:
# # stop_words = set(stopwords.words('english'))
# # print(stop_words)
# input = "she likes to go to take coffee every morning while coming back home"
# stemmer= PorterStemmer()
# print(stemmer.stem(input))
# print(stemmer.stem("every"))

# tokens = nltk.word_tokenize(input)
# # strr = [i for i in tokens if not i in stop_words]
# strr = [stemmer.stem(word) for word in tokens]  
# print(strr)

### Does any product contain different descriptions?  
Some products are not unique: their asin and description appear more than once.
We process the data so that each product appears only once.

In [53]:
# Number of rows carrying each "asin".
asin_count = df2['asin'].value_counts()
# "asin" values that occur on more than one row.
asin_more_than_once = asin_count.loc[asin_count.gt(1)].index
# Rows of df2 whose "asin" is duplicated, reduced to the two columns of
# interest and ordered by "asin" so that duplicates sit next to each other.
filtered_df = (
    df2.loc[df2['asin'].isin(asin_more_than_once), ["asin", "description"]]
    .sort_values(by="asin")
)
# Visual confirmation of duplicates
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
6028,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
878,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
6029,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
879,B00000J0GM,magellan gp datasend cd rom data manag softwar transfer point interest data gp point use datasend cd rom magellan gp gp global posit system receiv find north american point interest use product first find inform need datasend cd use pc laptop cd rom drive among point interest avail cd museum amu...
...,...,...
6025,B0005MYHMU,discov histori klondik gold rush juli unit state throe depress peopl begin lose hope north came astonish news gold rush ten thousand peopl caught gold fever set remot yukon area northwest canada join rushth yukon trail begin wharf seattl washington jump point thousand stamped your smart persist ...
6027,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...
11178,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...
6026,B0005MYJ0A,cd softwar teach geographi africa engag enjoy comput learn environ


In [54]:
# Collapse rows that are exact duplicates, i.e. same "asin" AND same "description".
# Reassigning (instead of inplace=True) keeps the cell easy to chain and re-run.
filtered_df = filtered_df.drop_duplicates()

# How many unique "asin" remain? A bare expression is displayed only when it is
# the last statement of a cell, so the count is printed explicitly.
print("Unique asin among the duplicated products:", len(filtered_df.asin.unique()))
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
878,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
879,B00000J0GM,magellan gp datasend cd rom data manag softwar transfer point interest data gp point use datasend cd rom magellan gp gp global posit system receiv find north american point interest use product first find inform need datasend cd use pc laptop cd rom drive among point interest avail cd museum amu...
881,B00000JIXG,academ version microsoft offic microsoft offic establish posit effici suit applic document creation commun busi inform analysi mani function busi platform evolv paper web microsoft offic extend desktop product web streamlin way work make easier share access analyz inform get better result offic ...
880,B00000JIXM,profession upgrad includ microsoft word word processor microsoft excel spreadsheet microsoft publish desktop publish microsoft access databas manag microsoft powerpoint present graphic mani featur microsoft offic establish posit effici suit applic document creation commun busi inform analysi man...
...,...,...
6024,B0004N37GW,protect child inappropri emailproduct informationth softwar never young plug toth wire world e commun kidmail safeti perfect way letchildren young year old join onlin convers anim desktop theme featur everyth dinosaur rocket shipskidmail safeti allow youngster dress email theme iconsbackground i...
11174,B0004O05WK,rollercoast tycoon time twister expans pack
11176,B0005MYHMU,discov histori klondik gold rush juli unit state throe depress peopl begin lose hope north came astonish news gold rush ten thousand peopl caught gold fever set remot yukon area northwest canada join rushth yukon trail begin wharf seattl washington jump point thousand stamped your smart persist ...
6027,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...


Removing the duplicate products (rows sharing the same description) -> now each product is unique

In [55]:
# One row per distinct description: keep only "asin" and "description" and,
# among rows sharing a description, retain the first occurrence.
df_asin_description = (
    df2[["asin", "description"]]
    .drop_duplicates(subset="description")
    .copy()
)
# print(len(df_asin_description))
df_asin_description

Unnamed: 0,asin,description
1,0071480935,latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...
2,007329506X,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...
4,0073525758,anatomi physiolog reveal cat ultim onlin interact cat dissect experi state art program use cat photo combin layer techniqu allow student peel away layer cat reveal structur beneath surfac anatomi physiolog reveal cat also offer anim histolog radiolog imag audio pronunci comprehens quizz use part...
5,0077340701,john coburn grew hawaiian island seventh sixteen children receiv associ art degre windward commun colleg graduat honor receiv bachelor degre educ univers hawaii lure busi world five year return first love accept teach posit high school mathemat recogn teacher year soon afterward decis made seek ...
7,0077410297,live art approach art appreci support student acquisit essenti skill cours mark getlein vivid narr concert mcgraw hill power adapt learn program learnsmart connect art help student understand analyz appreci way art work commun us visual world live art provid foundat life long appreci art critic ...
...,...,...
26784,B01HD1CQPK,note pleas compar detail size buy use similar cloth compar size size detail size bust cm shoulder cm sleev cm length cm size bust cm shoulder cm sleev cm length cm size l bust cm shoulder cm sleev cm length cm size xl bust cm shoulder cm sleev cm length cm
26785,B01HEFZJC2,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...
26787,B01HF3G4BS,mac internet secur x contain two best sell secur product protect mac malwar network attack intego virusbarri x intego netbarri x togeth ensur mac protect adwar malwar stranger unknown applic tri get design specif mac provid around clock protect detect divers array threat make sure mac given best...
26788,B01HF41TKI,versacheck x quickbook dna secur compliant us canadian bank requir compat versacheck versaink versaton


### Creating shingles

In [56]:
# Given a string input, return the list of shingles
def shingle(s, q, delimiter=' '):
    """Return the distinct q-shingles of ``s`` as a list (order not guaranteed).

    With a non-empty ``delimiter`` the input is split on it and each shingle is
    ``q`` consecutive pieces re-joined with that delimiter. With ``delimiter=''``
    the input is used as-is, so each shingle is ``q`` consecutive items of ``s``
    (characters when ``s`` is a string). An input with fewer than ``q`` pieces
    yields an empty list.
    """
    tokens = s.split(delimiter) if delimiter != '' else s
    window_starts = range(len(tokens) - q + 1)
    unique_shingles = {
        delimiter.join(tokens[start:start + q]) for start in window_starts
    }
    return list(unique_shingles)

Apply shingling to our dataframe

In [57]:
# Apply shingles to the df_asin_description
df_asin_description["shingles"] = df_asin_description["description"].apply(lambda x: shingle(x, 2))
# df_asin_description

### Similarity of sets
Computing Jaccard similarity

In [58]:
def similarity(intersection_set, union_set):
    """Return the Jaccard similarity ``len(intersection_set) / len(union_set)``.

    Parameters
    ----------
    intersection_set : collection
        Elements common to the two shingle sets being compared.
    union_set : collection
        Elements present in at least one of the two shingle sets.

    Returns
    -------
    float
        The ratio of the two sizes, or ``0.0`` when ``union_set`` is empty
        (both inputs had no shingles), instead of raising ZeroDivisionError.
    """
    if len(union_set) == 0:
        return 0.0
    return len(intersection_set) / len(union_set)

In [59]:
# input = "In the dynamic landscape of higher education, universities are continually redefining the traditional boundaries of learning. The integration of arts, music, and literature has become a cornerstone in fostering a holistic educational experience. At the heart of this transformation is the commitment to connect students with a diverse range of disciplines, preparing them not only for academic success but also for a life enriched by creativity and cultural understanding. In this context, universities such as New School are pioneering integrated learning models that transcend conventional subject silos. Their innovative approach, backed by cutting-edge teaching methodologies, empowers students to explore the intersections of arts, music, and literature. The vision goes beyond a mere confluence of disciplines; it seeks to create an immersive educational environment where students can seamlessly weave their academic pursuits into the fabric of their daily lives. One key player in this educational evolution is McGraw, a renowned arts author whose work has become a guiding light for both educators and students alike. McGraw's contributions extend beyond the conventional boundaries of a university classroom, resonating with a global audience. His writings not only inspire a love for the arts but also emphasize the transformative power of integrated learning in shaping well-rounded individuals. The concept of an integrated learning environment transcends the boundaries of time and space. It is not confined to the four walls of a classroom; rather, it permeates every facet of a student's journey. In this dynamic world, students are no longer passive recipients of knowledge but active participants in a vibrant community of learners. The university becomes a nexus where diverse ideas converge, fostering a collaborative spirit that extends far beyond graduation. In this interconnected world, the New School's commitment to integrated learning is a beacon of innovation. 
Students are not just acquiring knowledge; they are forging connections between seemingly disparate fields, discovering the harmonies between arts and sciences, and navigating the rhythms of a multicultural world. This transformative journey prepares them to navigate the complexities of the modern world with a deep appreciation for diversity and a keen sense of intellectual curiosity. As we stand at the intersection of arts, music, and literature, the integrated learning paradigm championed by universities like New School, guided by visionary authors such as McGraw, is shaping the future of education. It is a testament to the idea that learning is not a compartmentalized experience but a symphony of knowledge, where every note, every discipline, plays a crucial role in the harmonious melody of life."

# Read the user's free-text product description. The context manager closes
# the file handle, which a bare open() call would leave open.
with open("input.txt", "r") as file_input:
    # NOTE(review): `input` shadows the builtin of the same name; the name is
    # kept because other cells may read it.
    input = file_input.read()
# print(input)
# Clean the text with the same pipeline as the catalogue descriptions,
# then cut it into 2-word shingles.
user_description = preprocess_data(input)
user_description = shingle(user_description, 2)
# intersection_set = set(user_description).intersection(set(df_asin_description.shingles.iloc[0]))
# union_set = set(user_description).union(set(df_asin_description.shingles.iloc[0]))
# # perform similarity
# sim = similarity(intersection_set, union_set)
# print(sim)


In [60]:
# df_asin_description
# Build the user's shingle set once, rather than twice per row inside the lambda.
user_shingles = set(user_description)

# Jaccard similarity between the user's shingles and each product's shingles.
df_asin_description["similarity"] = df_asin_description["shingles"].apply(
    lambda item_shingles: similarity(
        user_shingles.intersection(item_shingles),
        user_shingles.union(item_shingles),
    )
)
df_asin_description


Unnamed: 0,asin,description,shingles,similarity
1,0071480935,latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...,"[sing spanish, children screen, time honor, engag fun, activ visit, step learn, absorb spanish, featur kid, song display, anim zoo, age four, giant step, compar cloth, kid absorb, sing watch, honor method, watch learn, cloth size, subtitl screen, learn spanish, alway use, spanish word, use devel...",0.000000
2,007329506X,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...,"[ohio receiv, celebr th, univers louisvil, th anniversari, exclaim love, mission hear, pre nurs, nonmajor especi, need class, special teach, univers maryland, maryland univers, precis need, nurs dental, pursu undergradu, made person, hygienist encount, campu ohio, continu adapt, teach microbiolo...",0.000000
4,0073525758,anatomi physiolog reveal cat ultim onlin interact cat dissect experi state art program use cat photo combin layer techniqu allow student peel away layer cat reveal structur beneath surfac anatomi physiolog reveal cat also offer anim histolog radiolog imag audio pronunci comprehens quizz use part...,"[book jacki, state art, semest undergradu, combin mcgraw, graduat studi, interact cat, univers north, anatomi physiolog, beneath surfac, layer cat, two semest, layer techniqu, photo combin, program use, undergradu anatomi, dissect experi, pronunci comprehens, cat avail, peel away, away layer, co...",0.000000
5,0077340701,john coburn grew hawaiian island seventh sixteen children receiv associ art degre windward commun colleg graduat honor receiv bachelor degre educ univers hawaii lure busi world five year return first love accept teach posit high school mathemat recogn teacher year soon afterward decis made seek ...,"[confer wide, love accept, interest engag, beauti hope, coburn grew, receiv associ, professor tenur, accept teach, present local, fifteen year, teach posit, varieti topic, mathemat floriss, famili music, chapter phi, degre receiv, oklahoma last, made seek, who among, year teach, two nomin, numer...",0.002967
7,0077410297,live art approach art appreci support student acquisit essenti skill cours mark getlein vivid narr concert mcgraw hill power adapt learn program learnsmart connect art help student understand analyz appreci way art work commun us visual world live art provid foundat life long appreci art critic ...,"[skill cours, activ class, appli guid, skill benefit, museum collect, fourth edit, quickli becam, art africa, concert mcgraw, student understand, guid write, led interest, integr digit, far beyond, art histori, reader longman, literatur longman, foundat life, africa abram, chosen career, adapt l...",0.000000
...,...,...,...,...
26784,B01HD1CQPK,note pleas compar detail size buy use similar cloth compar size size detail size bust cm shoulder cm sleev cm length cm size bust cm shoulder cm sleev cm length cm size l bust cm shoulder cm sleev cm length cm size xl bust cm shoulder cm sleev cm length cm,"[cloth compar, size size, buy use, size bust, note pleas, l bust, length cm, sleev cm, size detail, bust cm, compar size, cm size, pleas compar, size l, compar detail, shoulder cm, size xl, similar cloth, cm shoulder, xl bust, size buy, detail size, use similar, cm sleev, cm length]",0.000000
26785,B01HEFZJC2,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...,"[surfac scratch, lot shower, pleas wipe, design look, detail good, person jewelri, substanc pleas, acid alkali, wear sweat, tip pleas, girl lover, cloth clean, style cloth, real silver, sleep etc, collis avoid, last easi, contact acid, collect perfect, parti meet, wed daili, lover friend, etc pl...",0.000000
26787,B01HF3G4BS,mac internet secur x contain two best sell secur product protect mac malwar network attack intego virusbarri x intego netbarri x togeth ensur mac protect adwar malwar stranger unknown applic tri get design specif mac provid around clock protect detect divers array threat make sure mac given best...,"[intego virusbarri, requir mac, os mac, recommend system, intego netbarri, specifiedhard disk, x macintosh, none specifiedhard, possibl secur, around clock, mac maverick, specifiedsupport os, tri get, time mac, x contain, mac given, lion mac, card none, array threat, requir processor, mac malwar...",0.000000
26788,B01HF41TKI,versacheck x quickbook dna secur compliant us canadian bank requir compat versacheck versaink versaton,"[compliant us, requir compat, versacheck x, quickbook dna, bank requir, x quickbook, us canadian, canadian bank, versacheck versaink, compat versacheck, dna secur, versaink versaton, secur compliant]",0.000000


Dataframe sorted by similarity

In [61]:

# Rank the products from the highest to the lowest similarity score.
df_asin_description = df_asin_description.sort_values(by="similarity", ascending=False)
df_asin_description


# if os.path.exists("10RecommendedItems.csv"):
#   os.remove("10RecommendedItems.csv")
# df_asin_description[:11].to_csv('10RecommendedItems.csv', index=False)

Unnamed: 0,asin,description,shingles,similarity
3236,B00005YY5V,design creat cd dvd vh label jewel case insert click n design choos photo clip art imag import digit photo use softwar special effect editor color emboss mosaic even oilifi photo click n design circular text featur make design stand use label wizard choos templat creat design preview print eas c...,"[support jpg, design support, label wizard, cd zip, choos photo, easi use, cd dvd, stand use, also come, code click, diskett label, process built, support vh, circular text, make design, creat design, vh label, easili creat, oilifi photo, design process, insert softwar, wfm file, label size, lab...",0.022364
21151,B00ADVH0ZO,make best visual impact possibl microsoft powerpoint present present easi use add microsoft powerpoint allow user take exist powerpoint present make right within microsoft powerpoint present creat within microsoft powerpoint view dtv enabl projector non display use anaglyph red blue glass ribbon...,"[right insid, adjust coupl, easili integr, easi use, pair imag, anaglyph red, aid user, imag right, add photo, present without, without need, directli within, minut adjust, use add, use anaglyph, set present, present support, powerpoint ribbon, make best, attach stereo, add effect, truli compel,...",0.019231
11633,B0007PIHJ4,easili creat edit publish rss feed podcast new rss feed podcast quickli easili creat feedforal advanc featur enabl creat profession look rss feed podcast quickli effici exist rss feed repair enhanc feedforal feedforal support enclosur tag along rss specif field allow user make complet feed featu...,"[exist feed, edit publish, keep feed, feedforal everyth, properti includ, simpli put, advanc feed, feed date, look day, creat profession, effici exist, complet feed, featur automat, day feed, specif exist, podcast quickli, advanc featur, easili creat, default enabl, give profession, tag along, f...",0.016393
20340,B007FWS7QA,mani offic mountain paper document need store introduc dokme featur rich document manag solut attract price point secur easi use document manag system dokme design varieti purpos includ document captur storag search retriev file share dokme adapt busi model maxim access function repositori size ...,"[file edit, choic save, folder dokme, pc email, folder structur, window dokme, need access, solut design, group restrict, use folder, medium size, user quick, easi use, document manag, directli dokme, introduc dokme, bmp gif, chang multipl, file quick, jpeg file, featur allow, attract price, fie...",0.016204
21636,B00C7AXKZ8,nero newest version world best sell multimedia suit bring digit world pc easi organ manag multimedia file well creat edit new digit content nero sleek design user friendli tool make complet project fun enjoy oem product,"[design user, enjoy oem, new digit, sleek design, fun enjoy, friendli tool, version world, newest version, tool make, best sell, world pc, edit new, complet project, oem product, nero newest, digit content, multimedia suit, pc easi, file well, content nero, nero sleek, sell multimedia, make comp...",0.016000
...,...,...,...,...
13182,B000EHQ008,dantz retrospect win profession upgrad,"[retrospect win, profession upgrad, win profession, dantz retrospect]",0.000000
13183,B000EHPZVI,retrospect backup offer complet protect use small midsiz busi home home offic comput contain person famili financi data ever happen inadvert delet file attack viru comput fail lose famili photo copi tax return recov inform quickli keep home offic run smoothli,"[backup offer, inform quickli, famili financi, famili photo, financi data, home offic, protect use, offer complet, offic run, lose famili, small midsiz, person famili, busi home, attack viru, happen inadvert, viru comput, tax return, complet protect, recov inform, comput contain, inadvert delet,...",0.000000
13184,B000EHP626,internet media record give power captur record video includ video stream anim audio favorit internet sourc internet media record give power captur record video includ video stream anim audio favorit internet sourc featur record internet radio,"[favorit internet, anim audio, audio favorit, includ video, video includ, internet sourc, sourc featur, record give, sourc internet, give power, record video, power captur, video stream, featur record, internet radio, record internet, stream anim, internet media, media record, captur record]",0.000000
13185,B000EHS6IC,popcorn help easili make high qualiti copi dvd movi power fit dvd compress technolog fit entir gb dvd video onto standard gb disc backup full dvd creat director cut choos individu dvd video audio languag copi convert movi watch ipod psp gp mobil phone divx handheld popcorn help easili make high ...,"[format portabl, copi top, univers applic, softwar includ, applic optim, handheld popcorn, gb disc, standard gb, fit onto, back entir, qualiti disc, player enjoy, popular portabl, disc imag, improv award, popcorn quickli, toast burn, optim intel, portabl video, file enjoy, file watch, dvd movi, ...",0.000000


In [62]:
# Print the similarity score of every product, indexed by dataframe row label.
print("Similarity of items")
print(df_asin_description["similarity"])

Similarity of items
3236     0.022364
21151    0.019231
11633    0.016393
20340    0.016204
21636    0.016000
           ...   
13182    0.000000
13183    0.000000
13184    0.000000
13185    0.000000
26789    0.000000
Name: similarity, Length: 17034, dtype: float64
