# Similar Items System
This program reads the dataset, preprocesses the data, and outputs the items most similar to a user-supplied product description.

In [135]:
import json
from collections import defaultdict
import gzip
import pandas as pd
from lxml import html,etree
import numpy as np
import ipywidgets as widgets
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import os

# Download the NLTK stop-word corpus (read later through stopwords.words('english'))
nltk.download('stopwords')

# Download the 'punkt' tokenizer models that nltk.word_tokenize relies on
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead - confirm against the installed version
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [136]:
# Load the product metadata: the gzip archive holds one JSON record per line
with gzip.open('Dataset/meta_Software.json.gz') as f:
    data = [json.loads(raw_line.strip()) for raw_line in f]

# Optional: uncomment to also append the CDs & Vinyl metadata to the same list
# with gzip.open('Dataset/meta_CDs_and_Vinyl.json.gz') as f:
#     for l in f:
#         data.append(json.loads(l.strip()))

print("Total number of items in the dataset: ", len(data))

Total number of items in the dataset:  26790


In [137]:
# Build a pandas DataFrame from the list of parsed JSON records
df2 = pd.DataFrame.from_dict(data)

# Configure how pandas renders frames: maximum text width per column and number of rows shown
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_rows', 20 )

# Print the column names of the dataset
print("Columns of the dataset: ", df2.columns)

# Optional quick looks at the frame (disabled):
# df2.head()
# df2.info()


Columns of the dataset:  Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')


### Preprocessing the data

- Remove empty description
- Remove HTML tag
- Remove URLs
- Remove HTML hidden characters
- Remove punctuation
- Remove numbers
- Transform every word into lowercase
- Remove stop words
- Perform stemming 

In [138]:
# Drop rows with no description (the description list is empty).
# .copy() turns the filtered result into an independent frame, so assigning
# to its columns afterwards does not raise pandas' SettingWithCopyWarning.
df2 = df2[df2['description'].map(len) > 0].copy()
df2.description
# df2.head()


1        [, <b>Latin rhythms that will get your kids singing in Spanish</b>, <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other ki...
2        [<b>Connect is the only integrated learning system that empowers students by continuously adapting to deliver precisely what they need, when they need it, how they need it, so that your class time is more engaging and effective.</b><br />, Kelly Cowan just celebrated her 20th anniversary at Miam...
4        [<i>Anatomy &amp; Physiology Revealed Cat</i> is the ultimate online interactive cat dissection experience. This state-of-the-art program uses cat photos combined with a layering technique that allows the student to peel away layers of the cat to reveal structures beneath the surface. <i>Anatomy...
5        [John Coburn grew up in the Hawaiian Islands, the seventh of six

In [139]:
# Every description is a list of strings: discard the empty pieces and glue
# the remaining ones together with single spaces, in one pass
df2.description = df2.description.apply(
    lambda parts: " ".join(piece for piece in parts if piece != "")
)
df2.iloc[0].description


'<b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids and singing along with simple melodies. This charming DVD contains 16 music videos featuring kids engaged in fun activities, from visiting animals at the zoo to comparing clothing sizes in grandmas closet. Each video features an original song of authentic Latin rhythms that gets kids singing along with the children on screen. As they watch, listen, and sing along, kids absorb 300 Spanish words, each of which is shouted out in a song and displayed as a subtitle on screen.'

In [140]:
# A lot of the descriptions (and other features) contain HTML.
# The function parses and "translates" into plain text descriptions more suitable for analysis.

# Stop-word set and stemmer are built on first use and then shared by every
# preprocess_data call, instead of being re-created for each description.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the cached (stop_words, stemmer) pair, creating it on the first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_data(s):
    """Convert a raw (possibly HTML) description into a normalized string of stems.

    Steps, in order: strip HTML markup, remove URLs, replace line breaks/tabs
    and leftover "&nbsp" with spaces, replace punctuation with spaces, delete
    digits, lowercase, tokenize, drop English stop words, Porter-stem each
    remaining token, and join the stems with single spaces.

    Parameters:
        s: the description text to clean.

    Returns:
        The cleaned description as one space-separated string. An empty string
        is returned when `s` is empty/whitespace-only or when lxml cannot
        parse it (etree.ParserError).
    """
    if not s or s.isspace():
        return ''
    stop_words, stemmer = _preprocess_resources()
    try:
        # remove html tags (only this step can raise etree.ParserError)
        strr = str(html.fromstring(s).text_content())
    except etree.ParserError:
        return ''
    # remove URLs
    strr = re.sub(r"(https|http|href)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b", ' ', strr)
    # remove html hidden characters
    strr = strr.replace('\n', ' ').replace('\t', ' ').replace("&nbsp", ' ').replace('\r', ' ')
    # remove punctuation (replaced by a space so neighbouring words stay separate)
    strr = re.sub(r'[^\w\s]|[_+]', ' ', strr)
    # remove numbers (deleted outright, so "20th" becomes "th")
    strr = re.sub(r'\d+', '', strr)
    # lowercase
    strr = strr.lower()
    # tokenize, remove stop words and stem what is left
    tokens = nltk.word_tokenize(strr)
    stems = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stems)

# Optional debug dump of the raw descriptions (disabled):
# f = open("descriptionHTMLbefore.txt", "w")
# for i in range(5000):
#     f.write(df2.iloc[i].description)
# f.close()

print("Example of description before preprocessing: ")
print(df2["description"].head(2))
# Replace every raw description with its cleaned, stemmed form
df2.description = df2["description"].apply(preprocess_data)
print()
print("Example of description after preprocessing: ")
print(df2["description"].head(2))

# Optional debug dump of the cleaned descriptions (disabled):
# f = open("descriptionHTMLafter.txt", "w")
# for i in range(5000):
#     f.write(df2.iloc[i].description)
# f.close()





Example of description before preprocessing: 
1    <b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids a...
2    <b>Connect is the only integrated learning system that empowers students by continuously adapting to deliver precisely what they need, when they need it, how they need it, so that your class time is more engaging and effective.</b><br /> Kelly Cowan just celebrated her 20th anniversary at Miami ...
Name: description, dtype: object

Example of description after preprocessing: 
1    latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fu

In [141]:
# input = "yewfvn8934fwejnchlvu8h34;vjcek.bi;/kpofu90[q89y7o2g4uioeprwhttps://dsjkvby8ft7ogy3jkn2puvg4nchlvu8h34;vjcek.bi;/kpofsd"
# print(preprocess_data(input))

In [142]:
# # stop_words = set(stopwords.words('english'))
# # print(stop_words)
# input = "she likes to go to take coffee every morning while coming back home"
# stemmer= PorterStemmer()
# print(stemmer.stem(input))
# print(stemmer.stem("every"))

# tokens = nltk.word_tokenize(input)
# # strr = [i for i in tokens if not i in stop_words]
# strr = [stemmer.stem(word) for word in tokens]  
# print(strr)

### Does any product contain different descriptions?  
Some products are not unique: the same asin appears more than once with an identical description. 
We process the data so that each product appears only once.

In [143]:
# Step 1: count the rows per "asin" and keep the identifiers that occur more than once
asin_count = df2['asin'].value_counts()
asin_more_than_once = asin_count.loc[asin_count > 1].index
# Step 2: keep only the rows of those repeated products (asin and description columns),
# ordered by asin so that duplicated rows sit next to each other
filtered_df = df2.loc[
    df2['asin'].isin(asin_more_than_once), ["asin", "description"]
].sort_values(by="asin")
# Visual confirmation of duplicates
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
6028,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
878,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
6029,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
879,B00000J0GM,magellan gp datasend cd rom data manag softwar transfer point interest data gp point use datasend cd rom magellan gp gp global posit system receiv find north american point interest use product first find inform need datasend cd use pc laptop cd rom drive among point interest avail cd museum amu...
...,...,...
6025,B0005MYHMU,discov histori klondik gold rush juli unit state throe depress peopl begin lose hope north came astonish news gold rush ten thousand peopl caught gold fever set remot yukon area northwest canada join rushth yukon trail begin wharf seattl washington jump point thousand stamped your smart persist ...
6027,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...
11178,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...
6026,B0005MYJ0A,cd softwar teach geographi africa engag enjoy comput learn environ


In [144]:
# Rows whose "asin" and "description" both match are collapsed into one
filtered_df = filtered_df.drop_duplicates()

# How many unique "asin" remain? Printed explicitly, because a notebook cell
# only displays its last expression and a bare len(...) here would be discarded.
print("Unique asin among the repeated products:", len(filtered_df.asin.unique()))
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
878,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
879,B00000J0GM,magellan gp datasend cd rom data manag softwar transfer point interest data gp point use datasend cd rom magellan gp gp global posit system receiv find north american point interest use product first find inform need datasend cd use pc laptop cd rom drive among point interest avail cd museum amu...
881,B00000JIXG,academ version microsoft offic microsoft offic establish posit effici suit applic document creation commun busi inform analysi mani function busi platform evolv paper web microsoft offic extend desktop product web streamlin way work make easier share access analyz inform get better result offic ...
880,B00000JIXM,profession upgrad includ microsoft word word processor microsoft excel spreadsheet microsoft publish desktop publish microsoft access databas manag microsoft powerpoint present graphic mani featur microsoft offic establish posit effici suit applic document creation commun busi inform analysi man...
...,...,...
6024,B0004N37GW,protect child inappropri emailproduct informationth softwar never young plug toth wire world e commun kidmail safeti perfect way letchildren young year old join onlin convers anim desktop theme featur everyth dinosaur rocket shipskidmail safeti allow youngster dress email theme iconsbackground i...
11174,B0004O05WK,rollercoast tycoon time twister expans pack
11176,B0005MYHMU,discov histori klondik gold rush juli unit state throe depress peopl begin lose hope north came astonish news gold rush ten thousand peopl caught gold fever set remot yukon area northwest canada join rushth yukon trail begin wharf seattl washington jump point thousand stamped your smart persist ...
6027,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...


Remove the duplicate products (rows with identical descriptions) -> now each product is unique

In [145]:
# Keep only the asin and description columns, with a single row per distinct
# description; the trailing copy() leaves an independent frame to add columns to
df_asin_description = (
    df2[["asin", "description"]]
    .drop_duplicates(subset="description")
    .copy()
)
# print(len(df_asin_description))
df_asin_description

Unnamed: 0,asin,description
1,0071480935,latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...
2,007329506X,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...
4,0073525758,anatomi physiolog reveal cat ultim onlin interact cat dissect experi state art program use cat photo combin layer techniqu allow student peel away layer cat reveal structur beneath surfac anatomi physiolog reveal cat also offer anim histolog radiolog imag audio pronunci comprehens quizz use part...
5,0077340701,john coburn grew hawaiian island seventh sixteen children receiv associ art degre windward commun colleg graduat honor receiv bachelor degre educ univers hawaii lure busi world five year return first love accept teach posit high school mathemat recogn teacher year soon afterward decis made seek ...
7,0077410297,live art approach art appreci support student acquisit essenti skill cours mark getlein vivid narr concert mcgraw hill power adapt learn program learnsmart connect art help student understand analyz appreci way art work commun us visual world live art provid foundat life long appreci art critic ...
...,...,...
26784,B01HD1CQPK,note pleas compar detail size buy use similar cloth compar size size detail size bust cm shoulder cm sleev cm length cm size bust cm shoulder cm sleev cm length cm size l bust cm shoulder cm sleev cm length cm size xl bust cm shoulder cm sleev cm length cm
26785,B01HEFZJC2,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...
26787,B01HF3G4BS,mac internet secur x contain two best sell secur product protect mac malwar network attack intego virusbarri x intego netbarri x togeth ensur mac protect adwar malwar stranger unknown applic tri get design specif mac provid around clock protect detect divers array threat make sure mac given best...
26788,B01HF41TKI,versacheck x quickbook dna secur compliant us canadian bank requir compat versacheck versaink versaton


### Creating shingles

In [146]:
# Given a string input, return the list of shingles
def shingle(s, q, delimiter=' '):
    """Return the distinct q-shingles of `s` as a list (order not guaranteed).

    With a non-empty `delimiter` the string is split on it and each shingle is
    a run of `q` consecutive tokens re-joined with that delimiter. With an
    empty `delimiter` the shingles are runs of `q` consecutive characters.
    An input with fewer than `q` tokens yields an empty list.
    """
    # An empty delimiter means character-level shingles: slice the string itself
    tokens = s if delimiter == '' else s.split(delimiter)
    window_count = len(tokens) - q + 1
    unique_shingles = {
        delimiter.join(tokens[start:start + q]) for start in range(window_count)
    }
    return list(unique_shingles)

Apply shingles to our dataframe

In [147]:
# Attach to every product the list of 3-word shingles built from its description
df_asin_description["shingles"] = df_asin_description["description"].apply(
    lambda description: shingle(description, q=3)
)
# df_asin_description

### Similarity of sets
Computing Jaccard similarity

In [148]:
# function that takes an intersection set and a union set and returns the Jaccard similarity
def similarity(intersection_set, union_set):
    """Return the Jaccard similarity |intersection| / |union|.

    Parameters:
        intersection_set: the elements shared by the two compared sets.
        union_set: the elements present in either of the two compared sets.

    Returns:
        A float: len(intersection_set) / len(union_set), or 0.0 when the
        union is empty (both compared sets had no shingles), which would
        otherwise raise ZeroDivisionError.
    """
    if len(union_set) == 0:
        # Two empty shingle sets carry no evidence of similarity
        return 0.0
    return len(intersection_set) / len(union_set)

In [149]:
# input = "In the dynamic landscape of higher education, universities are continually redefining the traditional boundaries of learning. The integration of arts, music, and literature has become a cornerstone in fostering a holistic educational experience. At the heart of this transformation is the commitment to connect students with a diverse range of disciplines, preparing them not only for academic success but also for a life enriched by creativity and cultural understanding. In this context, universities such as New School are pioneering integrated learning models that transcend conventional subject silos. Their innovative approach, backed by cutting-edge teaching methodologies, empowers students to explore the intersections of arts, music, and literature. The vision goes beyond a mere confluence of disciplines; it seeks to create an immersive educational environment where students can seamlessly weave their academic pursuits into the fabric of their daily lives. One key player in this educational evolution is McGraw, a renowned arts author whose work has become a guiding light for both educators and students alike. McGraw's contributions extend beyond the conventional boundaries of a university classroom, resonating with a global audience. His writings not only inspire a love for the arts but also emphasize the transformative power of integrated learning in shaping well-rounded individuals. The concept of an integrated learning environment transcends the boundaries of time and space. It is not confined to the four walls of a classroom; rather, it permeates every facet of a student's journey. In this dynamic world, students are no longer passive recipients of knowledge but active participants in a vibrant community of learners. The university becomes a nexus where diverse ideas converge, fostering a collaborative spirit that extends far beyond graduation. In this interconnected world, the New School's commitment to integrated learning is a beacon of innovation. 
Students are not just acquiring knowledge; they are forging connections between seemingly disparate fields, discovering the harmonies between arts and sciences, and navigating the rhythms of a multicultural world. This transformative journey prepares them to navigate the complexities of the modern world with a deep appreciation for diversity and a keen sense of intellectual curiosity. As we stand at the intersection of arts, music, and literature, the integrated learning paradigm championed by universities like New School, guided by visionary authors such as McGraw, is shaping the future of education. It is a testament to the idea that learning is not a compartmentalized experience but a symphony of knowledge, where every note, every discipline, plays a crucial role in the harmonious melody of life."

# Read the user's free-text product description; the context manager closes
# the file handle as soon as the text has been read.
with open("input.txt", "r") as file_input:
    # NOTE: the name `input` shadows the built-in input(); kept so existing references still work
    input = file_input.read()
# print(input)
# Clean the text exactly like the product descriptions, then cut it into 3-word shingles
user_description = preprocess_data(input)
user_description = shingle(user_description, 3)
# intersection_set = set(user_description).intersection(set(df_asin_description.shingles.iloc[0]))
# union_set = set(user_description).union(set(df_asin_description.shingles.iloc[0]))
# # perform similarity
# sim = similarity(intersection_set, union_set)
# print(sim)


In [150]:
# Build the user's shingle set once; it is the same for every row, so there
# is no need to rebuild it twice per product inside the lambda.
user_shingle_set = set(user_description)
# Jaccard similarity between the user's shingles and each product's shingles
df_asin_description["similarity"] = df_asin_description["shingles"].apply(
    lambda item_shingles: similarity(
        user_shingle_set.intersection(item_shingles),
        user_shingle_set.union(item_shingles),
    )
)
df_asin_description


Unnamed: 0,asin,description,shingles,similarity
1,0071480935,latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...,"[take giant step, word shout song, languag skill imit, kid sing along, authent latin rhythm, eight take giant, spanish combin two, along children screen, absorb spanish word, fun activ visit, origin song authent, four eight take, sing spanish sing, spanish sing watch, sing watch learn, featur ki...",0.0
2,007329506X,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...,"[class time engag, encount everyday situat, need class time, celebr th anniversari, teach microbiolog nonmajor, campu ohio receiv, microbiolog nonmajor especi, love microbiolog class, alli health student, love microbiolog pursu, adapt deliv precis, system empow student, nonmajor especi pre, rece...",0.0
4,0073525758,anatomi physiolog reveal cat ultim onlin interact cat dissect experi state art program use cat photo combin layer techniqu allow student peel away layer cat reveal structur beneath surfac anatomi physiolog reveal cat also offer anim histolog radiolog imag audio pronunci comprehens quizz use part...,"[one two semest, anatomi physiolog human, reveal cat also, quizz use part, avail stand alon, program use cat, also offer anim, scienc grayson counti, undergradu anatomi physiolog, dissect experi state, imag audio pronunci, beneath surfac anatomi, reveal structur beneath, structur beneath surfac,...",0.0
5,0077340701,john coburn grew hawaiian island seventh sixteen children receiv associ art degre windward commun colleg graduat honor receiv bachelor degre educ univers hawaii lure busi world five year return first love accept teach posit high school mathemat recogn teacher year soon afterward decis made seek ...,"[state nation confer, varieti topic love, kappa two nomin, nomin who among, full professor tenur, island seventh sixteen, year mathemat educ, master degre receiv, serv make learn, afterward decis made, john coburn grew, two nomin who, univers oklahoma last, decis made seek, floriss valley campu,...",0.0
7,0077410297,live art approach art appreci support student acquisit essenti skill cours mark getlein vivid narr concert mcgraw hill power adapt learn program learnsmart connect art help student understand analyz appreci way art work commun us visual world live art provid foundat life long appreci art critic ...,"[led interest precari, live art provid, studio museum assign, interest precari life, student far beyond, digit assess challeng, stori art integr, new york pursu, pursu advanc music, classroom chosen career, critic think skill, critic analysi studio, school quickli becam, antholog world literatur...",0.0
...,...,...,...,...
26784,B01HD1CQPK,note pleas compar detail size buy use similar cloth compar size size detail size bust cm shoulder cm sleev cm length cm size bust cm shoulder cm sleev cm length cm size l bust cm shoulder cm sleev cm length cm size xl bust cm shoulder cm sleev cm length cm,"[use similar cloth, size size detail, cm sleev cm, cloth compar size, pleas compar detail, shoulder cm sleev, xl bust cm, length cm size, size buy use, compar size size, l bust cm, cm size bust, cm length cm, buy use similar, size bust cm, size detail size, detail size buy, cm shoulder cm, cm si...",0.0
26785,B01HEFZJC2,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...,"[durabl long last, beauti fabul detail, white gold plate, style cloth great, match suitabl style, eleg breathtak design, pleas avoid collis, ideal gift girl, etc pleas wipe, surfac scratch pleas, pleas wipe soft, sleep etc pleas, sweat lot shower, real silver necklac, easi match suitabl, design ...",0.0
26787,B01HF3G4BS,mac internet secur x contain two best sell secur product protect mac malwar network attack intego virusbarri x intego netbarri x togeth ensur mac protect adwar malwar stranger unknown applic tri get design specif mac provid around clock protect detect divers array threat make sure mac given best...,"[virusbarri x intego, mac internet secur, x togeth ensur, mac os sierra, system requir processor, os x macintosh, secur time mac, sierra mac el, threat make sure, malwar network attack, none specifiedhard disk, mac yosemit mac, x contain two, product protect mac, os mac os, adwar malwar stranger...",0.0
26788,B01HF41TKI,versacheck x quickbook dna secur compliant us canadian bank requir compat versacheck versaink versaton,"[quickbook dna secur, dna secur compliant, compliant us canadian, bank requir compat, versacheck versaink versaton, versacheck x quickbook, canadian bank requir, requir compat versacheck, secur compliant us, x quickbook dna, us canadian bank, compat versacheck versaink]",0.0


Dataframe sorted by similarity

In [151]:

# Rank the products from the most to the least similar to the user's description
df_asin_description = df_asin_description.sort_values(by="similarity", ascending=False)
df_asin_description


# Optional export of the top rows (disabled):
# if os.path.exists("10RecommendedItems.csv"):
#   os.remove("10RecommendedItems.csv")
# df_asin_description[:11].to_csv('10RecommendedItems.csv', index=False)

Unnamed: 0,asin,description,shingles,similarity
14928,B000Q1RUA6,macdraft profession compil power draft illustr tool one easi use cad softwar packag got full complement draw tool includ line rectangl circl arc polygon curv parallel line freehand tool parallel line polygon tool make easi creat exterior interior wall import file easi adjust edit plu even run na...,"[includ line rectangl, profession alway right, arc polygon curv, make easi creat, intel mac macdraft, tiff gif jpeg, may import imag, power draft illustr, tool make easi, format fulli compat, compil power draft, jpeg psd pict, one easi use, illustr tool one, psd pict png, parallel line freehand,...",0.012987
13311,B000EXQ2DW,movi edit pro give profession easi use featur need creat best movi product softwar make snap anyon transform digit footag great look home video product make cut build special effect transit creat soundtrack cd dvd tool make easi creat anim chapter menu like seen dvd video record memori onto cd d...,"[anyon transform digit, marker flawless video, magix smart design, intuit use integr, featur need creat, video mix effect, make easi creat, transform digit footag, key object curv, tool make easi, make cut build, creat anim chapter, movi edit pro, design easi intuit, give profession easi, make s...",0.012579
25106,B00V1GFJWQ,version familiar offic applic word excel powerpoint includ new featur help creat commun work effici virtual anywher,"[includ new featur, excel powerpoint includ, familiar offic applic, help creat commun, effici virtual anywher, version familiar offic, new featur help, work effici virtual, commun work effici, powerpoint includ new, word excel powerpoint, creat commun work, applic word excel, offic applic word, ...",0.011628
11848,B00093NWVC,complet print pack design meet need come print suit tool requir produc mail label graphic busi card invoic paperwork give document person touch,"[print suit tool, graphic busi card, design meet need, busi card invoic, tool requir produc, complet print pack, card invoic paperwork, come print suit, pack design meet, requir produc mail, document person touch, give document person, print pack design, suit tool requir, paperwork give document...",0.010870
14032,B000ICKKSC,symantec pcanywher world lead remot control solut integr tool make easi helpdesk personnel resolv server workstat problem robust secur prevent unauthor access enterpris resourc,"[workstat problem robust, problem robust secur, lead remot control, server workstat problem, easi helpdesk personnel, world lead remot, control solut integr, tool make easi, robust secur prevent, helpdesk personnel resolv, pcanywher world lead, symantec pcanywher world, resolv server workstat, p...",0.010753
...,...,...,...,...
12184,B0009Y6F56,platform window xp publish topic packag mini retail box hit right note rapid flexibl comput base instruct instant play electr guitar express cd rom softwar suit readi take rudimentari rhythm rockin first lesson electr guitar offer comprehens beginn guid rhythm lead guitar cover everyth instrumen...,"[set finger posit, modern cd rom, tune guitar take, compat sound card, window higher pentium, tool novic profici, color screen resolut, beginn guid rhythm, profici instant play, bit color screen, guitarist alik tune, hd bit color, readi take rudimentari, approach arrang chord, comput base instru...",0.000000
12185,B0009Y6F74,read success express deliv best self pace instruct cd contain number easi follow lesson help develop read skill step step,"[instruct cd contain, pace instruct cd, success express deliv, read success express, cd contain number, self pace instruct, skill step step, deliv best self, read skill step, contain number easi, best self pace, lesson help develop, help develop read, develop read skill, follow lesson help, expr...",0.000000
12186,B0009Y6EX4,middl school success delux introduc middl school success delux lead edg suit softwar tool person comput provid student uniqu divers fundament cours studi middl school success delux zenith self pace instruct cd rom offer award win educ endors content unriv breadth scope curriculum absorb activ ca...,"[provid student uniqu, award win educ, maintain motiv middl, delux learn system, motiv middl school, cours studi middl, school success delux, breadth scope curriculum, learn system repres, repres definit educ, instruct cd rom, pace instruct cd, delux zenith self, self pace instruct, endors conte...",0.000000
12187,B0009Y6EY8,power punch softwar window wall rail roof design home dream click mous power user friendlytech tool exclus instant home design cd rom develop especi home user seek design qualiti altern costli architectur plan instant home design avoid inaccuraci flat photo baseddo design program favor precis ge...,"[user seek design, devot major window, vacant lot virtual, handiwork tour tool, instal plumb electr, user friendli suit, home design take, access virtual content, finish tool also, design qualiti altern, friendlytech tool exclus, screen view option, grade specialti function, specialti function a...",0.000000


In [152]:
# Show the similarity scores (already ordered from best to worst match)
print("Similarity of items", df_asin_description["similarity"], sep="\n")

Similarity of items
14928    0.012987
13311    0.012579
25106    0.011628
11848    0.010870
14032    0.010753
           ...   
12184    0.000000
12185    0.000000
12186    0.000000
12187    0.000000
26789    0.000000
Name: similarity, Length: 17033, dtype: float64
