# Similar Items System
Program that reads the dataset, preprocess the data and output the most similar items based on a user description of a product.

In [25]:
import json
from collections import defaultdict
import gzip
import pandas as pd
from lxml import html,etree
import numpy as np
import ipywidgets as widgets
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import os
from preprocess_data import preprocess_data


# set stopwords vocabulary
nltk.download('stopwords')

# set tokenizer
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
# load the meta data
data = []
with gzip.open('Dataset/meta_Software.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))

# with gzip.open('Dataset/meta_CDs_and_Vinyl.json.gz') as f:
#     for l in f:
#         data.append(json.loads(l.strip()))

print("Total number of items in the dataset: ", len(data))

Total number of items in the dataset:  26790


In [27]:
# convert list into pandas dataframe
df2 = pd.DataFrame.from_dict(data)

# set size of display in pandas
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_rows', 20 )

# first row of the list
print("Columns of the dataset: ", df2.columns)

# show dataframe with columns and rows
# df.head()
# df2.info()


Columns of the dataset:  Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')


### Preprocess of the data

- Remove empty description
- Remove HTML tag
- Remove URLs
- Remove HTML hidden carachters
- Remove punctuation
- Remove numbers
- Transform every word into lowercase
- Remove stop words
- Perform stemming 

In [28]:
# Drop rows with no description (description is empty)
df2 = df2[df2['description'].map(lambda d: len(d)) > 0]
df2.description
# df2.head()


1        [, <b>Latin rhythms that will get your kids singing in Spanish</b>, <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other ki...
2        [<b>Connect is the only integrated learning system that empowers students by continuously adapting to deliver precisely what they need, when they need it, how they need it, so that your class time is more engaging and effective.</b><br />, Kelly Cowan just celebrated her 20th anniversary at Miam...
4        [<i>Anatomy &amp; Physiology Revealed Cat</i> is the ultimate online interactive cat dissection experience. This state-of-the-art program uses cat photos combined with a layering technique that allows the student to peel away layers of the cat to reveal structures beneath the surface. <i>Anatomy...
5        [John Coburn grew up in the Hawaiian Islands, the seventh of six

In [29]:
# each description is a list of strings,we want to remove the empty strings, and join the list of strings into one string
df2.description = df2.description.apply(lambda x: [string for string in x if string != ""])
df2.description = df2.description.apply(lambda x: " ".join(x))
df2.iloc[0].description


'<b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids and singing along with simple melodies. This charming DVD contains 16 music videos featuring kids engaged in fun activities, from visiting animals at the zoo to comparing clothing sizes in grandmas closet. Each video features an original song of authentic Latin rhythms that gets kids singing along with the children on screen. As they watch, listen, and sing along, kids absorb 300 Spanish words, each of which is shouted out in a song and displayed as a subtitle on screen.'

In [30]:



# f = open("descriptionHTMLbefore.txt", "w")
# for i in range(5000):
#     f.write(df2.iloc[i].description)
# f.close()
df_similarity_scores = df2.copy()

print("Example of description before preprocessing: ")
print(df2.description.iloc[0:2])
df2.description = df2.description.apply(lambda x: preprocess_data(x))
print()
print("Example of description after preprocessing: ")
print(df2.description.iloc[0:2])

# f = open("descriptionHTMLafter.txt", "w")
# for i in range(5000):
#     f.write(df2.iloc[i].description)
# f.close()





Example of description before preprocessing: 
1    <b>Latin rhythms that will get your kids singing in Spanish</b> <i>Sing, Watch, and Learn Spanish</i> helps your kids ages four through eight take a giant step in learning Spanish by combining two time-honored methods kids have always used to develop their language skills: Imitating other kids a...
2    <b>Connect is the only integrated learning system that empowers students by continuously adapting to deliver precisely what they need, when they need it, how they need it, so that your class time is more engaging and effective.</b><br /> Kelly Cowan just celebrated her 20th anniversary at Miami ...
Name: description, dtype: object



Example of description after preprocessing: 
1    latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...
2    connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...
Name: description, dtype: object


In [31]:
# # stop_words = set(stopwords.words('english'))
# # print(stop_words)
# input = "she likes to go to take coffee every morning while coming back home"
# stemmer= PorterStemmer()
# print(stemmer.stem(input))
# print(stemmer.stem("every"))

# tokens = nltk.word_tokenize(input)
# # strr = [i for i in tokens if not i in stop_words]
# strr = [stemmer.stem(word) for word in tokens]  
# print(strr)

### Does any product contain different descriptions?  
There exists products which are not unique. The asin and the descriptions are duplicated. 
We process the data in order to have unique products.

In [32]:
# Counting occurence of unique "asin"
asin_count = df2['asin'].value_counts()
# print(asin_count)
asin_more_than_once = asin_count[asin_count > 1].index
# print(asin_more_than_once)
# Step 2: Filter df2 to keep rows where 'asin' is in asin_more_than_once
filtered_df = df2[df2['asin'].isin(asin_more_than_once)]
filtered_df = filtered_df[["asin","description"]].sort_values(by="asin")
# Visual confirmation of duplicates 
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
6028,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
878,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
6029,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
879,B00000J0GM,magellan gp datasend cd rom data manag softwar transfer point interest data gp point use datasend cd rom magellan gp gp global posit system receiv find north american point interest use product first find inform need datasend cd use pc laptop cd rom drive among point interest avail cd museum amu...
...,...,...
6025,B0005MYHMU,discov histori klondik gold rush juli unit state throe depress peopl begin lose hope north came astonish news gold rush ten thousand peopl caught gold fever set remot yukon area northwest canada join rushth yukon trail begin wharf seattl washington jump point thousand stamped your smart persist ...
6027,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...
11178,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...
6026,B0005MYJ0A,cd softwar teach geographi africa engag enjoy comput learn environ


In [33]:
# If "asin" and "description" match -> drop
filtered_df.drop_duplicates(inplace=True)

# How many unique "asin" ?
len(filtered_df.asin.unique())
filtered_df

Unnamed: 0,asin,description
877,B00000IV94,set comput game share game featur world famou set game comput version offer ten player opportun play togeth one comput solitair comput object game identifi set three card card uniqu four featur number symbol diamond squiggl oval shade solid stripe open color red green purpl set consist three car...
878,B00000IWIZ,star war episod gungan frontier first ever star war ecolog simul game design player older take place episod time period player must transform barren moon naboo thrive balanc ecosystem help gungan build new underwat bubbl citi mantari amphibi transport player control releas fantast creatur exot p...
879,B00000J0GM,magellan gp datasend cd rom data manag softwar transfer point interest data gp point use datasend cd rom magellan gp gp global posit system receiv find north american point interest use product first find inform need datasend cd use pc laptop cd rom drive among point interest avail cd museum amu...
881,B00000JIXG,academ version microsoft offic microsoft offic establish posit effici suit applic document creation commun busi inform analysi mani function busi platform evolv paper web microsoft offic extend desktop product web streamlin way work make easier share access analyz inform get better result offic ...
880,B00000JIXM,profession upgrad includ microsoft word word processor microsoft excel spreadsheet microsoft publish desktop publish microsoft access databas manag microsoft powerpoint present graphic mani featur microsoft offic establish posit effici suit applic document creation commun busi inform analysi man...
...,...,...
6024,B0004N37GW,protect child inappropri emailproduct informationth softwar never young plug toth wire world e commun kidmail safeti perfect way letchildren young year old join onlin convers anim desktop theme featur everyth dinosaur rocket shipskidmail safeti allow youngster dress email theme iconsbackground i...
11174,B0004O05WK,rollercoast tycoon time twister expans pack
11176,B0005MYHMU,discov histori klondik gold rush juli unit state throe depress peopl begin lose hope north came astonish news gold rush ten thousand peopl caught gold fever set remot yukon area northwest canada join rushth yukon trail begin wharf seattl washington jump point thousand stamped your smart persist ...
6027,B0005MYI4W,maker oregon trail seri storybook weaver delux pack thousand stori start imag stimul creativ write english spanish hour creativ possibl text speech featur let hear stori read aloud educ benefit improv write storytel skill illustr stori hundr imag trigger imagin thousand scene color pattern combi...


Removing the duplicates products -> now each product is unique

In [34]:
df_asin_description = df2[["asin","description"]].copy()
df_asin_description.drop_duplicates(subset = "description", inplace=True)
# print(len(df_asin_description))
df_asin_description

Unnamed: 0,asin,description
1,0071480935,latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...
2,007329506X,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...
4,0073525758,anatomi physiolog reveal cat ultim onlin interact cat dissect experi state art program use cat photo combin layer techniqu allow student peel away layer cat reveal structur beneath surfac anatomi physiolog reveal cat also offer anim histolog radiolog imag audio pronunci comprehens quizz use part...
5,0077340701,john coburn grew hawaiian island seventh sixteen children receiv associ art degre windward commun colleg graduat honor receiv bachelor degre educ univers hawaii lure busi world five year return first love accept teach posit high school mathemat recogn teacher year soon afterward decis made seek ...
7,0077410297,live art approach art appreci support student acquisit essenti skill cours mark getlein vivid narr concert mcgraw hill power adapt learn program learnsmart connect art help student understand analyz appreci way art work commun us visual world live art provid foundat life long appreci art critic ...
...,...,...
26784,B01HD1CQPK,note pleas compar detail size buy use similar cloth compar size size detail size bust cm shoulder cm sleev cm length cm size bust cm shoulder cm sleev cm length cm size l bust cm shoulder cm sleev cm length cm size xl bust cm shoulder cm sleev cm length cm
26785,B01HEFZJC2,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...
26787,B01HF3G4BS,mac internet secur x contain two best sell secur product protect mac malwar network attack intego virusbarri x intego netbarri x togeth ensur mac protect adwar malwar stranger unknown applic tri get design specif mac provid around clock protect detect divers array threat make sure mac given best...
26788,B01HF41TKI,versacheck x quickbook dna secur compliant us canadian bank requir compat versacheck versaink versaton


### Creating shingles

In [35]:
# Given a string input, return the list of shingles
def shingle(s, q, delimiter=' '):
    all_shingles = []
    if delimiter != '':
        words_list = s.split(delimiter)
    else:
        words_list = s
    for i in range (len(words_list)-q+1):
        all_shingles.append(delimiter.join(words_list[i:i+q]))
    return list(set(all_shingles))

Apply shingles to out dataframe

In [36]:
# Apply shingles to the df_asin_description
df_asin_description["shingles"] = df_asin_description["description"].apply(lambda x: shingle(x, 3))
# df_asin_description

### Similarity of sets
Computing Jaccuard similarity

In [37]:
# function that takes an intersection set and a union set and returns the Jaccard similarity
def similarity(intersection_set, union_set):
    return len(intersection_set)/len(union_set)

In [38]:
# input = "In the dynamic landscape of higher education, universities are continually redefining the traditional boundaries of learning. The integration of arts, music, and literature has become a cornerstone in fostering a holistic educational experience. At the heart of this transformation is the commitment to connect students with a diverse range of disciplines, preparing them not only for academic success but also for a life enriched by creativity and cultural understanding. In this context, universities such as New School are pioneering integrated learning models that transcend conventional subject silos. Their innovative approach, backed by cutting-edge teaching methodologies, empowers students to explore the intersections of arts, music, and literature. The vision goes beyond a mere confluence of disciplines; it seeks to create an immersive educational environment where students can seamlessly weave their academic pursuits into the fabric of their daily lives. One key player in this educational evolution is McGraw, a renowned arts author whose work has become a guiding light for both educators and students alike. McGraw's contributions extend beyond the conventional boundaries of a university classroom, resonating with a global audience. His writings not only inspire a love for the arts but also emphasize the transformative power of integrated learning in shaping well-rounded individuals. The concept of an integrated learning environment transcends the boundaries of time and space. It is not confined to the four walls of a classroom; rather, it permeates every facet of a student's journey. In this dynamic world, students are no longer passive recipients of knowledge but active participants in a vibrant community of learners. The university becomes a nexus where diverse ideas converge, fostering a collaborative spirit that extends far beyond graduation. In this interconnected world, the New School's commitment to integrated learning is a beacon of innovation. Students are not just acquiring knowledge; they are forging connections between seemingly disparate fields, discovering the harmonies between arts and sciences, and navigating the rhythms of a multicultural world. This transformative journey prepares them to navigate the complexities of the modern world with a deep appreciation for diversity and a keen sense of intellectual curiosity. As we stand at the intersection of arts, music, and literature, the integrated learning paradigm championed by universities like New School, guided by visionary authors such as McGraw, is shaping the future of education. It is a testament to the idea that learning is not a compartmentalized experience but a symphony of knowledge, where every note, every discipline, plays a crucial role in the harmonious melody of life."

file_input = open("input.txt", "r")
input = file_input.read()
# print(input)
user_description = preprocess_data(input)
user_description = shingle(user_description, 3)  
# intersection_set = set(user_description).intersection(set(df_asin_description.shingles.iloc[0]))
# union_set = set(user_description).union(set(df_asin_description.shingles.iloc[0]))
# # perform similarity
# sim = similarity(intersection_set, union_set)
# print(sim)


In [39]:
# df_asin_description
df_asin_description["similarity"] = df_asin_description["shingles"].apply(lambda x: similarity(set(user_description).intersection(set(x)), set(user_description).union(set(x))))
df_asin_description


Unnamed: 0,asin,description,shingles,similarity
1,0071480935,latin rhythm get kid sing spanish sing watch learn spanish help kid age four eight take giant step learn spanish combin two time honor method kid alway use develop languag skill imit kid sing along simpl melodi charm dvd contain music video featur kid engag fun activ visit anim zoo compar cloth ...,"[help kid age, alway use develop, song display subtitl, time honor method, screen watch listen, honor method kid, video featur origin, spanish help kid, watch listen sing, four eight take, simpl melodi charm, display subtitl screen, kid sing along, age four eight, engag fun activ, use develop la...",0.0
2,007329506X,connect integr learn system empow student continu adapt deliv precis need need need class time engag effect kelli cowan celebr th anniversari miami univers middletown open admiss campu ohio receiv ph univers louisvil later work univers maryland univers groningen netherland special teach microbio...,"[connect integr learn, ph univers louisvil, alli health student, love microbiolog pursu, nurs dental hygienist, love microbiolog class, need class time, campu ohio receiv, time engag effect, empow student continu, univers maryland univers, engag effect kelli, anniversari miami univers, pursu und...",0.0
4,0073525758,anatomi physiolog reveal cat ultim onlin interact cat dissect experi state art program use cat photo combin layer techniqu allow student peel away layer cat reveal structur beneath surfac anatomi physiolog reveal cat also offer anim histolog radiolog imag audio pronunci comprehens quizz use part...,"[counti colleg graduat, techniqu allow student, physiolog human anatomi, layer cat reveal, cat reveal structur, art program use, ultim onlin interact, colleg graduat studi, anim histolog radiolog, quizz use part, avail stand alon, program use cat, imag audio pronunci, combin mcgraw hill, onlin i...",0.0
5,0077340701,john coburn grew hawaiian island seventh sixteen children receiv associ art degre windward commun colleg graduat honor receiv bachelor degre educ univers hawaii lure busi world five year return first love accept teach posit high school mathemat recogn teacher year soon afterward decis made seek ...,"[receiv bachelor degre, recogn teacher year, greater st loui, wide varieti topic, thing beauti hope, megsl made numer, come write serv, island seventh sixteen, year later univers, st loui megsl, degre receiv two, univers hawaii lure, among america teacher, bachelor degre educ, theta kappa two, y...",0.0
7,0077410297,live art approach art appreci support student acquisit essenti skill cours mark getlein vivid narr concert mcgraw hill power adapt learn program learnsmart connect art help student understand analyz appreci way art work commun us visual world live art provid foundat life long appreci art critic ...,"[quickli becam equal, seri ground break, literatur longman antholog, world live art, essenti skill cours, benefit student far, appreci support student, write assign ultim, assign ultim prepar, prepar activ class, student invit activ, learnsmart connect art, harpercollin world reader, guid write ...",0.0
...,...,...,...,...
26784,B01HD1CQPK,note pleas compar detail size buy use similar cloth compar size size detail size bust cm shoulder cm sleev cm length cm size bust cm shoulder cm sleev cm length cm size l bust cm shoulder cm sleev cm length cm size xl bust cm shoulder cm sleev cm length cm,"[l bust cm, detail size buy, pleas compar detail, size bust cm, size detail size, shoulder cm sleev, compar size size, cm size bust, length cm size, note pleas compar, buy use similar, cloth compar size, compar detail size, bust cm shoulder, xl bust cm, use similar cloth, cm size xl, cm length c...",0.0
26785,B01HEFZJC2,featur beauti fabul detail real silver necklac truli eleg breathtak design look gorgeou amaz white gold plate design durabl long last easi match suitabl style cloth great detail good person jewelri collect perfect occas anniversari engag parti meet date wed daili wear etc ideal gift girl lover f...,"[look gorgeou amaz, last easi match, girl lover friend, plate design durabl, detail good person, avoid surfac scratch, contact acid alkali, sweat lot shower, ideal gift girl, design look gorgeou, match suitabl style, anniversari engag parti, easi match suitabl, great detail good, real silver nec...",0.0
26787,B01HF3G4BS,mac internet secur x contain two best sell secur product protect mac malwar network attack intego virusbarri x intego netbarri x togeth ensur mac protect adwar malwar stranger unknown applic tri get design specif mac provid around clock protect detect divers array threat make sure mac given best...,"[protect mac malwar, mac maverick mac, intego netbarri x, card none specifiedsupport, provid around clock, mac el capitan, sell secur product, mac os x, none specifiedhard disk, best sell secur, requir mac recommend, processor intel core, maverick mac mountain, sierra mac el, best possibl secur,...",0.0
26788,B01HF41TKI,versacheck x quickbook dna secur compliant us canadian bank requir compat versacheck versaink versaton,"[compat versacheck versaink, requir compat versacheck, bank requir compat, versacheck versaink versaton, us canadian bank, canadian bank requir, dna secur compliant, secur compliant us, x quickbook dna, versacheck x quickbook, compliant us canadian, quickbook dna secur]",0.0


Dataframe sorted by similarity

In [40]:

df_asin_description.sort_values(by="similarity", ascending=False, inplace=True)
df_asin_description


# if os.path.exists("10RecommendedItems.csv"):
#   os.remove("10RecommendedItems.csv")
# df_asin_description[:11].to_csv('10RecommendedItems.csv', index=False)

Unnamed: 0,asin,description,shingles,similarity
14928,B000Q1RUA6,macdraft profession compil power draft illustr tool one easi use cad softwar packag got full complement draw tool includ line rectangl circl arc polygon curv parallel line freehand tool parallel line polygon tool make easi creat exterior interior wall import file easi adjust edit plu even run na...,"[tiff quicktim support, make easi creat, one easi use, full complement draw, program support dwg, pict tiff gif, packag got full, job whatev may, illustr tool one, imag pict tiff, support format fulli, profession alway right, fulli compat autocad, import file easi, tiff gif jpeg, macdraft profes...",0.012987
13311,B000EXQ2DW,movi edit pro give profession easi use featur need creat best movi product softwar make snap anyon transform digit footag great look home video product make cut build special effect transit creat soundtrack cd dvd tool make easi creat anim chapter menu like seen dvd video record memori onto cd d...,"[make easi creat, recognit chapter marker, new anim vh, fast select sort, dvd menu realist, menu like seen, snap anyon transform, menu realist imag, marker flawless video, select sort dvd, softwar make snap, pro give profession, video product make, movi edit pro, font new anim, like seen dvd, an...",0.012579
25106,B00V1GFJWQ,version familiar offic applic word excel powerpoint includ new featur help creat commun work effici virtual anywher,"[effici virtual anywher, work effici virtual, offic applic word, word excel powerpoint, familiar offic applic, powerpoint includ new, help creat commun, excel powerpoint includ, includ new featur, new featur help, version familiar offic, commun work effici, applic word excel, creat commun work, ...",0.011628
11848,B00093NWVC,complet print pack design meet need come print suit tool requir produc mail label graphic busi card invoic paperwork give document person touch,"[suit tool requir, paperwork give document, busi card invoic, print suit tool, pack design meet, document person touch, card invoic paperwork, print pack design, need come print, meet need come, invoic paperwork give, complet print pack, mail label graphic, give document person, label graphic bu...",0.010870
14032,B000ICKKSC,symantec pcanywher world lead remot control solut integr tool make easi helpdesk personnel resolv server workstat problem robust secur prevent unauthor access enterpris resourc,"[lead remot control, problem robust secur, secur prevent unauthor, remot control solut, integr tool make, helpdesk personnel resolv, control solut integr, tool make easi, resolv server workstat, world lead remot, prevent unauthor access, personnel resolv server, pcanywher world lead, workstat pr...",0.010753
...,...,...,...,...
12184,B0009Y6F56,platform window xp publish topic packag mini retail box hit right note rapid flexibl comput base instruct instant play electr guitar express cd rom softwar suit readi take rudimentari rhythm rockin first lesson electr guitar offer comprehens beginn guid rhythm lead guitar cover everyth instrumen...,"[compat pc mhz, basic techniqu use, guitar take solo, take rudimentari rhythm, guitar express cd, hd bit color, electr guitar profici, bit color screen, rudimentari rhythm rockin, rockin first lesson, guitarist alik tune, express cd rom, mhz faster mb, window xp publish, rom approach arrang, fin...",0.000000
12185,B0009Y6F74,read success express deliv best self pace instruct cd contain number easi follow lesson help develop read skill step step,"[number easi follow, easi follow lesson, develop read skill, best self pace, express deliv best, deliv best self, read skill step, skill step step, help develop read, read success express, instruct cd contain, follow lesson help, lesson help develop, pace instruct cd, success express deliv, self...",0.000000
12186,B0009Y6EX4,middl school success delux introduc middl school success delux lead edg suit softwar tool person comput provid student uniqu divers fundament cours studi middl school success delux zenith self pace instruct cd rom offer award win educ endors content unriv breadth scope curriculum absorb activ ca...,"[studi middl school, content unriv breadth, learn system repres, activ captiv user, success delux zenith, school success delux, success delux introduc, suit softwar tool, delux zenith self, system repres definit, educ achiev middl, middl school student, uniqu divers fundament, pace instruct cd, ...",0.000000
12187,B0009Y6EY8,power punch softwar window wall rail roof design home dream click mous power user friendlytech tool exclus instant home design cd rom develop especi home user seek design qualiti altern costli architectur plan instant home design avoid inaccuraci flat photo baseddo design program favor precis ge...,"[progress renderingtool exclus, pace three dimension, walkthrough whetherremodel one, tool let set, home realiti screen, grade specialti function, exclus instant home, room time user, punch softwar window, plan instant home, time user friendli, stori let instant, virtual content design, tool exc...",0.000000


In [42]:
print("Similarity of items")
print(df_asin_description.similarity)

Similarity of items
14928    0.012987
13311    0.012579
25106    0.011628
11848    0.010870
14032    0.010753
           ...   
12184    0.000000
12185    0.000000
12186    0.000000
12187    0.000000
26789    0.000000
Name: similarity, Length: 17033, dtype: float64


In [41]:
# print("Similarity of items")

# similarity_scored = df2[["asin","similarity"]].copy()
# print(df_asin_description.similarity)

Similarity of items


KeyError: "['similarity'] not in index"

In [None]:
# if os.path.exists("similarity_scored.txt"):
#     os.remove("similarity_scored.txt")
# file = open("similarity_scored.txt", "w")
# scores_list = df_asin_description.similarity.tolist()
# file.write(str(scores_list))
# file.close()