# Similar Items System
Program that reads the dataset, preprocess the data and output the most similar items based on a user description of a product.

In [208]:
import json
from collections import defaultdict
import gzip
import pandas as pd
from lxml import html,etree
import numpy as np
import ipywidgets as widgets
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import os
# import preprocess_data 
from preprocess_data import *
# from preprocess_data import user_description_sentiment_analysis


# set stopwords vocabulary
nltk.download('stopwords')

# set tokenizer
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ariannabianchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [209]:
# load the meta data
data = []
# with gzip.open('Dataset/meta_Software.json.gz') as f:
#     for l in f:
#         data.append(json.loads(l.strip()))

with gzip.open('Dataset/meta_Digital_Music.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))

print("Total number of items in the dataset: ", len(data))

Total number of items in the dataset:  74347


In [210]:
# convert list into pandas dataframe
df2 = pd.DataFrame.from_dict(data)

# set size of display in pandas
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_rows', 20 )

# first row of the list
print("Columns of the dataset: ", df2.columns)

# show dataframe with columns and rows
# df.head()
# df2.info()


Columns of the dataset:  Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'],
      dtype='object')


### Preprocess of the data

- Remove empty description
- Remove HTML tag
- Remove URLs
- Remove HTML hidden carachters
- Remove punctuation
- Remove numbers
- Transform every word into lowercase
- Remove stop words
- Perform stemming 

In [211]:
# Drop rows with no description (description is empty)
df2 = df2[df2['description'].map(lambda d: len(d)) > 0]
df2.description
# df2.head()


4        [1. Losing Game 2. I Can't Wait 3. Didn't He Shine 4. Never Seen...Righteous... 5. A Broken Heart 6. Looking Back 7. Here We Are 8. I Saw The Lord 9. Jesus Is A River Of Love 10. Hittin' The Road 11. I've Never Been Out Of... 12. Jesus Gotta Hold Of My Life 13. Saved- Saved- Saved 14. What Will ...
9                                                                                                                                                                                                                                                                                                                [.]
10       [The Music Connection by Silver Burdett Ginn is a teaching aid for  \nan elementary music or a homeroom teacher. Created by authorities  \nin Music, The Music Connection: by Silver Burdett provides an  \nexcellent foundation for Music studies. Silver Burdetts style is  \nsuited towards Music stu...
12                                                                       

In [212]:
# each description is a list of strings,we want to remove the empty strings, and join the list of strings into one string
df2.description = df2.description.apply(lambda x: [string for string in x if string != ""])
df2.description = df2.description.apply(lambda x: " ".join(x))
df2.iloc[0].description


"1. Losing Game 2. I Can't Wait 3. Didn't He Shine 4. Never Seen...Righteous... 5. A Broken Heart 6. Looking Back 7. Here We Are 8. I Saw The Lord 9. Jesus Is A River Of Love 10. Hittin' The Road 11. I've Never Been Out Of... 12. Jesus Gotta Hold Of My Life 13. Saved- Saved- Saved 14. What Will You Do? 15. Rise Again"

In [213]:
# preprocessing for user description and sentiment analysis 
def user_description_sentiment_analysis(s):
    stop_words = set(stopwords.words('english'))
    stemmer= PorterStemmer()
    if not s or s.isspace(): 
        return ''
    try:
        # remove html tags 
        strr = str(html.fromstring(s).text_content())
        # remove URLs
        strr = re.sub(r"(https|http|href)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b", ' ', strr)
        # remove html hidden carachters 
        strr = strr.replace('\n', ' ').replace('\t', ' ').replace("&nbsp", ' ').replace('\r', ' ')
        # remove punctuation
        strr = re.sub(r'[^\w\s]|[_+]', ' ', strr)
        # lowercase
        strr = strr.lower()
        # remove numbers
        strr = re.sub(r'\d+', '', strr)
        # remove stop words
        tokens = nltk.word_tokenize(strr)
        strr = [i for i in tokens if not i in stop_words]
        # print(len(strr))
        # if (len(strr) == 0):
        #     print("-------------------------")
        strr = ' '.join(strr)
        return strr
        # return str(html.fromstring(s).text_content(s))
    except etree.ParserError: # I am not able to find out why the error occur so i continued by catching the exception. Seem to happen on some empty description strings 
        return ''


Using the function preprocess_data(string) from the file ./preprocess_data.py
- cleaning text
- stop words
- stemming

In [214]:
df_similarity_scores = df2.copy()

print("Example of description before preprocessing: ")
print(df2.description.iloc[0:2])
df2.description = df2.description.apply(lambda x: preprocess_data(x))
print()
print("Example of description after preprocessing: ")
print(df2.description.iloc[0:2])

# f = open("descriptionHTMLafter.txt", "w")
# for i in range(5000):
#     f.write(df2.iloc[i].description)
# f.close()

Example of description before preprocessing: 
4    1. Losing Game 2. I Can't Wait 3. Didn't He Shine 4. Never Seen...Righteous... 5. A Broken Heart 6. Looking Back 7. Here We Are 8. I Saw The Lord 9. Jesus Is A River Of Love 10. Hittin' The Road 11. I've Never Been Out Of... 12. Jesus Gotta Hold Of My Life 13. Saved- Saved- Saved 14. What Will Y...
9                                                                                                                                                                                                                                                                                                              .
Name: description, dtype: object

Example of description after preprocessing: 
4    lose game wait shine never seen righteou broken heart look back saw lord jesu river love hittin road never jesu got ta hold life save save save rise
9                                                                                                              

In [215]:
# # stop_words = set(stopwords.words('english'))
# # print(stop_words)
# input = "she likes to go to take coffee every morning while coming back home"
# stemmer= PorterStemmer()
# print(stemmer.stem(input))
# print(stemmer.stem("every"))

# tokens = nltk.word_tokenize(input)
# # strr = [i for i in tokens if not i in stop_words]
# strr = [stemmer.stem(word) for word in tokens]  
# print(strr)

### Does any product contain different descriptions?  
There exists products which are not unique. The asin and the descriptions are duplicated. 
We process the data in order to have unique products.

In [216]:
# Counting occurence of unique "asin"
asin_count = df2['asin'].value_counts()
# print(asin_count)
asin_more_than_once = asin_count[asin_count > 1].index
# print(asin_more_than_once)
# Step 2: Filter df2 to keep rows where 'asin' is in asin_more_than_once
filtered_df = df2[df2['asin'].isin(asin_more_than_once)]
filtered_df = filtered_df[["asin","description"]].sort_values(by="asin")
# Visual confirmation of duplicates 
filtered_df

Unnamed: 0,asin,description
10293,B00000I8EO,one word tranquil son laid floor put cd within minut jello state son usual walk room readi bed time physic carri room say think like use sooth day stimul other would like use special day music therapi music help mani studi done cours discov exactli music alter mind emot physic state done researc...
1959,B00000I8EO,one word tranquil son laid floor put cd within minut jello state son usual walk room readi bed time physic carri room say think like use sooth day stimul other would like use special day music therapi music help mani studi done cours discov exactli music alter mind emot physic state done researc...
1954,B00000I8FQ,golf first ever origin music cd billi mac tee glorifi histori humor heart today popular game golf uniqu entertain music tribut world greatest sport collect golf song witti sentiment inspir memor music root good time piano pound sound billi hometown new orlean song run gamut beauti ballad regga r...
10288,B00000I8FQ,golf first ever origin music cd billi mac tee glorifi histori humor heart today popular game golf uniqu entertain music tribut world greatest sport collect golf song witti sentiment inspir memor music root good time piano pound sound billi hometown new orlean song run gamut beauti ballad regga r...
10294,B00000I8FV,kevin russel long consid one talent bay area base blue musician kevin russel play produc sever grammi win blue rock artist session work perform album critic acclaim nearli two decad kevin two tour three televis appear bruce springsteen saxophonist clarenc clemon well shot time join journey guita...
...,...,...
10274,B0004RX5UQ,song
18614,B0005F51YK,roadhous blue rock group roadhous friend live cd
10280,B0005F51YK,roadhous blue rock group roadhous friend live cd
10282,B0005ME08Q,christma patti label mahalia jackson


In [217]:
# If "asin" and "description" match -> drop
filtered_df.drop_duplicates(inplace=True)

# How many unique "asin" ?
len(filtered_df.asin.unique())
filtered_df

Unnamed: 0,asin,description
10293,B00000I8EO,one word tranquil son laid floor put cd within minut jello state son usual walk room readi bed time physic carri room say think like use sooth day stimul other would like use special day music therapi music help mani studi done cours discov exactli music alter mind emot physic state done researc...
1954,B00000I8FQ,golf first ever origin music cd billi mac tee glorifi histori humor heart today popular game golf uniqu entertain music tribut world greatest sport collect golf song witti sentiment inspir memor music root good time piano pound sound billi hometown new orlean song run gamut beauti ballad regga r...
10294,B00000I8FV,kevin russel long consid one talent bay area base blue musician kevin russel play produc sever grammi win blue rock artist session work perform album critic acclaim nearli two decad kevin two tour three televis appear bruce springsteen saxophonist clarenc clemon well shot time join journey guita...
1963,B00000I8HT,accoust pop collect includ piano vocalsong full band arrang mellow lush compel new presenc emerg singer songwrit genr bob leon songwrit hall fame
10291,B00000I8I5,contemporari jazz quartet guitar upright bass drum flugelhorn collect michael monsalv origin music record live studio featur l best jazz musician band member michael monsalv guitar jeff beal flugelhorn dave carpent upright bass except track dick weller drum larri steen bass track guitarist compo...
...,...,...
18601,B000456XPC,cd album
18605,B0004J5HHI,anna moffo canteloub villa lobo rachmaninoff canteloub song auvergn lantoueno pastourel laio de rotso bailero passo pel prat grand malur quo uno fenno brezairola villa lobo bachiana brasileira aria cantilena danza martelo rachmanioff vocalis op arr arcadi dubenski anna moffo soprano american sym...
18608,B0004RX5UQ,song
18614,B0005F51YK,roadhous blue rock group roadhous friend live cd


Removing the duplicates products -> now each product is unique

In [218]:
df_asin_description = df2[["asin","description"]].copy()
df_asin_description.drop_duplicates(subset = "description", inplace=True)
# print(len(df_asin_description))
df_asin_description

Unnamed: 0,asin,description
4,0001526146,lose game wait shine never seen righteou broken heart look back saw lord jesu river love hittin road never jesu got ta hold life save save save rise
9,0159024684,
10,0382262921,music connect silver burdett ginn teach aid elementari music homeroom teacher creat author music music connect silver burdett provid excel foundat music studi silver burdett style suit toward music studi teach student materi clearli without overcompl subject contain varieti record vocal track pe...
12,0545069882,spanish know gold edit learn spanish flash
13,0545109620,cd book long sinc vanish great condit classic
...,...,...
74336,B01HG2DW1I,track list butter ball zaq attack zona walk like guv sentiment pacif daylight trombon institut technolog san jose fog citi show crb trombon giant
74338,B01HH5R7LK,coldplay head full dream tour live etihad stadium manchest england june th cd intro head full dream yellow everi teardrop waterfal scientist bird paradis everglow lover japan magic clock midnight charli brown hymn weekend fix hero viva la vida cd adventur lifetim kaleidoscop troubl see soon amaz...
74339,B01HH68B96,known live version that way life goe steam blacktop witha demo version superfici love sang hughi instead chri hick
74342,B01HH7D5KU,free last southsid never gon lose purpl come southsid diamond africa southsid southsid compadr southsid march mad tarentino trap nigga southsid da fam da gram skit southsid night southsid total length


### Creating shingles

In [219]:
# Given a string input, return the list of shingles
def shingle(s, q, delimiter=' '):
    all_shingles = []
    if delimiter != '':
        words_list = s.split(delimiter)
    else:
        words_list = s
    for i in range (len(words_list)-q+1):
        all_shingles.append(delimiter.join(words_list[i:i+q]))
    return list(set(all_shingles))

Apply shingles to out dataframe

In [220]:
# Apply shingles to the df_asin_description
df_asin_description["shingles"] = df_asin_description["description"].apply(lambda x: shingle(x, 3))
# df_asin_description

### Similarity of sets
Computing Jaccuard similarity

In [221]:
# function that takes an intersection set and a union set and returns the Jaccard similarity
def similarity(intersection_set, union_set):
    return len(intersection_set)/len(union_set)

In [222]:
# input = "In the dynamic landscape of higher education, universities are continually redefining the traditional boundaries of learning. The integration of arts, music, and literature has become a cornerstone in fostering a holistic educational experience. At the heart of this transformation is the commitment to connect students with a diverse range of disciplines, preparing them not only for academic success but also for a life enriched by creativity and cultural understanding. In this context, universities such as New School are pioneering integrated learning models that transcend conventional subject silos. Their innovative approach, backed by cutting-edge teaching methodologies, empowers students to explore the intersections of arts, music, and literature. The vision goes beyond a mere confluence of disciplines; it seeks to create an immersive educational environment where students can seamlessly weave their academic pursuits into the fabric of their daily lives. One key player in this educational evolution is McGraw, a renowned arts author whose work has become a guiding light for both educators and students alike. McGraw's contributions extend beyond the conventional boundaries of a university classroom, resonating with a global audience. His writings not only inspire a love for the arts but also emphasize the transformative power of integrated learning in shaping well-rounded individuals. The concept of an integrated learning environment transcends the boundaries of time and space. It is not confined to the four walls of a classroom; rather, it permeates every facet of a student's journey. In this dynamic world, students are no longer passive recipients of knowledge but active participants in a vibrant community of learners. The university becomes a nexus where diverse ideas converge, fostering a collaborative spirit that extends far beyond graduation. In this interconnected world, the New School's commitment to integrated learning is a beacon of innovation. Students are not just acquiring knowledge; they are forging connections between seemingly disparate fields, discovering the harmonies between arts and sciences, and navigating the rhythms of a multicultural world. This transformative journey prepares them to navigate the complexities of the modern world with a deep appreciation for diversity and a keen sense of intellectual curiosity. As we stand at the intersection of arts, music, and literature, the integrated learning paradigm championed by universities like New School, guided by visionary authors such as McGraw, is shaping the future of education. It is a testament to the idea that learning is not a compartmentalized experience but a symphony of knowledge, where every note, every discipline, plays a crucial role in the harmonious melody of life."

file_input = open("input.txt", "r")
input = file_input.read()
# print(input)
user_description = preprocess_data(input)
user_description = shingle(user_description, 3)  
# intersection_set = set(user_description).intersection(set(df_asin_description.shingles.iloc[0]))
# union_set = set(user_description).union(set(df_asin_description.shingles.iloc[0]))
# # perform similarity
# sim = similarity(intersection_set, union_set)
# print(sim)


In [223]:
# df_asin_description
df_asin_description["similarity"] = df_asin_description["shingles"].apply(lambda x: similarity(set(user_description).intersection(set(x)), set(user_description).union(set(x))))
df_asin_description


Unnamed: 0,asin,description,shingles,similarity
4,0001526146,lose game wait shine never seen righteou broken heart look back saw lord jesu river love hittin road never jesu got ta hold life save save save rise,"[lose game wait, wait shine never, jesu river love, lord jesu river, hold life save, road never jesu, save save rise, river love hittin, broken heart look, ta hold life, love hittin road, shine never seen, look back saw, heart look back, saw lord jesu, game wait shine, jesu got ta, got ta hold, ...",0.0
9,0159024684,,[],0.0
10,0382262921,music connect silver burdett ginn teach aid elementari music homeroom teacher creat author music music connect silver burdett provid excel foundat music studi silver burdett style suit toward music studi teach student materi clearli without overcompl subject contain varieti record vocal track pe...,"[toward music studi, varieti record vocal, studi teach student, danc practic tempo, suit toward music, silver burdett style, perform track pick, music homeroom teacher, music studi teach, track pick track, teach aid elementari, burdett provid excel, subject contain varieti, music connect silver,...",0.0
12,0545069882,spanish know gold edit learn spanish flash,"[know gold edit, learn spanish flash, spanish know gold, gold edit learn, edit learn spanish]",0.0
13,0545109620,cd book long sinc vanish great condit classic,"[cd book long, sinc vanish great, vanish great condit, book long sinc, long sinc vanish, great condit classic]",0.0
...,...,...,...,...
74336,B01HG2DW1I,track list butter ball zaq attack zona walk like guv sentiment pacif daylight trombon institut technolog san jose fog citi show crb trombon giant,"[show crb trombon, track list butter, sentiment pacif daylight, zaq attack zona, zona walk like, jose fog citi, walk like guv, attack zona walk, ball zaq attack, technolog san jose, guv sentiment pacif, fog citi show, daylight trombon institut, institut technolog san, citi show crb, like guv sen...",0.0
74338,B01HH5R7LK,coldplay head full dream tour live etihad stadium manchest england june th cd intro head full dream yellow everi teardrop waterfal scientist bird paradis everglow lover japan magic clock midnight charli brown hymn weekend fix hero viva la vida cd adventur lifetim kaleidoscop troubl see soon amaz...,"[coldplay full super, scientist bird paradis, troubl see soon, full dream yellow, amaz day sky, adventur lifetim fix, everi teardrop waterfal, cd intro head, soon amaz day, star live nme, yellow adventur lifetim, head full dream, june th cd, la vida charli, full super bowl, adventur lifetim kale...",0.0
74339,B01HH68B96,known live version that way life goe steam blacktop witha demo version superfici love sang hughi instead chri hick,"[goe steam blacktop, live version that, way life goe, life goe steam, witha demo version, version superfici love, instead chri hick, blacktop witha demo, sang hughi instead, love sang hughi, hughi instead chri, version that way, that way life, known live version, steam blacktop witha, superfici ...",0.0
74342,B01HH7D5KU,free last southsid never gon lose purpl come southsid diamond africa southsid southsid compadr southsid march mad tarentino trap nigga southsid da fam da gram skit southsid night southsid total length,"[last southsid never, purpl come southsid, free last southsid, southsid diamond africa, southsid never gon, never gon lose, skit southsid night, tarentino trap nigga, southsid da fam, southsid compadr southsid, gram skit southsid, da fam da, come southsid diamond, nigga southsid da, southsid nig...",0.0


Dataframe sorted by similarity

In [224]:

df_asin_description.sort_values(by="similarity", ascending=False, inplace=True)
df_asin_description


# if os.path.exists("10RecommendedItems.csv"):
#   os.remove("10RecommendedItems.csv")
# df_asin_description[:11].to_csv('10RecommendedItems.csv', index=False)

Unnamed: 0,asin,description,shingles,similarity
10251,B00032N1V0,grand daddi tap cd short repeat piano orchestr select design meet need teach tap techniqu build routin note contain technic exercis tap fundament terminolog grade routin doubl length cd note avail,"[daddi tap cd, terminolog grade routin, piano orchestr select, meet need teach, length cd note, doubl length cd, tap techniqu build, orchestr select design, technic exercis tap, cd note avail, routin doubl length, tap fundament terminolog, tap cd short, need teach tap, repeat piano orchestr, con...",0.010000
66260,B00M8B98SO,music companion book best sell author max lucado collect wonder backdrop prayer person devot offer song chosen perfectli complement book also includ new song darlen zschech paul rita baloch written specif project,"[written specif project, chosen perfectli complement, darlen zschech paul, sell author max, author max lucado, backdrop prayer person, offer song chosen, song darlen zschech, new song darlen, includ new song, book best sell, music companion book, also includ new, companion book best, best sell a...",0.009901
53,0829736522,combin style rock ska zona releas new product song fill posit messag theme hope motiv touch live young peopl life chang way vision goal group album also includ new version neblina one hit previou product,"[vision goal group, song fill posit, new product song, product song fill, style rock ska, messag theme hope, group album also, version neblina one, releas new product, peopl life chang, ska zona releas, rock ska zona, live young peopl, also includ new, zona releas new, hit previou product, chang...",0.009615
58876,B00B18ULGS,livetun song collect year featur hatsun miku come also includ new song subject chang come bonu dvd unreleas music clip previou song four repres clip subject chang edit avail may dvd disc encod region japan europ middl east subtitl includ,"[collect year featur, hatsun miku come, featur hatsun miku, year featur hatsun, subject chang edit, new song subject, livetun song collect, chang come bonu, clip previou song, japan europ middl, edit avail may, europ middl east, region japan europ, includ new song, chang edit avail, also includ ...",0.009174
54425,B006BAVVFQ,meet alzheim companionship journey inform inspir holist introduct alzheim diseas uniqu audio resourc design meet need busi caregiv provid critic inform engag access format assimil distil mani book resourc meet alzheim compassion bring forth critic point care someon dementia goal reduc caregiv st...,"[poetri myth metaphor, alzheim mostli love, thing think much, provid critic inform, avila meet alzheim, receiv combin reflect, symptom alzheim cure, care receiv combin, find hope recogn, mani book resourc, caregiv stress improv, inform engag access, bring forth critic, critic point care, art sen...",0.006329
...,...,...,...,...
30657,B000RC9692,trombolin jack across way chilli wind shannon kansa citi railroad blue land lincoln waltz g bill blue come along jodi northern white cloud right right waltz c golden west frog lilypad lloyd loar front back smoki mountain schottisch old mountain rocki run,"[chilli wind shannon, right waltz c, trombolin jack across, bill blue come, come along jodi, shannon kansa citi, jack across way, citi railroad blue, way chilli wind, kansa citi railroad, white cloud right, c golden west, back smoki mountain, mountain rocki run, waltz g bill, across way chilli, ...",0.000000
30656,B000RC8FMG,koli shema mi ha ish ori veyishi betza bedami matai hayom lekha dodi mizmor ledavid al taster im zmirot amar hashem leyaakov karev yom yedid nefesh et ruhi tifdeh medley vehu yoshieni lemaan achai vereai zion zion,"[et ruhi tifdeh, lemaan achai vereai, ori veyishi betza, betza bedami matai, bedami matai hayom, im zmirot amar, al taster im, hashem leyaakov karev, matai hayom lekha, tifdeh medley vehu, lekha dodi mizmor, veyishi betza bedami, achai vereai zion, vereai zion zion, medley vehu yoshieni, yoshien...",0.000000
30655,B000RC8J7M,deryn pur ei di r deryn du ar fore dydd nadolig mordaith america dacw nghariad donald ym mhontypridd yr eneth glaf cariad cyntaf even prayer glomen hiraeth feirion yr eneth gadd ei gwrthod llangollen market cyfri r geifr,"[cariad cyntaf even, mhontypridd yr eneth, eneth gadd ei, nadolig mordaith america, pur ei di, gwrthod llangollen market, cyfri r geifr, deryn du ar, nghariad donald ym, glomen hiraeth feirion, yr eneth glaf, feirion yr eneth, yr eneth gadd, fore dydd nadolig, hiraeth feirion yr, ar fore dydd, d...",0.000000
30654,B000RCAF88,holli bear berri wren furz christma day morn appl tree wassail rafe waltz cutti wren le brandevin holli merri men arran ny sheeaghyn troailtagh dro nevez gower wassail count song round coal fire nou yole comen tapster drynker bring us good ale,"[nevez gower wassail, waltz cutti wren, wren furz christma, nou yole comen, dro nevez gower, rafe waltz cutti, round coal fire, song round coal, day morn appl, men arran ny, comen tapster drynker, holli merri men, morn appl tree, furz christma day, ny sheeaghyn troailtagh, yole comen tapster, wr...",0.000000


In [225]:
print("Similarity of items")
print(df_asin_description.iloc[:20].similarity)

Similarity of items
10251    0.010000
66260    0.009901
53       0.009615
58876    0.009174
54425    0.006329
4        0.000000
49073    0.000000
49085    0.000000
49084    0.000000
49083    0.000000
49082    0.000000
49081    0.000000
49079    0.000000
49077    0.000000
49075    0.000000
49070    0.000000
49090    0.000000
49069    0.000000
49068    0.000000
49065    0.000000
Name: similarity, dtype: float64


In [226]:
# preprocessing only for removing html, urls and hidden characters
# def html_url_hidden_chars(s):
#     stop_words = set(stopwords.words('english'))
#     stemmer= PorterStemmer()
#     if not s or s.isspace(): 
#         return ''
#     try:
#         # remove html tags 
#         strr = str(html.fromstring(s).text_content())
#         # remove URLs
#         strr = re.sub(r"(https|http|href)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b", ' ', strr)
#         # remove html hidden carachters 
#         strr = strr.replace('\n', ' ').replace('\t', ' ').replace("&nbsp", ' ').replace('\r', ' ')
#         return strr
#         # return str(html.fromstring(s).text_content(s))
#     except etree.ParserError: # I am not able to find out why the error occur so i continued by catching the exception. Seem to happen on some empty description strings 
        # return ''

In [227]:
# from preprocess_data import html_url_hidden_chars
df_similarity_scores.description = df_similarity_scores.description.apply(lambda x: html_url_hidden_chars(x))
# df_similarity_scores.iloc[:2].description
# (pd.merge(df1, df2, on='company')
df_1 = df_similarity_scores[["asin","description"]].copy()
df_2 = df_asin_description[["asin","similarity"]].copy()
similarity_df = pd.merge(df_1, df_2, on='asin')
# similarity_df = pd.merge(df_similarity_scores, df_asin_description, on='asin')
similarity_df.columns
# similarity_df.sort_values(by="similarity", ascending=False, inplace=True)
# similarity_df.head()

Index(['asin', 'description', 'similarity'], dtype='object')

In [228]:
similarity_df.head()
# similarity_df.sort_values(by="similarity", ascending=False, inplace=True)

Unnamed: 0,asin,description,similarity
0,1526146,1. Losing Game 2. I Can't Wait 3. Didn't He Shine 4. Never Seen...Righteous... 5. A Broken Heart 6. Looking Back 7. Here We Are 8. I Saw The Lord 9. Jesus Is A River Of Love 10. Hittin' The Road 11. I've Never Been Out Of... 12. Jesus Gotta Hold Of My Life 13. Saved- Saved- Saved 14. What Will Y...,0.0
1,159024684,.,0.0
2,382262921,"The Music Connection by Silver Burdett Ginn is a teaching aid for an elementary music or a homeroom teacher. Created by authorities in Music, The Music Connection: by Silver Burdett provides an excellent foundation for Music studies. Silver Burdetts style is suited towards Music studies,...",0.0
3,545069882,Spanish Before You Know It - Gold Edition. Learn Spanish in a Flash!,0.0
4,545109620,Just the CD. The Book has long since vanished. I great condition and it is a classic.,0.0


In [229]:
similarity_dfd = similarity_df.sort_values(by="similarity", ascending=False, inplace=False)
similarity_dfd.head()

Unnamed: 0,asin,description,similarity
6264,B00032N1V0,"The grand-daddy of all tap CDs! Short, repeated piano and orchestrated selections are designed to meet the needs for teaching tap technique and the building of routines. Notes contain technical exercises, tap fundamentals, terminology and graded routines. Double length CD. Notes available.",0.01
6263,B00032N1V0,"The grand-daddy of all tap CDs! Short, repeated piano and orchestrated selections are designed to meet the needs for teaching tap technique and the building of routines. Notes contain technical exercises, tap fundamentals, terminology and graded routines. Double length CD. Notes available.",0.01
26965,B00M8B98SO,"As a musical Companion to the book by best selling author Max Lucado, this collection is a wonderful backdrop for prayer and personal devotion. It offers songs chosen to perfectly complement the book. Also included are new songs by Darlene Zschech and Paul and Rita Baloche which were written spe...",0.009901
26,0829736522,"Combining styles such as rock and ska, Zona 7 releases a new production with songs filled with positive messages. Themes such as hope, and motivation, will touch the lives of young people in a life changing way, which is the vision and goal of the group. This album also includes a new version of...",0.009615
24544,B00B18ULGS,"livetune's song collection for 5 years featuring Hatsune Miku comes out. Also includes new songs (subject to change). Comes with a bonus DVD with unreleased music clips for their previous songs and four representative clips (subject to change). This edition is available only until May 21, 2013. ...",0.009174


In [230]:
similarity_dfd.iloc[:30].to_csv("similarity_results.csv", sep='\t')