In [1]:
import pandas as pd

# The Jaccard index is the ratio of the intersection of sets A and B to their union, i.e. the number of elements the two sets have in common divided by the number of distinct elements found in either set: J(A, B) = |A ∩ B| / |A ∪ B|.

In [2]:
# Load the table of product ids and their prices.
price = pd.read_csv('ids_and_prices.csv')

In [3]:
price.head()

Unnamed: 0,id,price
0,b000jz4hqo,0.0
1,b0006zf55o,0.0
2,b00004tkvy,0.0
3,b000g80lqo,599.99
4,b0006se5bq,99.99


In [4]:
# Load the product details (id, name, description, manufacturer) and preview the first rows.
detail = pd.read_csv('ids_and_details.csv')
detail.head()

Unnamed: 0,id,name,description,manufacturer
0,b000jz4hqo,clickart 950000 - premier image pack (dvd-rom),massive collection of images & fonts for all y...,
1,b00004tkvy,the beginners bible: noah's ark activity cente...,,
2,b000g80lqo,sage (ptree) - vernfp2007rt - premium accounti...,if you're like most nonprofit organizations yo...,
3,b0006se5bq,singing coach unlimited - electronic learning ...,learn to sing with the help of a patented real...,
4,b00021xhzw,adobe software 22070152 after effects 6.5 pbupgrd,adobe after effects pb 6.5 win upgrade.standar...,


# We have these two tables with data about different goods, and we have to assign categories to these goods (to cluster them), but all we have is the text of the description. So we can count the share of matching words in the descriptions. If two descriptions have a lot of words in common, we can say that those goods are from the same category.

In [5]:
detail.id.value_counts()

b00006jo4g    5
b00006sijr    4
b00004ochi    4
b000090179    4
b00006sijq    4
             ..
b000o27ti0    1
b000hcz9ac    1
b0009i5px0    1
b0000als2s    1
b000j588g4    1
Name: id, Length: 1113, dtype: int64

### If we need to merge two tables, it is better to drop duplicates first, because otherwise the number of rows may increase dramatically during the join.

In [6]:
# Keep a single row per id (the first one encountered); repeated ids are dropped.
detail = detail.drop_duplicates(subset='id')


In [7]:
detail.id.value_counts()

b000jz4hqo    1
b0001d56uu    1
b0002fgfiy    1
b000jx1on0    1
b000nknb26    1
             ..
b000ndibe6    1
b000dn7k8y    1
b000cs3s2m    1
b00005wf9v    1
b000j588g4    1
Name: id, Length: 1113, dtype: int64

In [8]:
# Replace missing descriptions with a single space so that .split() later yields
# no words instead of failing on NaN.
# The fill is done on the frame and assigned back: calling fillna(inplace=True) on
# the `detail.description` accessor is a chained in-place operation that updates a
# temporary Series and leaves `detail` unchanged when pandas Copy-on-Write is active.
detail = detail.fillna({'description': ' '})

In [9]:
detail.head()

Unnamed: 0,id,name,description,manufacturer
0,b000jz4hqo,clickart 950000 - premier image pack (dvd-rom),massive collection of images & fonts for all y...,
1,b00004tkvy,the beginners bible: noah's ark activity cente...,,
2,b000g80lqo,sage (ptree) - vernfp2007rt - premium accounti...,if you're like most nonprofit organizations yo...,
3,b0006se5bq,singing coach unlimited - electronic learning ...,learn to sing with the help of a patented real...,
4,b00021xhzw,adobe software 22070152 after effects 6.5 pbupgrd,adobe after effects pb 6.5 win upgrade.standar...,


In [10]:
# Inner join on `id`: only ids present in both tables are kept, and each
# detail row gains its price column.
both = pd.merge(detail, price, how='inner', on='id')
both.head()

Unnamed: 0,id,name,description,manufacturer,price
0,b000jz4hqo,clickart 950000 - premier image pack (dvd-rom),massive collection of images & fonts for all y...,,0.0
1,b00004tkvy,the beginners bible: noah's ark activity cente...,,,0.0
2,b000g80lqo,sage (ptree) - vernfp2007rt - premium accounti...,if you're like most nonprofit organizations yo...,,599.99
3,b0006se5bq,singing coach unlimited - electronic learning ...,learn to sing with the help of a patented real...,,99.99
4,b00021xhzw,adobe software 22070152 after effects 6.5 pbupgrd,adobe after effects pb 6.5 win upgrade.standar...,,499.99


In [11]:
detail.shape

(1113, 4)

In [12]:
both.shape

(1113, 5)

In [13]:
price.shape

(1363, 2)

# Searching for closest descriptions

For example, suppose we add a brand-new good to our table. How can we decide whether it is unique, i.e. not similar to the others?
We can, for example, count how many words in its description match words from the descriptions of the goods already in our table.
So, if the Jaccard-style score approaches 1, the descriptions are close, and if it approaches 0, the opposite is true.

In [36]:
both.loc[1030]

id                                                     b00006jo4g
name            punch software 38100 - punch! super home suite...
description     punch software 38100 : the premium home design...
manufacturer                                                  NaN
price                                                       49.99
Name: 1030, dtype: object

In [37]:
# Take the description of the row labelled 1030 as the text of the "new" good.
new_text = both.loc[1030, 'description']
new_text

'punch software 38100 : the premium home design software solution that lets you view your ideas in ways no other software can. 7 powerful programs in one plus ultimate deck that launch from the same easy-to-use interface including our patented ...'

In [45]:
# Unique whitespace-separated tokens of the new description. No punctuation is
# stripped, so tokens such as 'can.', ':' and '...' count as words too.
new_text_set = {word for word in new_text.split()}
new_text_set

{'...',
 '38100',
 '7',
 ':',
 'can.',
 'deck',
 'design',
 'easy-to-use',
 'from',
 'home',
 'ideas',
 'in',
 'including',
 'interface',
 'launch',
 'lets',
 'no',
 'one',
 'other',
 'our',
 'patented',
 'plus',
 'powerful',
 'premium',
 'programs',
 'punch',
 'same',
 'software',
 'solution',
 'that',
 'the',
 'ultimate',
 'view',
 'ways',
 'you',
 'your'}

In [47]:
# Materialise the unique words as a list. A set has no defined order, so the
# order in which the words are displayed carries no meaning.
list_new_text = list(new_text_set)
list_new_text

['can.',
 'our',
 'plus',
 'other',
 'patented',
 'the',
 'easy-to-use',
 'programs',
 ':',
 'ultimate',
 'no',
 'in',
 'including',
 'ways',
 'interface',
 'one',
 '38100',
 'lets',
 'home',
 'launch',
 'solution',
 '...',
 'punch',
 'design',
 'from',
 'ideas',
 'your',
 'you',
 'powerful',
 '7',
 'same',
 'view',
 'software',
 'deck',
 'premium',
 'that']

In [49]:
len(list_new_text)

36

AttributeError: 'Series' object has no attribute 'split'

## For each row in our dataframe `both` we will compute a new column, Similarity. We will process each description with the help of the APPLY feature:
1 we will get the set of unique words in the description = set(description.split())
2 then we will get the list of words that are common to the new good and the description = list(set(description.split()) & set(new_text.split()))
3 we will count the length of the intersection that we got = len(list(set(description.split()) & set(new_text.split())))
4 then we divide the number of words in the intersection by the number of unique words in new_text


In [57]:
def CountIntersection(row, new_text):
    """Return the share of the reference words that occur in a row's description.

    The score is ``len(description_words & reference_words) / len(reference_words)``.
    It is normalised by the reference text only, so it is not symmetric: it
    measures how much of ``new_text`` is covered by the row's description.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['description']``. The description is split on
        whitespace; a non-string value (such as NaN for a missing
        description) is treated as having no words.
    new_text : set, frozenset, str or other iterable of str
        The reference words. A string is split on whitespace; any other
        iterable is converted to a set of its items.

    Returns
    -------
    float
        A value between 0.0 and 1.0. Returns 0.0 when there are no reference
        words or the description is missing.
    """
    # Accept the raw text as well as a ready-made word collection; `set & str`
    # and `set & list` would otherwise raise TypeError.
    if isinstance(new_text, str):
        reference_words = set(new_text.split())
    elif isinstance(new_text, (set, frozenset)):
        reference_words = new_text
    else:
        reference_words = set(new_text)

    # Nothing to compare against: avoid dividing by zero.
    if not reference_words:
        return 0.0

    description = row['description']
    # A missing description arrives as NaN (a float), which has no .split().
    if not isinstance(description, str):
        return 0.0

    shared_words = reference_words & set(description.split())
    return len(shared_words) / len(reference_words)

In [59]:
# Score every row against the new good's word set; axis=1 hands each row to the function.
both['Similarity'] = both.apply(
    lambda row: CountIntersection(row, new_text_set),
    axis=1,
)

In [60]:
both.head()

Unnamed: 0,id,name,description,manufacturer,price,Similarity
0,b000jz4hqo,clickart 950000 - premier image pack (dvd-rom),massive collection of images & fonts for all y...,,0.0,0.138889
1,b00004tkvy,the beginners bible: noah's ark activity cente...,,,0.0,0.0
2,b000g80lqo,sage (ptree) - vernfp2007rt - premium accounti...,if you're like most nonprofit organizations yo...,,599.99,0.138889
3,b0006se5bq,singing coach unlimited - electronic learning ...,learn to sing with the help of a patented real...,,99.99,0.111111
4,b00021xhzw,adobe software 22070152 after effects 6.5 pbupgrd,adobe after effects pb 6.5 win upgrade.standar...,,499.99,0.083333


## Let's find the closest descriptions

In [67]:

both.sort_values('Similarity', ascending=False).head(20)


Unnamed: 0,id,name,description,manufacturer,price,Similarity
1030,b00006jo4g,punch software 38100 - punch! super home suite...,punch software 38100 : the premium home design...,,49.99,1.0
1042,b000063v0z,punch software 24100 - punch! 5 in 1 home desi...,punch software 24100 : power tools for serious...,,39.99,0.638889
1019,b000067fk7,punch software 35100 - punch! home design suit...,punch software 35100 : punch! professional hom...,,69.99,0.472222
947,b00004ochi,punch software 14100 punch! professional home ...,12 powerful programs in one gives you total co...,punch software,89.99,0.388889
768,b000fowht8,encore software 10485 - encore printmaster v.1...,encore software 10485 : if you've got a passio...,,39.99,0.277778
805,b000fp0k0u,encore software 10478 - encore printmaster v.1...,encore software 10478 : if you've got a passio...,,30.66,0.277778
720,b000i4rmfe,ahead software 70115 - nero 7 ultra edition en...,ahead software 70115 : nero 7 ultra edition en...,,99.99,0.277778
1045,b000e65hki,punch software 84100 - punch! home design arch...,punch software 84100 : punch! home design arch...,,199.99,0.277778
639,b00004ochj,punch software 13100 - punch! ultimate deck & ...,punch software 13100 : punch! ultimate deck & ...,,29.0,0.277778
1039,b00007g3ai,punch software 25100 punch! master landscape a...,punch! master landscape & home design offers a...,punch software,99.99,0.25


So we can put the first six rows into one cluster and take the mean price of the five rows other than the good itself (which is the first row).


In [75]:
# Rank all goods from most to least similar to the new description.
price_for_a_new_one = both.sort_values('Similarity', ascending=False)
# Exclude the good the description was taken from (index label 1030) by label
# rather than assuming it is sorted first: if another row also scored 1.0, a
# positional slice could include the good's own price in the mean.
# Then average the prices of the five most similar remaining goods.
price_for_a_new_one.drop(index=1030).head(5).price.mean()

54.124