## Euclidean Distance

In [1]:
contents = ['This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
'Imaging databases can get huge.',
'Most imaging databases safe images permanently.',
'Imaging databases store images.',
'Imaging databases store images. Imaging databases store images. Imaging databases store images.']

In [10]:
corpus = [w.lower() for w in contents]

In [8]:
corpus

['This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
 'Imaging databases can get huge.',
 'Most imaging databases safe images permanently.',
 'Imaging databases store images.',
 'Imaging databases store images. Imaging databases store images. Imaging databases store images.']

In [12]:
corpus = []
for doc in contents:
    corpus.append(doc.lower())

In [22]:
#corpus = [doc for doc in contents]

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [32]:
X

<5x24 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [34]:
X[0].toarray().flatten()

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 1])

In [21]:
X[1].toarray().flatten()

array([0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [25]:
import math
math.sqrt(sum((X[0].toarray().flatten()-X[1].toarray().flatten())**2))

4.358898943540674

In [35]:
import scipy as sp

In [36]:
def dist_raw(v1,v2):
    delta = v1- v2
    return sp.linalg.norm(delta.toarray())

In [37]:
X[0]

<1x24 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [38]:
dist_raw(X[0],X[1])

4.358898943540674

In [37]:
from sklearn.metrics.pairwise import euclidean_distances
euclidean_distances(X)

  return f(*args, **kwds)


array([[0.        , 4.35889894, 4.47213595, 4.24264069, 7.07106781],
       [4.35889894, 0.        , 2.64575131, 2.23606798, 5.38516481],
       [4.47213595, 2.64575131, 0.        , 2.        , 4.89897949],
       [4.24264069, 2.23606798, 2.        , 0.        , 4.        ],
       [7.07106781, 5.38516481, 4.89897949, 4.        , 0.        ]])

## Jaccard Similarity

In [40]:
import numpy as np
a = np.array([3,3,1,0,0,2])
(a > 0).astype(int)

array([1, 1, 1, 0, 0, 1])

In [41]:
np.asarray(a,np.bool)

array([ True,  True,  True, False, False,  True])

In [43]:
def jaccard(v1,v2):
    v1 = np.asarray(v1, np.bool)
    v2 = np.asarray(v2, np.bool)
    return np.double(np.bitwise_and(v1, v2).sum()) / np.double(np.bitwise_or(v1, v2).sum())

In [45]:
jaccard(X[3].toarray().flatten(), X[4].toarray().flatten())

1.0

In [49]:
from sklearn.metrics import jaccard_score 

In [51]:
v1 = np.asarray(X[3].toarray().flatten(), np.bool)
v2 = np.asarray(X[4].toarray().flatten(), np.bool)
jaccard_score(v1,v2)

1.0

In [53]:
n = X.shape[0]

In [54]:
m = np.zeros((n,n))

In [59]:
for i in range(n):
    for j in range(n):
        v1 = np.asarray(X[i].toarray().flatten(), np.bool)
        v2 = np.asarray(X[j].toarray().flatten(), np.bool)
        m[i,j] = jaccard_score(v1,v2)

In [60]:
m

array([[1.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 1.        , 0.22222222, 0.28571429, 0.28571429],
       [0.        , 0.22222222, 1.        , 0.42857143, 0.42857143],
       [0.        , 0.28571429, 0.42857143, 1.        , 1.        ],
       [0.        , 0.28571429, 0.42857143, 1.        , 1.        ]])

## Cosine Slimilarity

In [63]:
import math
def cos_similarity(v1,v2):
    return np.dot(v1,v2) / math.sqrt((v1 ** 2)).sum() * math.sqrt((v2 ** 2)).sum()

In [64]:
cos_similarity(X[0].toarray().flatten(), X[1].toarray().flatten())

TypeError: only size-1 arrays can be converted to Python scalars

In [66]:
from sklearn.metrics.pairwise import cosine_similarity

In [67]:
cosine_similarity(X)

array([[1.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 1.        , 0.36514837, 0.4472136 , 0.4472136 ],
       [0.        , 0.36514837, 1.        , 0.61237244, 0.61237244],
       [0.        , 0.4472136 , 0.61237244, 1.        , 1.        ],
       [0.        , 0.4472136 , 0.61237244, 1.        , 1.        ]])

In [69]:
from sklearn.metrics.pairwise import cosine_distances

In [70]:
cosine_distances(X)

array([[0.        , 1.        , 1.        , 1.        , 1.        ],
       [1.        , 0.        , 0.63485163, 0.5527864 , 0.5527864 ],
       [1.        , 0.63485163, 0.        , 0.38762756, 0.38762756],
       [1.        , 0.5527864 , 0.38762756, 0.        , 0.        ],
       [1.        , 0.5527864 , 0.38762756, 0.        , 0.        ]])

In [71]:
#distance = 1 - similarity
#similarity = 1 - distance

## TFIDF

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)


In [75]:
X.toarray()

array([[0.26726124, 0.26726124, 0.        , 0.26726124, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.26726124,
        0.26726124, 0.26726124, 0.26726124, 0.26726124, 0.        ,
        0.26726124, 0.26726124, 0.        , 0.26726124, 0.        ,
        0.        , 0.26726124, 0.26726124, 0.26726124],
       [0.        , 0.        , 0.52451722, 0.        , 0.29550385,
        0.52451722, 0.52451722, 0.        , 0.29550385, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.27880274,
        0.        , 0.        , 0.33142212, 0.27880274, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.49487286,
        0.        , 0.        , 0.49487286, 0.        , 0.49487286,
        0.        , 0.        , 0.        , 0.        ],
       [0.   

In [76]:
from sklearn.metrics.pairwise import cosine_similarity

In [77]:
cosine_distances(X)

array([[0.        , 1.        , 1.        , 1.        , 1.        ],
       [1.        , 0.        , 0.83522543, 0.74716148, 0.74716148],
       [1.        , 0.83522543, 0.        , 0.59290619, 0.59290619],
       [1.        , 0.74716148, 0.59290619, 0.        , 0.        ],
       [1.        , 0.74716148, 0.59290619, 0.        , 0.        ]])

## 蝦皮爬蟲

In [81]:
import requests
import pandas as pd

  return f(*args, **kwds)


In [83]:
df = pd.read_csv('https://raw.githubusercontent.com/ywchiu/tibame_tm/master/data/shopee_laptop.csv',index_col=0)

In [84]:
df

Unnamed: 0,name
0,APPLE MacBook Air系列 i3 雙核心/8G/256G/銀/金/灰 13吋筆電...
1,APPLE MacBook Air系列 i5/8G/128G/銀/金/灰 13吋筆電 201...
2,華碩 ASUS TUF Gaming FA506IU 15.6吋 144Hz 電競筆電 R7...
3,ASUS 華碩 Laptop 14 X409 X409MA-0061GN4100 14吋 灰...
4,Microsoft微軟 Surface Go 2–4425Y/8G/128G/10.5吋平板...
...,...
245,清倉【LENOVO】700 (i5-6300HQ4G1TBGTX950M15吋FHD)出清1...
246,免運費 四核心電腦主機 4G記憶體 英雄聯盟 文書 上網 天堂 LOL CS SF 魔獸世...
247,Macbook Pro Air 2013 2014 2015 SSD 轉 USB 3.0 外接盒
248,%限時促銷%全新華碩 U303L UX303 UX303L UX303LN UX305 UX...


In [None]:
products = []
for item in res.json()['items'] :
    products.append({'name':item['name']}) df = pandas.DataFrame(products)
    df.head()

In [None]:
for idx,product in df.iterrows():