In [3]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import sparse_dot_topn.sparse_dot_topn as ct
from scipy.sparse import csr_matrix
import time

In [4]:
df = pd.read_csv('final_result_2.csv')
df.shape

(955721, 4)

In [5]:
# get unique values are unicode

names = df['dish_name'].unique().astype('U')
print(names)
print(len(names))

['Dưa lưới Đài Loan - 1kg' 'Quýt Úc - 1kg' 'Táo Rockit - ống' ...
 'Bún gián' 'Bia cốc' 'Bia ca']
188629


Dữ liệu gồm 188629 sản phẩm, ta sẽ cần convert edit bỏ những ký tự đặc biệt trong những tên sản phẩm này

In [6]:
# tạo function extract only character và tách string ra n-grams

def ngrams(string):
    string = re.sub(r"(([$&+,:;=?@#|ơ'<>.^*()%!-]|\d)+?)",r'', string)
    ngrams = zip(*[string[i:] for i in range(3)])  # thay đổi number để tạo n-gram, ở đây là 3
    return [''.join(ngram) for ngram in ngrams]

In [7]:
ngrams("HỒNG TRÀ SỮA TRÂN CHÂU TRẮNG size nhỏ")

['HỒN',
 'ỒNG',
 'NG ',
 'G T',
 ' TR',
 'TRÀ',
 'RÀ ',
 'À S',
 ' SỮ',
 'SỮA',
 'ỮA ',
 'A T',
 ' TR',
 'TRÂ',
 'RÂN',
 'ÂN ',
 'N C',
 ' CH',
 'CHÂ',
 'HÂU',
 'ÂU ',
 'U T',
 ' TR',
 'TRẮ',
 'RẮN',
 'ẮNG',
 'NG ',
 'G s',
 ' si',
 'siz',
 'ize',
 'ze ',
 'e n',
 ' nh',
 'nhỏ']

In [8]:
# past ngram function để tạo TF-IDF vector
vectorizer = TfidfVectorizer(analyzer=ngrams)

# Build matrix
tfidf_matrix = vectorizer.fit_transform(names)

In [62]:
print(tfidf_matrix[[10,10]])

  (0, 14666)	0.2072466615421564
  (0, 21448)	0.2353809556715529
  (0, 9818)	0.348898746510453
  (0, 1218)	0.30776489273444607
  (0, 13774)	0.26321744665474067
  (0, 29167)	0.2044880816785505
  (0, 23630)	0.19974504869112614
  (0, 9947)	0.22474833086060847
  (0, 1243)	0.17576140938611598
  (0, 19333)	0.22261396462399113
  (0, 25264)	0.21596572750224136
  (0, 6899)	0.20540866102282007
  (0, 675)	0.18992524166019195
  (0, 25794)	0.313516864937289
  (0, 8054)	0.28444736988486863
  (0, 2116)	0.2247923865535271
  (0, 51)	0.220447401746309
  (1, 14666)	0.2072466615421564
  (1, 21448)	0.2353809556715529
  (1, 9818)	0.348898746510453
  (1, 1218)	0.30776489273444607
  (1, 13774)	0.26321744665474067
  (1, 29167)	0.2044880816785505
  (1, 23630)	0.19974504869112614
  (1, 9947)	0.22474833086060847
  (1, 1243)	0.17576140938611598
  (1, 19333)	0.22261396462399113
  (1, 25264)	0.21596572750224136
  (1, 6899)	0.20540866102282007
  (1, 675)	0.18992524166019195
  (1, 25794)	0.313516864937289
  (1, 8054)	0

In [9]:
# tạo function để đo cosine similarity giữa các strings

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    return csr_matrix((data,indices,indptr),shape=(M,N))

In [10]:
#  Top 10 with similarity above 0.8
t1 = time.time()
matches = awesome_cossim_top(tfidf_matrix, tfidf_matrix.transpose(), 10, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 75.68366146087646


In [11]:
# tạo coordinate matrix
coo_matrix = matches.tocoo()

In [16]:
type(coo_matrix)

scipy.sparse.coo.coo_matrix

In [66]:
# tạo function để nhóm các tên có cosine similaity cao

group_lookup = {}

def find_group(row, col):
    # If either the row or the col string have already been given
    # a group, return that group. Otherwise return none
    if row in group_lookup:
        return group_lookup[row]
    elif col in group_lookup:
        return group_lookup[col]
    else:
        return None


def add_vals_to_lookup(group, row, col):
    # Once we know the group name, set it as the value
    # for both strings in the group_lookup
    group_lookup[row] = group
    group_lookup[col] = group


def add_pair_to_lookup(row, col):
    # in this function we'll add both the row and the col to the lookup
    group = find_group(row, col)  # first, see if one has already been added
    if group is not None:
        # if we already know the group, make sure both row and col are in lookup
        add_vals_to_lookup(group, row, col)
    else:
        # if we get here, we need to add a new group.
        # The name is arbitrary, so just make it the row
        add_vals_to_lookup(row, row, col)
        
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        add_pair_to_lookup(names[row], names[col])

In [72]:
df['Group'] = df['dish_name'].map(group_lookup).fillna(df['dish_name'])

In [73]:
df_grouped = df.pivot_table(index='Group',
                            columns='category_name',
                            aggfunc='size', fill_value=0)


In [74]:
pd.set_option('display.max_rows', 600000)
df_grouped.head(500)

category_name,beefsteak - bò né,bia,burger,bánh bao,bánh cuốn / bánh ướt,bánh huế,bánh kem,bánh mì,bánh tráng,bánh xèo / bánh khọt,...,sữa chua,thức uống lên men,trà,trà sữa,trái cây,tôm,vịt,waffle,xiên que / viên chiên,xôi
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bò kho,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Mì Ốp la + xúc xích,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Combo 3 cơm gà ta quay 1/4 con,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Khóm + ổi,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Nghêu đút lò với phô mai, thịt hun khoái, hành - Vongle cappuccino",0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sữa chua Dầm Coffee,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
xoài keo muối ớt 1 trái,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ép Dâu dưa lới,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
"1 Đĩa Ếch Xào xả, Chuối,hoặc Cà (Tuỳ Quán)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_grouped.shape

In [None]:
df_grouped['cat']=df_grouped.iloc[:,1:66].idxmax(axis=1)

In [None]:
df_grouped.head(500)

In [75]:
df.to_csv('./cat_edit.csv')

In [None]:
# unpacks the resulting sparse matrix
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [None]:
# store the  matches into new dataframe called matched_df and 
# printing 10 samples
matches_df = get_matches_df(matches, names, top=200)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # For removing all exact matches
matches_df.sample(10)