# Group similar search terms

## Parameters

In [1]:
from pathlib import Path

# Folder that holds the input CSV; the exported files are written here as well.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
BASE_DIR = Path('/Users/efraflores/Desktop/EF/Corner/Catalog/Search_without_results')
# Name of the CSV file (inside BASE_DIR) that is loaded into the dataframe.
FILE_NAME = 'search_wr.csv'
# Column that holds the search terms to be grouped.
COL_NAME = 'query'

## Import

In [2]:
import pandas as pd

# Load the raw searches, discard rows with any missing value and renumber the index.
df = pd.read_csv(BASE_DIR.joinpath(FILE_NAME)).dropna().reset_index(drop=True)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which emitted a stray "None").
df.info()
# Show one random row as a quick sanity check of the loaded columns.
df.sample()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511703 entries, 0 to 511702
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   monday_week  511703 non-null  object
 1   city         511703 non-null  object
 2   zone         511703 non-null  object
 3   store_id     511703 non-null  int64 
 4   store        511703 non-null  object
 5   branch_id    511703 non-null  int64 
 6   branch       511703 non-null  object
 7   user_id      511703 non-null  int64 
 8   query        511703 non-null  object
dtypes: int64(3), object(6)
memory usage: 35.1+ MB
None


Unnamed: 0,monday_week,city,zone,store_id,store,branch_id,branch,user_id,query
479094,2021-08-09,Tijuana,Tijuana,10710,Costco Farmacia,29720,Tijuana Río (Tijuana II),18665997,tiburon


## Functions

### Clean text

In [3]:
import re
import unicodedata

def clean_text(text, pattern=r"[^a-zA-Z\s]", lower=False, unique=False):
    """Strip accents and unwanted characters from ``text`` and normalise spacing.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.
    pattern : str
        Regular expression whose matches are replaced by a single space.
        The default keeps only ASCII letters and whitespace.
    lower : bool
        When True the result is lower-cased.
    unique : bool
        When True repeated words are removed, keeping the first occurrence
        of each word so that the output order is deterministic.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Newlines are removed outright (not replaced by a space), so two words
    # separated only by a newline end up fused together.
    without_newlines = str(text).replace('\n', '')
    # NFD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then drops the marks and any other non-ASCII character.
    ascii_text = unicodedata.normalize('NFD', without_newlines).encode('ascii', 'ignore').decode('utf-8')
    cleaned_text = re.sub(pattern, ' ', ascii_text, flags=re.UNICODE)
    if lower:
        cleaned_text = cleaned_text.lower()
    words = cleaned_text.split()
    if unique:
        # dict.fromkeys de-duplicates while preserving first-seen order
        # (a plain set would yield an arbitrary order).
        words = list(dict.fromkeys(words))
    return ' '.join(words)

### Similar

In [4]:
from difflib import get_close_matches
from sklearn.neighbors import NearestNeighbors

def similar(data, max_dist=1e3, n_neighbors=5, **kwargs):
    """Pair every row label of ``data`` with the textually similar labels among its nearest rows.

    The rows of ``data`` are treated as feature vectors. For each row the
    ``n_neighbors`` nearest rows are retrieved, those farther than
    ``max_dist`` are discarded, and the labels of the remaining rows are
    filtered with ``difflib.get_close_matches`` against the row's own label.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature matrix whose index holds the labels to compare.
        Assumes a single-level index of unique strings — TODO confirm for
        other callers.
    max_dist : float
        Largest neighbour distance that is still considered.
    n_neighbors : int
        Number of nearest rows requested per row. The first hit is dropped
        as the row itself, so at most ``n_neighbors - 1`` candidates remain.
        The value is capped at the number of rows in ``data``.
    **kwargs
        Forwarded to ``difflib.get_close_matches`` (e.g. ``n``, ``cutoff``).

    Returns
    -------
    pandas.DataFrame
        Two columns: the label (named after ``data.index.name``, or
        ``'index'`` when the index is unnamed) and ``'neighbor'``, sorted by
        both columns. Empty when nothing matches.
    """
    labels = data.index.to_numpy()
    label_name = data.index.name if data.index.name is not None else 'index'
    empty = pd.DataFrame(columns=[label_name, 'neighbor'])

    # kneighbors cannot return more neighbours than there are rows, and at
    # least two rows are needed to have a neighbour other than the row itself.
    k = min(n_neighbors, len(data))
    if k < 2:
        return empty

    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', n_jobs=-1)
    nbrs.fit(data)
    distances, indices = nbrs.kneighbors(data)

    # The first column is the closest hit, taken to be the row itself.
    # NOTE(review): with duplicated feature rows the first hit may be a
    # different row — confirm inputs are de-duplicated.
    distances, indices = distances[:, 1:], indices[:, 1:]
    within_range = distances <= max_dist

    # Matches are kept as (rank, label, match) tuples rather than being
    # joined into a delimited string, so labels containing commas survive.
    records = []
    for label, row_positions, row_mask in zip(labels, indices, within_range):
        candidates = labels[row_positions[row_mask]].tolist()
        if not candidates:
            continue
        for rank, match in enumerate(get_close_matches(label, candidates, **kwargs)):
            records.append((rank, label, match))

    if not records:
        return empty

    # Number the rows rank-major (all best matches first, then second-best,
    # and so on); list.sort is stable, so rows keep their input order within
    # a rank. The final value sort below keeps these index labels.
    records.sort(key=lambda record: record[0])
    melted = pd.DataFrame(
        [(label, match) for _, label, match in records],
        columns=[label_name, 'neighbor'],
    )
    return melted.sort_values([label_name, 'neighbor'])

## Transform

### Date variables

In [5]:
# Parse the week-start column into real timestamps.
df['monday_week'] = pd.to_datetime(df['monday_week'])
# Take year and week number from the same ISO calendar. Mixing the calendar
# year (.dt.year) with the ISO week mislabels weeks that straddle a year
# boundary (e.g. a Monday on Dec 30 belongs to ISO week 1 of the next year).
iso_calendar = df['monday_week'].dt.isocalendar()
df['year'] = iso_calendar['year']
df['week'] = iso_calendar['week']
df.head()

Unnamed: 0,monday_week,city,zone,store_id,store,branch_id,branch,user_id,query,year,week
0,2021-07-19,Ciudad de México,Aragón,22,Chedraui,3087,Chedraui Tepeyac,1731928,salsa de ostion kiko,2021,29
1,2021-07-19,Ciudad de México,Aragón,22,Chedraui,3087,Chedraui Tepeyac,1731928,salsa de ostion kikoman,2021,29
2,2021-07-19,Monterrey,Suroeste,25,HEB,1490,eFC Aaron Saenz,69587,mini conchas,2021,29
3,2021-07-19,Monterrey,Suroeste,3729,Soriana,10003,Cumbres,3283006,ropa interior para caballero,2021,29
4,2021-07-19,Ciudad de México,Condesa,9,City Market,58,Pilares,11310037,cero humedad,2021,29


### Queries with more than two users

In [6]:
# Count the distinct users behind every query and keep only the queries
# searched by more than two of them.
mto = (
    df.groupby(COL_NAME)['user_id']
    .nunique()
    .reset_index()
    .rename({'user_id':'users_count'}, axis=1)
    .query('users_count > 2')
)
print(mto['users_count'].describe())
mto.shape

count    28042.000000
mean         9.227052
std         20.210311
min          3.000000
25%          3.000000
50%          5.000000
75%          9.000000
max       1491.000000
Name: users_count, dtype: float64


(28042, 2)

In [7]:
# Inner join: keeps only the searches whose query passed the user-count
# filter and attaches that count to every row.
# The join key comes from COL_NAME (as in the other cells) instead of a
# hard-coded column name, and a pre-existing 'users_count' column is dropped
# first so that re-running the cell does not create suffixed duplicates.
df = df.drop(columns='users_count', errors='ignore').merge(mto, on=COL_NAME, how='inner')
print(df.shape)
df.sample()

(293845, 12)


Unnamed: 0,monday_week,city,zone,store_id,store,branch_id,branch,user_id,query,year,week,users_count
19662,2021-07-19,Chihuahua,Chihuahua,2712,Extra,25383,Chapultepec,11276914,electrolit,2021,29,252


### CountVectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# One entry per distinct query, in order of first appearance.
to_vect = df[COL_NAME].drop_duplicates()

# Character-level bag of words; the vocabulary is learnt from the cleaned,
# lower-cased queries.
cv_char = CountVectorizer(analyzer='char')
cv_char.fit(to_vect.astype(str).apply(clean_text).str.lower())

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(cv_char, 'get_feature_names_out'):
    feature_names = cv_char.get_feature_names_out()
else:
    feature_names = cv_char.get_feature_names()

# Only the distinct queries are vectorised: repeated rows would produce
# identical vectors that drop_duplicates() discards anyway, keeping the first.
# NOTE(review): the vocabulary is fitted on cleaned text while the raw queries
# are transformed here, so characters outside the vocabulary (digits, accented
# letters) presumably do not contribute to the counts — confirm this is intended.
by_char = pd.DataFrame(
    cv_char.transform(to_vect).toarray(),
    columns=feature_names,
    index=pd.Index(to_vect.to_numpy(), name=COL_NAME),
)
# Queries whose character counts are identical collapse into the first one seen.
by_char = by_char.drop_duplicates()
# Keep only the queries with more than two counted characters.
by_char = by_char[by_char.sum(axis=1)>2].copy()
print(by_char.shape)
by_char.sample()

(26144, 27)


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d,e,f,g,h,i,...,q,r,s,t,u,v,w,x,y,z
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
color corrector,1,0,0,3,0,1,0,0,0,0,...,0,4,0,1,0,0,0,0,0,0


### Similar

In [9]:
# max_dist bounds the neighbour distance in character-count space;
# cutoff and n are passed through to difflib.get_close_matches.
similar_params = dict(max_dist=3, cutoff=0.7, n=10)
sim = similar(by_char, **similar_params)
print(sim.shape)
sim.head()

(33410, 2)


Unnamed: 0,query,neighbor
29274,3 carrots,carritos
21348,3 carrots,carros
5860,3 carrots,carrot
15871,3 tenchas,3tenchas
14879,3tenchas,3 tenchas


### Attach user counts to each pair

In [10]:
# First merge: joins on the column(s) shared by both frames and adds the
# user count of the query itself.
query_counts = sim.merge(mto)
# Second merge: adds the user count of the neighbour, suffixed with '_neigh'.
with_count = query_counts.merge(
    mto,
    left_on='neighbor',
    right_on='query',
    suffixes=('','_neigh'),
)
# The neighbour's own query column duplicates 'neighbor', so it is removed.
with_count = with_count.drop('query_neigh', axis=1)
with_count.head()

Unnamed: 0,query,neighbor,users_count,users_count_neigh
0,3 carrots,carritos,9,6
1,carrito,carritos,4,6
2,coriatros,carritos,3,6
3,3 carrots,carros,9,10
4,cugarros,carros,9,10


### Put the most used term first

In [11]:
# For every (query, neighbor) pair put the term with strictly more users
# first; on a tie the neighbor goes first. The third element is the larger
# of the two user counts.
aux = [
    (term, term_users_neighbor, term_users)
    if term_users > neighbor_users
    else (term_users_neighbor, term, neighbor_users)
    for term, term_users, term_users_neighbor, neighbor_users in zip(
        with_count[COL_NAME],
        with_count['users_count'],
        with_count['neighbor'],
        with_count['users_count_neigh'],
    )
]

# Most used terms first, exact duplicates removed, helper count column dropped.
ranked_pairs = pd.DataFrame(aux, columns=[COL_NAME,'neigh','count'])
ranked_pairs = ranked_pairs.sort_values(['count',COL_NAME], ascending=False)
ranked_pairs = ranked_pairs.drop_duplicates()
to_omit = ranked_pairs.iloc[:,:-1].reset_index(drop=True)
to_omit.head()

Unnamed: 0,query,neigh
0,cigarros,cigarr9s
1,cigarros,cigaros
2,cigarros,cigarris
3,cigarros,cigarro
4,cigarros,cigarroa


### Neighbors dict

In [12]:
# Map every less-used term to the term that replaces it. Rows are visited in
# to_omit order and the first replacement seen for a term wins.
omit_dict = {}
for replacement, term in to_omit.itertuples(index=False, name=None):
    omit_dict.setdefault(term, replacement)

In [13]:
# Replace each query by its mapped term; queries without a mapping stay as they are.
df['grouped'] = df['query'].map(lambda term: omit_dict.get(term, term))

# Distinct terms before and after grouping, and the ratio between them.
n_queries = len(df['query'].unique())
n_grouped = len(df['grouped'].unique())
print(n_queries,'\n', n_grouped,'\n', n_grouped/n_queries)

28042 
 17447 
 0.6221738820340917


## Export

In [14]:
# Display four random rows as a last visual check before exporting
# (no random_state is set, so the rows differ on every run).
df.sample(4)

Unnamed: 0,monday_week,city,zone,store_id,store,branch_id,branch,user_id,query,year,week,users_count,grouped
267319,2021-08-02,Tijuana,Tijuana,3729,Soriana,21780,Carrusel,12681997,palomilla,2021,31,5,palillos
259969,2021-07-19,Monterrey,Suroeste,25,HEB,201,Valle alto,1290580,morrón semaforo,2021,29,4,morrón semaforo
41117,2021-07-26,Guadalajara,Guadalajara,3729,Soriana,10523,Forum Tlaquepaque,1625541,chinchetas,2021,30,7,chinchetas
192856,2021-08-09,Monterrey,Suroeste,3980,El más pollo,11642,Suc. Madrid,5058343,tenders,2021,32,17,tender


In [15]:
# Both files share the same format: tab-separated, UTF-16, no index column.
export_options = dict(index=False, sep='\t', encoding='utf-16')
# Term -> replacement pairs.
to_omit.to_csv(BASE_DIR / 'neighbors.csv', **export_options)
# Full search log including the 'grouped' column.
df.to_csv(BASE_DIR / 'grouped.csv', **export_options)