# Group similar search terms

## Parameters

In [1]:
from pathlib import Path

BASE_DIR = Path('/Users/efraflores/Desktop/EF/Corner/Catalog/Search_without_results')
FILE_NAME = 'search_wr.csv'
COL_NAME = 'query'

## Import

In [2]:
import pandas as pd

df = pd.read_csv(BASE_DIR.joinpath(FILE_NAME)).dropna().reset_index(drop=True)
print(df.info())
df.sample()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511703 entries, 0 to 511702
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   monday_week  511703 non-null  object
 1   city         511703 non-null  object
 2   zone         511703 non-null  object
 3   store_id     511703 non-null  int64 
 4   store        511703 non-null  object
 5   branch_id    511703 non-null  int64 
 6   branch       511703 non-null  object
 7   user_id      511703 non-null  int64 
 8   query        511703 non-null  object
dtypes: int64(3), object(6)
memory usage: 35.1+ MB
None


Unnamed: 0,monday_week,city,zone,store_id,store,branch_id,branch,user_id,query
379412,2021-08-09,Cancún,Cancún,22,Chedraui,849,Chacmool,1688365,queso manchego reb


## Functions

### Clean text

In [3]:
import re
import unicodedata

def clean_text(text, pattern="[^a-zA-Z\s]", lower=False, unique=False):
    cleaned_text = unicodedata.normalize('NFD',str(text).replace('\n','')).encode('ascii', 'ignore')
    cleaned_text = re.sub(pattern,' ',cleaned_text.decode('utf-8'),flags=re.UNICODE)
    cleaned_text = [word for word in (cleaned_text.lower().split() if lower else cleaned_text.split())]
    return ' '.join((set(cleaned_text) if unique else cleaned_text))

### Similar

In [4]:
from difflib import get_close_matches
from sklearn.neighbors import NearestNeighbors

def similar(data, max_dist=1e3, n_neighbors=5, **kwargs):
    
    df = data.copy()
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree', n_jobs=-1)
    nbrs.fit(df)
    distances, indices = nbrs.kneighbors(df)

    dist_cols = ['dist_'+str(x) for x in range(n_neighbors)]
    distances = pd.DataFrame(distances,
                             index=df.index,
                             columns=dist_cols).iloc[:,1:]

    distances = distances[distances['dist_1'] <= max_dist].copy()
    
    indices_cols = ['indice_'+str(x) for x in range(n_neighbors)]
    indices = pd.DataFrame(indices,
                           index=df.index,
                           columns=indices_cols).iloc[:,1:]
    
    indices.replace(dict(enumerate(df.index)), inplace=True)
    
    neighbors = distances.join(indices)
    
    total_neighbors = []
    for row in neighbors.index:
        aux = []
        for dist_col,id_col in zip(dist_cols[1:],indices_cols[1:]):
            if neighbors.loc[row,dist_col] <= max_dist:
                aux.append(neighbors.loc[row,id_col])
        total_neighbors.append(aux)
    
    neighbors['neighbors'] = total_neighbors

    sim = neighbors['neighbors'].reset_index()
    sim['similar'] = sim.apply(lambda x: ','.join(get_close_matches(x[0],x[1], **kwargs)), axis=1)
    sim = sim[sim['similar'].map(len)>0].copy()
    sim = sim.set_index(sim.columns[0]).iloc[:,-1:]

    melted = sim['similar'].str.split(',', expand=True).melt(ignore_index=False).dropna()
    melted = melted.iloc[:,-1:].rename({'value':'neighbor'}, axis=1).reset_index()
    melted = melted.sort_values(melted.columns.tolist())
    return melted

## Transform

### Queries with more than one user

In [5]:
import numpy as np

mto = df.groupby(COL_NAME)['user_id'].nunique().reset_index()
mto = mto[mto['user_id']>np.percentile(mto['user_id'], 95)].rename({'user_id':'users_count'}, axis=1)
print(mto['users_count'].describe())
mto.shape

count    9956.000000
mean       18.860386
std        31.696622
min         7.000000
25%         8.000000
50%        12.000000
75%        19.000000
max      1491.000000
Name: users_count, dtype: float64


(9956, 2)

In [6]:
df = df.merge(mto, on='query')
print(df.shape)
df.sample()

(215149, 10)


Unnamed: 0,monday_week,city,zone,store_id,store,branch_id,branch,user_id,query,users_count
59627,2021-08-09,Cuernavaca,Cuernavaca,7,Costco,26358,Cuernavaca,16618911,mochi,251


### CountVectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

to_vect = df[COL_NAME].drop_duplicates()
cv_char = CountVectorizer(analyzer='char')
cv_char.fit(to_vect.astype(str).apply(clean_text).str.lower())
by_char = pd.DataFrame(cv_char.transform(df[COL_NAME]).todense(), columns=cv_char.get_feature_names(), index=df.index)
by_char = by_char.join(df[[COL_NAME]]).set_index(COL_NAME).drop_duplicates()
by_char = by_char[by_char.sum(axis=1)>2].copy()
print(by_char.shape)
by_char.sample()

(9532, 27)


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d,e,f,g,h,i,...,q,r,s,t,u,v,w,x,y,z
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mantequ,0,1,0,0,0,1,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0


### Similar

In [8]:
sim = similar(by_char, max_dist=3, cutoff=0.5)
print(sim.shape)
sim.head()

(20113, 2)


Unnamed: 0,query,neighbor
175,1800 cristalino,crosaint
12233,3 carrots,carros
4458,3 carrots,carrot
8493,3 tenchas,cuentas
10750,3v casa madero,casa madero vino 3v


In [9]:
to_omit = sim.merge(mto).sort_values(['users_count',COL_NAME], ascending=False).iloc[:,:-1]
to_omit.head()

Unnamed: 0,query,neighbor
4455,cigarros,cigarro
4456,cigarros,cogarros
4457,cigarros,sigarros
8765,huevo san juan 30 piezas,huevo san juan 12
11034,marlboro,malboro


In [15]:
def omit_neighbors(data):
    col_one, col_two = data.columns.tolist()
    df = data.copy()
    omit_dict = {}
    for x,y in zip(df[col_one], df[col_two]):
        if x not in omit_dict.keys():
            omit_dict[y] = x
            print(omit_dict)
    return omit_dict

neigh_dict = omit_neighbors(to_omit.head(10))

{'cigarro': 'cigarros'}
{'cigarro': 'cigarros', 'cogarros': 'cigarros'}
{'cigarro': 'cigarros', 'cogarros': 'cigarros', 'sigarros': 'cigarros'}
{'cigarro': 'cigarros', 'cogarros': 'cigarros', 'sigarros': 'cigarros', 'huevo san juan 12': 'huevo san juan 30 piezas'}
{'cigarro': 'cigarros', 'cogarros': 'cigarros', 'sigarros': 'cigarros', 'huevo san juan 12': 'huevo san juan 30 piezas', 'malboro': 'marlboro'}
{'cigarro': 'cigarros', 'cogarros': 'cigarros', 'sigarros': 'cigarros', 'huevo san juan 12': 'huevo san juan 30 piezas', 'malboro': 'marlboro', 'marboro': 'marlboro'}
{'cigarro': 'cigarros', 'cogarros': 'cigarros', 'sigarros': 'cigarros', 'huevo san juan 12': 'huevo san juan 30 piezas', 'malboro': 'marlboro', 'marboro': 'marlboro', 'marlboro 100': 'marlboro'}
{'cigarro': 'cigarros', 'cogarros': 'cigarros', 'sigarros': 'cigarros', 'huevo san juan 12': 'huevo san juan 30 piezas', 'malboro': 'marlboro', 'marboro': 'marlboro', 'marlboro 100': 'marlboro', '': 'nupec'}
{'cigarro': 'cigarros

In [16]:
[(x,y) for x,y in neigh_dict.items() if y=='cigarros']

[('cigarro', 'cigarros'), ('cogarros', 'cigarros'), ('sigarros', 'cigarros')]