# Brands similarity

In [1]:
from pathlib import Path

BASE_DIR = Path('/Users/efraflores/Desktop/EF/Corner/Brands/brands_similarity')
FILE_BASE_NAME = 'danone'

## Functions

### Timing and tone

In [2]:
import time
import numpy as np
from IPython.lib.display import Audio

start = time.time()
def time_exp(x):
    minutes, seconds = np.floor(x/60), 60*(x/60-np.floor(x/60))
    print(f"{'{:.0f}'.format(minutes)} minutos con {'{:.2f}'.format(seconds)} segundos")
    
def tono(a=1000, b=700, play_time_seconds=1, framerate=4410):
    t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)*np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate=framerate, autoplay=True)

### Clean text

In [3]:
import re
import unicodedata

def clean_text(text, pattern="[^a-zA-Z0-9 ]", lower=False, unique=False):
    cleaned_text = unicodedata.normalize('NFD',str(text).replace('\n','')).encode('ascii', 'ignore')
    cleaned_text = re.sub(pattern,' ',cleaned_text.decode('utf-8'),flags=re.UNICODE)
    cleaned_text = [word for word in (cleaned_text.lower().split() if lower else cleaned_text.split())]
    return ' '.join((set(cleaned_text) if unique else cleaned_text))

### Similar

In [4]:
from sklearn.neighbors import NearestNeighbors

def similar(data, max_dist=1e3, **nearestkwargs):
    
    df = data.copy()
    nbrs = NearestNeighbors(algorithm='ball_tree', **nearestkwargs)
    nbrs.fit(df)
    n_neighbors = nbrs.get_params()['n_neighbors']
    distances, indices = nbrs.kneighbors(df)

    dist_cols = ['dist_'+str(x) for x in range(n_neighbors)]
    distances = pd.DataFrame(distances,
                             index=df.index,
                             columns=dist_cols).iloc[:,1:]

    distances = distances[distances['dist_1'] <= max_dist].copy()
    
    indices_cols = ['indice_'+str(x) for x in range(n_neighbors)]
    indices = pd.DataFrame(indices,
                           index=df.index,
                           columns=indices_cols).iloc[:,1:]
    
    indices.replace(dict(zip(range(len(indices)),df.index)),inplace = True)
    
    neighbors = distances.join(indices)
    
    total_neighbors = []
    for row in neighbors.index:
        aux = []
        for dist_col,id_col in zip(dist_cols[1:],indices_cols[1:]):
            if neighbors.loc[row,dist_col] <= max_dist:
                aux.append(neighbors.loc[row,id_col])
        total_neighbors.append(aux)
    
    neighbors['neighbors'] = [', '.join(x) for x in total_neighbors]

    return neighbors

### Omit duplicated neighbors

In [5]:
def omit_dupli_neighbors(data, first_col, second_col):
    df = data.copy()
    df['union'] = df[[first_col,second_col]].apply(lambda x: ''.join(set(x)), axis=1)
    df = df.drop_duplicates('union').drop(columns='union')
    return df

## Import

In [6]:
import pandas as pd

df = pd.read_csv(BASE_DIR.joinpath(FILE_BASE_NAME+'_brands.csv')).drop_duplicates()
print(len(df))
display(df.sample())

362


Unnamed: 0,marca_generica
40,CIEL


## Transform

In [7]:
df['brand'] = df['marca_generica'].apply(clean_text)
df.set_index('marca_generica', inplace=True)
df.sample()

Unnamed: 0_level_0,brand
marca_generica,Unnamed: 1_level_1
VEGGIE,VEGGIE


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv_char = CountVectorizer(analyzer='char')
cv_char.fit(df['brand'])
by_char = pd.DataFrame(cv_char.transform(df['brand']).todense(), columns=cv_char.get_feature_names(), index=df.index)

cv_word = CountVectorizer(analyzer='word')
cv_word.fit(df['brand'])
by_word = pd.DataFrame(cv_word.transform(df['brand']).todense(), columns=cv_word.get_feature_names(), index=df.index)*10

X = by_char.join(by_word)
X.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,1,2,3,4,5,6,9,a,b,...,yoplait,yopro,your,yox,zahini,zarzal,zero,zoe,zorba,zuko
marca_generica,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OZARKA,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
sim = similar(X, max_dist=13, n_neighbors=10)
sim.sample()

Unnamed: 0_level_0,dist_1,dist_2,dist_3,dist_4,dist_5,dist_6,dist_7,dist_8,dist_9,indice_1,indice_2,indice_3,indice_4,indice_5,indice_6,indice_7,indice_8,indice_9,neighbors
marca_generica,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
NESTLÉ AGÜITAS,10.488088,10.488088,17.464249,17.578396,17.578396,17.578396,17.578396,17.578396,17.635192,NESTLÉ,NESTLE,ESSENTIAL,SANTÉ,ALPINETTE,ABATILLES,FUENSANTA,SUCULENTA,HEARTBEST,"NESTLÉ, NESTLE"


In [10]:
neigh_list = sim[['neighbors']].copy()
neigh_list = neigh_list['neighbors'].str.split(',', expand=True).melt(ignore_index=False)
neigh_list = neigh_list.iloc[:,1:].dropna().sort_index().rename(columns={'value':'neighbors'})
neigh_list.sample()

Unnamed: 0_level_0,neighbors
marca_generica,Unnamed: 1_level_1
SANTA MARIA,SANTA MARIA


In [11]:
export = omit_dupli_neighbors(neigh_list.reset_index(), 'marca_generica', 'neighbors')
print(neigh_list.shape, export.shape)

(124, 1) (103, 2)


## Export

In [12]:
export.set_index('neighbors').to_excel(BASE_DIR.joinpath(FILE_BASE_NAME+'_similar.xlsx'))

## End

In [13]:
time_exp(time.time()-start)
tono()

0 minutos con 2.05 segundos
