In [1]:
import pandas as pd

from fuzzywuzzy import fuzz
from fuzzywuzzy import process



# Limpieza de las tablas extraídas desde la API Places de Google

#### Todas las tablas tienen la misma estructura, por lo que aplico la misma limpieza a todas.

In [2]:
# Load one CSV per cuisine; every file is read with the same latin1 encoding.
# The paths are listed in the same order as the names they are bound to.
caribe, coreano, india, ita, japo, mex, peru = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (
        '../data/caribe.csv',
        '../data/coreano.csv',
        '../data/india.csv',
        '../data/ita.csv',
        '../data/japo.csv',
        '../data/mex.csv',
        '../data/peru.csv',
    )
)

In [3]:
coreano.info()  # Here I can see that only price_level has nulls

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   business_status     20 non-null     object 
 1   geometry            20 non-null     object 
 2   name                20 non-null     object 
 3   photos              20 non-null     object 
 4   price_level         12 non-null     float64
 5   rating              20 non-null     float64
 6   types               20 non-null     object 
 7   user_ratings_total  20 non-null     int64  
 8   tipo                20 non-null     object 
dtypes: float64(2), int64(1), object(6)
memory usage: 1.5+ KB


In [4]:
coreano.columns  # list the raw column names before dropping and renaming them

Index(['business_status', 'geometry', 'name', 'photos', 'price_level',
       'rating', 'types', 'user_ratings_total', 'tipo'],
      dtype='object')

In [5]:
# Discard the three columns that are not needed for the analysis.
coreano = coreano.drop(['business_status', 'geometry', 'tipo'], axis=1)

In [6]:
# Positional rename of the six columns left after the drop, i.e. (per the
# .columns output shown earlier) name, photos, price_level, rating, types,
# user_ratings_total -> the Spanish labels below, in that order.
coreano.columns = ['Nombre','Foto','Precio','Puntuacion','Servicios', 'Nº opiniones']

In [7]:
# Upper-case the restaurant names so they can be compared with other tables.
coreano['Nombre'] = coreano['Nombre'].str.upper()

In [8]:
coreano.head()  # preview the first rows of the cleaned table

Unnamed: 0,Nombre,Foto,Precio,Puntuacion,Servicios,Nº opiniones
0,RESTAURANTE COREANO GANGNAM,"[{'height': 2988, 'html_attributions': ['<a hr...",1.0,4.4,"['restaurant', 'food', 'point_of_interest', 'e...",597
1,RESTAURANTE KOREA,"[{'height': 2773, 'html_attributions': ['<a hr...",2.0,4.5,"['restaurant', 'food', 'point_of_interest', 'e...",1922
2,GO HYANG MAT,"[{'height': 3000, 'html_attributions': ['<a hr...",1.0,4.5,"['restaurant', 'food', 'point_of_interest', 'e...",994
3,RESTAURANTE MIDANG,"[{'height': 1669, 'html_attributions': ['<a hr...",2.0,4.7,"['meal_delivery', 'restaurant', 'food', 'point...",762
4,RESTAURANTE MARU,"[{'height': 3024, 'html_attributions': ['<a hr...",2.0,4.3,"['restaurant', 'food', 'point_of_interest', 'e...",847


#### Creo una función para limpiar 🫧🧹 todo de una.

In [9]:
import sys
sys.path.append("..")  # add the parent directory to the import path so the `src` package can be found
from src.funciones import *  # project helper functions; `clean` (used in the next cell) presumably comes from here - TODO confirm

In [10]:
# Run the shared `clean` helper over each table, in the same order as before,
# and rebind every name to its cleaned result.
india, ita, japo, mex, peru = (
    clean(table) for table in (india, ita, japo, mex, peru)
)

#### La tabla de Caribe tiene una columna menos, por lo que la función no se puede aplicar

In [11]:
# Manual clean-up for the Caribbean table: drop the unused columns, relabel
# the five remaining ones positionally and upper-case the restaurant names.
caribe = caribe.drop(['business_status', 'geometry', 'tipo'], axis=1)
caribe.columns = ['Nombre', 'Foto', 'Puntuacion', 'Servicios', 'Nº opiniones']
caribe['Nombre'] = caribe['Nombre'].str.upper()

#### Nuestro principal objetivo 🎯🎯 es tener todos los nombres de los restaurantes de las tablas para poder relacionarlos con la tabla de locales.

In [12]:
# Load the restaurants ("locales") table that the cuisine tables are matched against by name.
locales = pd.read_csv('../data/restaurantes.csv', encoding='latin1')

In [13]:
# Positional rename of the three columns. NOTE(review): 'o' looks like a
# leftover index column from the CSV export (in the merge output its values
# sit one below 'id_local') - TODO confirm and drop it if unused.
locales.columns = ['o', 'id_local','Nombre']

In [14]:
# Exact-name inner joins: keep only the restaurants whose 'Nombre' appears
# verbatim in both the cuisine table and `locales`.
tabla1 = india.merge(locales, how='inner', on='Nombre')
tabla2 = ita.merge(locales, how='inner', on='Nombre')



In [15]:
tabla2  # exact-name matches between the Italian table and locales

Unnamed: 0,Nombre,Foto,Precio,Puntuacion,Servicios,Nº opiniones,o,id_local
0,TRATTORIA MANZONI,"[{'height': 709, 'html_attributions': ['<a hre...",2,4.3,"['restaurant', 'food', 'point_of_interest', 'e...",555,24727,24728
1,OLIVETO,"[{'height': 3000, 'html_attributions': ['<a hr...",2,4.7,"['restaurant', 'food', 'point_of_interest', 'e...",1133,55470,55471
2,LUNA ROSSA,"[{'height': 3415, 'html_attributions': ['<a hr...",2,4.4,"['restaurant', 'food', 'point_of_interest', 'e...",1864,90057,90058


In [16]:
from multiprocessing import Pool

In [17]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from multiprocessing import Pool

def fuzzy_match(name_df, locales, threshold):
    """Fuzzy-match one restaurant name against the names in ``locales``.

    Parameters
    ----------
    name_df : str
        Restaurant name taken from one of the cuisine tables.
    locales : DataFrame
        Table whose ``'Nombre'`` column holds the candidate names.
    threshold : int
        Passed to fuzzywuzzy as ``score_cutoff``; candidates scoring below
        it are not returned.

    Returns
    -------
    list of dict
        One ``{'Nombre_df', 'Nombre_locales', 'Score'}`` record per match
        returned by ``process.extractBests`` (its default result limit
        applies, since no ``limit`` is passed). Empty when nothing matches.
    """
    candidates = locales['Nombre']
    records = []
    for hit in process.extractBests(name_df, candidates, score_cutoff=threshold):
        # hit[0] is the matched candidate name, hit[1] its similarity score.
        records.append({
            'Nombre_df': name_df,
            'Nombre_locales': hit[0],
            'Score': hit[1],
        })
    return records

def fuzzy_merge_with_locales(df_list, locales, threshold=80, num_processes=2):
    """Fuzzy-match every name in ``df_list`` against ``locales`` in parallel.

    Parameters
    ----------
    df_list : list
        Tables (anything indexable by ``'Nombre'``) whose names are matched.
    locales : DataFrame
        Reference table; forwarded unchanged to ``fuzzy_match``.
    threshold : int, default 80
        Minimum score a candidate needs to be kept (forwarded to
        ``fuzzy_match``).
    num_processes : int, default 2
        Size of the worker pool.

    Returns
    -------
    DataFrame
        One row per match with columns ``Nombre_df``, ``Nombre_locales`` and
        ``Score``. Empty (but with those columns) when there are no names to
        match or nothing reaches the threshold.

    Notes
    -----
    NOTE(review): with the 'spawn' start method (Windows, macOS default) the
    workers must be able to import ``fuzzy_match``; functions defined in a
    notebook cell may not be reachable from worker processes. If this hangs
    or raises there, move both functions into an importable module.
    """
    result_columns = ['Nombre_df', 'Nombre_locales', 'Score']

    # One task per restaurant name, keeping the order of df_list and of the
    # rows inside each table.
    tasks = [(name, locales, threshold) for df in df_list for name in df['Nombre']]
    if not tasks:
        # Nothing to match: skip spawning worker processes altogether.
        return pd.DataFrame(columns=result_columns)

    # The context manager tears the pool down even if a worker raises;
    # starmap blocks until every task has finished, so no result is lost.
    with Pool(num_processes) as pool:
        matches_per_name = pool.starmap(fuzzy_match, tasks)

    # fuzzy_match returns a list of records per name; flatten them so each
    # match becomes its own row instead of a cell holding a dict.
    records = [record for matches in matches_per_name for record in matches]

    return pd.DataFrame(records, columns=result_columns)


# Every cleaned cuisine table takes part in the fuzzy match. `coreano` was
# cleaned by hand in the earlier cells and has the same 'Nombre' column, so it
# is included here too (it was previously left out of the list).
df_list = [india, ita, japo, mex, peru, caribe, coreano]

threshold = 80  # minimum similarity score for a candidate name to be kept
num_processes = 14  # You can adjust the number of processes to the CPU cores available

merged_data = fuzzy_merge_with_locales(df_list, locales, threshold, num_processes)

In [None]:
merged_data  # display the fuzzy-match results