In [1]:
import pandas as pd
import numpy as np
import re
import unidecode

In [2]:
# Read the seven source CSV files, one DataFrame per file, in the order listed.
(
    df_douglas,
    df_notino,
    df_paco,
    df_perfumeriacom,
    df_prefumes24,
    df_primor,
    df_druni,
) = map(
    pd.read_csv,
    (
        "datos_perfumes_douglas_filtrados.csv",
        "datos_perfumes_notino_filtrados.csv",
        "datos_perfumes_paco_filtrados.csv",
        "datos_perfumes_perfumeriacom_filtrados.csv",
        "datos_perfumes_prefumes24_filtrados.csv",
        "datos_perfumes_primor_filtrados.csv",
        "datos_perfumes_druni_filtrados.csv",
    ),
)

In [3]:
# Stack the seven frames row-wise; ignore_index=True discards the per-file
# row labels and renumbers the combined rows from 0.
df = pd.concat(
    objs=[
        df_douglas,
        df_notino,
        df_paco,
        df_perfumeriacom,
        df_prefumes24,
        df_primor,
        df_druni,
    ],
    ignore_index=True,
)

In [4]:
# Remove every kind of accent by transliterating 'marca' and 'nombre' to ASCII.
# na_action='ignore' leaves missing values as they are instead of handing a
# float NaN to unidecode, which only accepts strings and would raise.
df['marca'] = df['marca'].map(unidecode.unidecode, na_action='ignore')
df['nombre'] = df['nombre'].map(unidecode.unidecode, na_action='ignore')

In [5]:
def mapear_marcas(df):
    """Normalise brand spellings in the ``marca`` column.

    Every value of ``marca`` that is exactly equal to a key of the table
    below is replaced by the associated spelling; all other values are kept
    unchanged. Note that ``"polo"`` is mapped to the empty string.

    The column is overwritten on the frame that is passed in, and that same
    frame object is returned.
    """
    equivalencias = dict([
        ("zarkoperfume", "zarko perfume"),
        ("women'secret", "women's secret"),
        ("viktor&rolf", "viktor & rolf"),
        ("van cleef&arpels", "van cleef & arpels"),
        ("polo", ""),
        ("beverly hills polo club", "giorgio beverly hills"),
        ("giorgio beverly", "giorgio beverly hills"),
        ("agatha", "agatha ruiz de la prada"),
        ("agatha paris", "agatha ruiz de la prada"),
        ("dolce&gabbana", "dolce & gabbana"),
        ("escentric", "escentric molecules"),
        ("hackett", "hackett london"),
        ("joop", "joop!"),
        ("mercedes-benz parfums", "mercedes-benz"),
        ("montale", "montale paris"),
        ("pacha", "pacha ibiza"),
        ("tiffany", "tiffany & co"),
        ("tiffany & co.", "tiffany & co"),
        ("tabac original", "tabac"),
        ("zadig&voltaire", "zadig & voltaire"),
    ])

    marcas_normalizadas = df['marca'].replace(to_replace=equivalencias)
    df['marca'] = marcas_normalizadas
    return df

# Run the brand normalisation on the combined frame (pipe calls mapear_marcas(df)).
df = df.pipe(mapear_marcas)

In [6]:
# Keep only the rows whose 'nombre' is different from the literal 'sin nombre'.
tiene_nombre = df['nombre'].ne('sin nombre')
df = df.loc[tiene_nombre]

In [7]:
# Drop the rows whose name contains the word "set" followed by a space.
#  - The leading \b stops words that merely end in "set" (e.g. "sunset ")
#    from being treated as sets and removed.
#  - na=False makes a missing name count as "no match", so the mask is
#    always boolean and the row is kept.
#  - .copy() makes the result an independent frame, so the column
#    assignments in later cells do not raise SettingWithCopyWarning.
es_set = df['nombre'].str.contains(r'\bset ', regex=True, na=False)
df = df[~es_set].copy()

In [8]:
# Trim leading/trailing whitespace from the names.
# assign() builds a new frame instead of writing into `df`, which at this
# point is the result of boolean filtering; writing a column into such a
# slice is what triggers pandas' SettingWithCopyWarning.
df = df.assign(nombre=df['nombre'].str.strip())

In [9]:
# Build a dictionary that stores a "generic" name for every distinct
# perfume name (missing names are not counted by value_counts).
lista = df['nombre'].value_counts().index.sort_values().tolist()

# Regular expression used to look for common patterns: an optional,
# non-capturing prefix followed by a captured run of word characters and
# whitespace. findall() returns only the captured runs.
# NOTE(review): 'eau de' is listed before 'eau de toilette' / 'eau de parfum',
# so the longer alternatives look unreachable -- confirm this is intended.
patron_generico = re.compile(
    r'\b(?:eau de|perfume de|pour|eau de toilette\b|\beau de parfum\b)?([\w\s]+)\b',
    re.IGNORECASE,
)

nombres_genericos = {}
for perfume in lista:
    tramos = patron_generico.findall(perfume)
    if not tramos:
        # Nothing captured for this name: leave it out of the dictionary.
        continue
    # The generic name is the last captured run, without surrounding spaces.
    nombres_genericos[perfume] = tramos[-1].strip()
        
def mapear_nombres(df, reemplazos=None):
    """Apply the name -> generic-name replacements to the ``nombre`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``nombre``. The column is overwritten on
        this very frame.
    reemplazos : dict of str to str, optional
        Text to search for -> text to put in its place. When omitted, the
        module-level ``nombres_genericos`` dictionary is used, which is the
        behaviour of the original one-argument call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``nombre`` replaced and stripped of
        leading/trailing whitespace.

    Notes
    -----
    Each key is searched as a *literal substring* in every row, in dictionary
    order, so later replacements operate on the output of earlier ones.
    """
    if reemplazos is None:
        reemplazos = nombres_genericos

    nombres = df['nombre']
    for name, replacement in reemplazos.items():
        # Replacing a text by itself changes nothing, and an empty search
        # text would match between every character: skip both.
        if not name or name == replacement:
            continue
        # regex=False: perfume names may contain characters such as '+', '('
        # or '.', which must not be interpreted as a pattern. The default of
        # `regex` differs between pandas versions, so it is set explicitly.
        nombres = nombres.str.replace(name, replacement, regex=False)

    df['nombre'] = nombres.str.strip()

    return df

# Apply the generic-name replacements (pipe calls mapear_nombres(df)).
df = df.pipe(mapear_nombres)

In [10]:
def mapear_nombres_manual(df):
    """Remove a hand-picked list of filler texts from the ``nombre`` column.

    The phrases below are deleted wherever they appear (literal substring
    match, in the order listed). The abbreviations ``edp`` and ``edt`` are
    deleted only when they are not glued to another letter, so that words
    which merely contain those letters (e.g. "bedtime", "redpoint") are left
    intact. Finally the names are stripped of leading/trailing whitespace.

    The column is overwritten on the frame that is passed in, and that same
    frame object is returned.
    """
    # Order matters: the longer 'eau de  perfume de mujer' must be removed
    # before its substring 'perfume de mujer'.
    frases = (
        'eau de  perfume de mujer',
        'perfume de mujer',
        'fragancias de mujer',
        'giorgio armani',
        'yves saint laurent',
        'eau de  de mujer recargable',
    )
    # 'edp' / 'edt' with no ASCII letter immediately before or after.
    abreviaturas = r'(?<![A-Za-z])(?:edp|edt)(?![A-Za-z])'

    nombres = df['nombre']
    for frase in frases:
        nombres = nombres.str.replace(frase, '', regex=False)
    nombres = nombres.str.replace(abreviaturas, '', regex=True)

    df['nombre'] = nombres.str.strip()

    return df

# Apply the manual clean-up of names (pipe calls mapear_nombres_manual(df)).
df = df.pipe(mapear_nombres_manual)

In [11]:
# Rows with a 'num_valoraciones' of 0 get both 'valoracion' and
# 'num_valoraciones' set to NaN, i.e. recorded as missing instead of as 0.
sin_valoraciones = df['num_valoraciones'].eq(0)
df.loc[sin_valoraciones, ['valoracion', 'num_valoraciones']] = np.nan

In [12]:
# Sort ascending by 'precio_sin_desc' first, so that within every group of
# duplicated rows the one with the lowest value comes first and is the one
# that keep='first' retains.
df_ordenado = df.sort_values(by='precio_sin_desc')

# Two rows are duplicates when they agree on all of these columns.
claves_duplicado = ['tienda', 'categoria', 'marca', 'nombre', 'tipo']
df = df_ordenado.drop_duplicates(subset=claves_duplicado, keep='first')

In [13]:
# Order the final rows alphabetically by brand.
df = df.sort_values('marca', ascending=True)

In [14]:
# Write the cleaned frame to disk; index=False leaves the row labels out of the file.
df.to_csv(path_or_buf='datos_perfumes_totales.csv', index=False)