In [153]:
import pandas as pd
import numpy as np

In [154]:
datos = pd.read_csv('lines_final2.csv', sep=';') # file with the unique categories (semicolon-separated)

In [155]:
# Number of rows and columns of the categories table.
datos.shape

(4903, 2)

In [156]:
# Show the first row to inspect the column layout.
datos.head(1)

Unnamed: 0,categories,lines
0,Military airport,Automotive


In [157]:
datos.lines.value_counts() # see how the unique categories are distributed among the lines

lines
Health and Hospitals                                  498
Shopping                                              489
Restaurants                                           440
Sports                                                377
Automotive                                            355
Professional Services                                 308
Entertainment                                         300
Education and Learning                                283
Health and Beauty                                     270
Tourism                                               249
Rental Services                                       244
Arts and Crafts                                       219
Veterinary and Pets                                   212
Industry                                              211
Other                                                 159
Gardening and Home Services                           151
Technology, Networks, Electronics, and Engineering    104
Events a

In [158]:
# Build a dictionary mapping each line to the list of categories that belong to it
categories_by_line = {}

# Walk the two columns in parallel; setdefault creates the list the first time
# a line is seen and returns the existing list on later occurrences.
for line, category in zip(datos['lines'], datos['categories']):
    categories_by_line.setdefault(line, []).append(category)

In [159]:
keys_list = list(categories_by_line.keys()) # list of the dictionary keys (the line names)

#Aplico a cada dataset de estados:

In [160]:
# Load the dataset to classify; the loop below reads its 'category' column row by row.
dataset = pd.read_parquet('TexasFinalCorregido.parquet')

Creo la función para que pueda trabajar en forma de ciclo

In [161]:
def evaluate_lines(row, key, categories_by_line):
    """Decide whether a row belongs to the line ``key`` (strict first pass).

    Args:
        row: Mapping-like row whose 'category' entry holds the row's
            categories (a sized, iterable sequence) or None.
        key: Name of the line being tested; must be a key of
            ``categories_by_line``.
        categories_by_line: Dict mapping each line name to the list of
            category names that belong to it.

    Returns:
        None when the row has no categories (None or empty sequence);
        1 when the row's only category belongs to the line, or when at
        least two of its categories do; 0 otherwise.
    """
    business_categories = row['category']

    # Missing or empty category list: nothing to evaluate.
    if business_categories is None or len(business_categories) == 0:
        return None

    line_categories = categories_by_line[key]
    hits = 0
    for name in business_categories:
        if name in line_categories:
            hits += 1

    # A single-category row needs its one category to match; a row with
    # several categories needs at least two matches.
    needed = 1 if len(business_categories) == 1 else 2
    return 1 if hits >= needed else 0

In [162]:
# Add one indicator column per line: each column holds the result of
# evaluate_lines for that line, computed row by row (axis=1).
# NOTE(review): this runs a full row-wise apply once per line name.
for key in keys_list:
    dataset[key] = dataset.apply(evaluate_lines, args=(key, categories_by_line), axis=1)


In [163]:
dataset['sumatoria'] = dataset[keys_list].sum(axis=1) # Row-wise sum of all the line columns just added;
# used to check whether every business is being assigned to at least one line.

In [164]:
dataset.sumatoria.value_counts() # distribution of the number of lines assigned to each business

sumatoria
1.0    58732
0.0     5368
2.0     3078
3.0      457
4.0        9
Name: count, dtype: int64

In [165]:
dataset.shape # shape of the dataset, kept as a reference to verify the later transformations

(67644, 29)

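Cambio un poco la función para los negocios que no están siendo clasificados (sumatoria = 0) y hago un repaso: en esta segunda pasada basta con que una sola categoría coincida con la línea.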

In [166]:
# Review-pass function for the rows that summed to zero
def evaluate_lines_repaso(row, key, categories_by_line):
    """Decide whether a row belongs to the line ``key`` (lenient review pass).

    Args:
        row: Mapping-like row whose 'category' entry holds the row's
            categories (a sized, iterable sequence) or None.
        key: Name of the line being tested; must be a key of
            ``categories_by_line``.
        categories_by_line: Dict mapping each line name to the list of
            category names that belong to it.

    Returns:
        None when the row has no categories (None or empty sequence);
        1 when at least one of its categories belongs to the line;
        0 otherwise.
    """
    business_categories = row['category']

    # Missing or empty category list: nothing to evaluate.
    if business_categories is None or len(business_categories) == 0:
        return None

    line_categories = categories_by_line[key]
    matched = any(name in line_categories for name in business_categories)
    return 1 if matched else 0

In [167]:
zeros = dataset[dataset['sumatoria']==0].copy() # split in two parts: the rows that summed to zero here, the complement in the next cell

In [168]:
# Keep only the rows with a non-zero sumatoria (the complement of `zeros`).
# NOTE: this rebinds `dataset`, so the cell is not idempotent with respect to the loaded frame.
dataset = dataset[dataset['sumatoria']!=0]

In [169]:
zeros.shape[0] + dataset.shape[0] # this number must equal the row count of the original dataset

67644

In [170]:
# Drop the columns created by the main function so the review pass can create them again.
# The names are taken from keys_list (the same list that created the columns)
# instead of a hand-typed copy, so this cell cannot drift from the lines in the CSV.
zeros = zeros.drop(columns=keys_list + ['sumatoria'])

In [171]:
# Apply the review function: recreate one indicator column per line on the
# rows that summed to zero, this time with evaluate_lines_repaso.
for key in keys_list:
    zeros[key] = zeros.apply(evaluate_lines_repaso, args=(key, categories_by_line), axis=1)

In [172]:
# Recreate the sumatoria column as the row-wise sum of the line columns.
zeros['sumatoria'] = zeros[keys_list].sum(axis=1)

In [173]:
# Check the distribution. Per the original note, the 0.0 rows that still appear
# correspond to None values in the category column.
# NOTE(review): rows whose categories match no line also sum to 0 — confirm before relying on this.
zeros.sumatoria.value_counts()

sumatoria
2.0    4182
3.0     927
4.0     176
0.0      58
5.0      20
6.0       4
1.0       1
Name: count, dtype: int64

#Uno ambas partes del dataframe original

In [174]:
# Stack the reviewed rows and the already classified rows back into one frame;
# ignore_index=True gives the result a fresh 0..n-1 index.
dataset_final = pd.concat([zeros, dataset], ignore_index=True, axis=0)

In [175]:
# Shape after the concat, to compare against the shape of the dataset before the split.
dataset_final.shape

(67644, 29)

dataset_final.columns

In [176]:
# List the column names of the combined frame.
dataset_final.columns

Index(['identificador', 'name', 'address', 'category', 'latitude', 'longitude',
       'avg_rating', 'hours', 'reviews', 'source', 'Automotive', 'Shopping',
       'Health and Hospitals', 'Rental Services', 'Health and Beauty',
       'Entertainment', 'Restaurants', 'Tourism', 'Veterinary and Pets',
       'Education and Learning', 'Sports', 'Arts and Crafts',
       'Events and Weddings', 'Industry',
       'Technology, Networks, Electronics, and Engineering',
       'Professional Services', 'Gardening and Home Services', 'Other',
       'sumatoria'],
      dtype='object')

#Vamos a crear la columna line vacía y luego le agrego el valor que corresponda.

In [177]:
# Start with an empty-string label on every row; later cells fill it in.
dataset_final['line'] = ''

In [178]:
# Per-line indicator columns. They were created from keys_list, so reuse that
# list (as a copy) instead of maintaining a second hand-typed list of names.
columnas = list(keys_list)

In [179]:
# Rows whose sumatoria is 1: label them with the name of the column that holds the 1.
filtered_rows = dataset_final[dataset_final['sumatoria'] == 1]

# Vectorized replacement for a row-by-row loop: flag the indicator cells equal
# to 1 and take, per row, the label of the first flagged column. The indicator
# functions only produce None/0/1, so a row summing to 1 has exactly one such column.
is_one = filtered_rows[columnas].eq(1)
# idxmax returns the first column even when nothing is flagged, so rows
# without any 1 keep the empty label.
line_labels = is_one.idxmax(axis=1).where(is_one.any(axis=1), '')
# Assignment aligns on the index, which is unique after the concat with ignore_index=True.
dataset_final.loc[filtered_rows.index, 'line'] = line_labels

In [180]:
# Distribution of the labels assigned so far (rows not yet labelled still hold '').
dataset_final.line.value_counts()

line
Health and Beauty                                     9110
                                                      8911
Automotive                                            8811
Shopping                                              8560
Restaurants                                           7370
Health and Hospitals                                  5246
Rental Services                                       5002
Professional Services                                 2714
Veterinary and Pets                                   2436
Sports                                                2256
Tourism                                               1864
Education and Learning                                1076
Entertainment                                         1018
Gardening and Home Services                            726
Other                                                  642
Arts and Crafts                                        626
Events and Weddings                                

In [181]:
# Rows whose sumatoria is 0 are assigned to the 'Other' line.
dataset_final.loc[dataset_final['sumatoria'] == 0, 'line'] = 'Other'

In [182]:
# Distribution of the labels after assigning 'Other' to the zero-sum rows.
dataset_final.line.value_counts()

line
Health and Beauty                                     9110
                                                      8853
Automotive                                            8811
Shopping                                              8560
Restaurants                                           7370
Health and Hospitals                                  5246
Rental Services                                       5002
Professional Services                                 2714
Veterinary and Pets                                   2436
Sports                                                2256
Tourism                                               1864
Education and Learning                                1076
Entertainment                                         1018
Gardening and Home Services                            726
Other                                                  700
Arts and Crafts                                        626
Events and Weddings                                

In [183]:
# Rows that matched more than one line.
filtered_rows = dataset_final[dataset_final['sumatoria'] > 1]

# Vectorized replacement for a row-by-row loop: label each row with the first
# column (in `columnas` order) whose indicator equals 1.
is_one = filtered_rows[columnas].eq(1)
# idxmax returns the first column even when nothing is flagged, so rows
# without any 1 get the empty string, as the loop's fallback did.
first_line = is_one.idxmax(axis=1).where(is_one.any(axis=1), '')
# Assignment aligns on the index, which is unique after the concat with ignore_index=True.
dataset_final.loc[filtered_rows.index, 'line'] = first_line

In [184]:
# Final distribution of the labels once the multi-line rows are labelled.
dataset_final.line.value_counts()

line
Shopping                                              11289
Automotive                                            10404
Health and Beauty                                      9292
Restaurants                                            7496
Health and Hospitals                                   7241
Rental Services                                        5470
Veterinary and Pets                                    3071
Professional Services                                  2907
Sports                                                 2425
Tourism                                                1981
Entertainment                                          1336
Education and Learning                                 1186
Gardening and Home Services                             739
Other                                                   700
Arts and Crafts                                         679
Industry                                                548
Events and Weddings                

#reordeno las columnas

In [185]:
# Final column order: the original business fields, then the 'line' label,
# then the per-line indicator columns, and finally their row sum.
# NOTE: this rebinds `columnas`, which earlier held only the per-line columns.
columnas = (
    ['identificador', 'name', 'address', 'category', 'latitude', 'longitude',
     'avg_rating', 'hours', 'reviews', 'source']
    + ['line']
    + ['Automotive', 'Shopping', 'Health and Hospitals', 'Rental Services',
       'Health and Beauty', 'Entertainment', 'Restaurants', 'Tourism',
       'Veterinary and Pets', 'Education and Learning', 'Sports',
       'Arts and Crafts', 'Events and Weddings', 'Industry',
       'Technology, Networks, Electronics, and Engineering',
       'Professional Services', 'Gardening and Home Services', 'Other']
    + ['sumatoria']
)

In [186]:
# Select the columns in the order given by `columnas`.
dataset_final = dataset_final[columnas]

In [187]:
# Shape after adding 'line' and reordering the columns.
dataset_final.shape

(67644, 30)

#Finalmente lo guardo como parquet

In [188]:
# Write the labelled frame to parquet without storing the index.
dataset_final.to_parquet('texasconRUBRO.parquet', index=False)