# Limpieza de Datos Locales 20190712

In [2]:
import pandas as pd

# Importar Datasets

In [39]:
locales_col_names = [
    'id_cadena', 'desc_cadena', 'id_region', 'desc_region', 'idb',
    'desc_local', 'latitud', 'longitud', 'punto_dist'
]

In [40]:
locales_df = pd.read_csv("../data/Locales.txt",
                         encoding="UTF-8",
                         header=None,
                         delimiter='\t',
                         names=locales_col_names)

In [41]:
locales_df.head(40)


Unnamed: 0,id_cadena,desc_cadena,id_region,desc_region,idb,desc_local,latitud,longitud,punto_dist
0,1,Plaza Vea,1,Buenos Aires,9693,9693 - P Vea Villa Urquiza.,,,N
1,1,Plaza Vea,1,Buenos Aires,9694,9694 - P Vea JB Justo.,,,N
2,1,Plaza Vea,1,Buenos Aires,9695,695 - P Vea Acoyte.,0.0,0.0,N
3,1,Plaza Vea,1,Buenos Aires,9699,699 - P Vea Rivadavia.,0.0,0.0,N
4,1,Plaza Vea,1,Buenos Aires,9889,889 - Plaza Vea Carpa de liqui,,,N
5,2,Disco,1,Buenos Aires,247,916 - SM 916 Gorostiaga,-34.565259,-58.436583,N
6,2,Disco,1,Buenos Aires,257,61 - Tortuguitas,-34.446043,-58.745098,N
7,2,Disco,1,Buenos Aires,285,953 - Venado Tuerto,,,N
8,2,Disco,1,Buenos Aires,740,740 - Canning,-34.853221,-58.502247,N
9,2,Disco,1,Buenos Aires,961,961 - Av. Santa FÇ,,,N


In [42]:
locales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id_cadena    175 non-null    int64  
 1   desc_cadena  175 non-null    object 
 2   id_region    175 non-null    int64  
 3   desc_region  175 non-null    object 
 4   idb          175 non-null    int64  
 5   desc_local   175 non-null    object 
 6   latitud      140 non-null    float64
 7   longitud     140 non-null    float64
 8   punto_dist   175 non-null    object 
dtypes: float64(2), int64(3), object(4)
memory usage: 12.4+ KB


#### Ajustes al dataframe

- Limpiar la descripcion de la región, eliminando el numero y el simbolo (-).
- Convertir a minusculas todos los tipos de datos object o string.
- Remplazar caracteres especiales con espacio.
- Verificar si existen valores nulos o duplicados.
- Convertir a 1 y 0 la columna punto de distribucción con la siguiente regla si el valor es 's' = 1 que significa que es un punto de distribucción de lo contrario sería 0.
- Crear una nueva columna llamada local usando la columna punto de distribucción.


In [43]:
locales_df = locales_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [44]:
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'^\d+\s*-\s*', value='', regex=True)

In [45]:
# Ajustar los strings con errores ortograficos
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'f', value='fé', regex=True)
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'crdoba', value='cordoba', regex=True)
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'fééernando', value='fernando', regex=True)
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'\bjumbo\b', value='', regex=True)
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'\brincn\b', value='', regex=True)
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'\bplaza vea\b', value='', regex=True)
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'\bp vea\b', value='', regex=True)

In [46]:
locales_df['desc_local'] = locales_df['desc_local'].replace(to_replace=r'[^\x00-\x7F]+', value='', regex=True)

In [47]:
print(locales_df.isna().sum())

id_cadena       0
desc_cadena     0
id_region       0
desc_region     0
idb             0
desc_local      0
latitud        35
longitud       35
punto_dist      0
dtype: int64


In [48]:
locales_df.isnull().sum(axis=0)

id_cadena       0
desc_cadena     0
id_region       0
desc_region     0
idb             0
desc_local      0
latitud        35
longitud       35
punto_dist      0
dtype: int64

In [49]:
# Conteo de los niveles en las diferentes columnas categóricas
col_categoricas = ['desc_cadena', 'desc_region', 'desc_local']
for col in col_categoricas:
    print(f'Columna {col}: {locales_df[col].nunique()} subnivele(s)')

Columna desc_cadena: 5 subnivele(s)
Columna desc_region: 1 subnivele(s)
Columna desc_local: 175 subnivele(s)


In [50]:
# Convertir la columna categorica punto_dist a numerica, si el valor es s significa que es un punto de distribucción y se asignará el valor de 1, de lo contrario será 0.
punto_dist_mapping = {"s": 1, "n": 0}
locales_df['punto_dist'] = locales_df['punto_dist'].map(punto_dist_mapping)

In [51]:
# Crear una columna para determinar si es un local con base en la columna punto_dist, si el local no es un punto de distribuccion entonces es un local.  Asignaremos 1 para local 0 para no local.
locales_df['local'] = locales_df['punto_dist'].apply(lambda x: 1 if x == 0 else 0)

In [59]:
locales_df.head(200)

Unnamed: 0,id_cadena,desc_cadena,id_region,desc_region,idb,desc_local,latitud,longitud,punto_dist,local
2,1,plaza vea,1,buenos aires,9695,acoyte.,0.000000,0.000000,0,1
3,1,plaza vea,1,buenos aires,9699,rivadavia.,0.000000,0.000000,0,1
5,2,disco,1,buenos aires,247,sm 916 gorostiaga,-34.565259,-58.436583,0,1
6,2,disco,1,buenos aires,257,tortuguitas,-34.446043,-58.745098,0,1
8,2,disco,1,buenos aires,740,canning,-34.853221,-58.502247,0,1
...,...,...,...,...,...,...,...,...,...,...
160,9,jumbo,1,buenos aires,5961,av. santa f,-34.577317,-58.428886,0,1
163,9,jumbo,1,buenos aires,9628,san fernando,-34.446703,-58.545037,0,1
167,11,rincon jumbo,1,buenos aires,8262,rincn unicenter ii,0.000000,0.000000,0,1
168,11,rincon jumbo,1,buenos aires,8302,rincn unicenter,0.000000,0.000000,0,1


In [53]:
locales_df.describe()

Unnamed: 0,id_cadena,id_region,idb,latitud,longitud,punto_dist,local
count,175.0,175.0,175.0,140.0,140.0,175.0,175.0
mean,4.502857,1.0,6909.617143,-31.533832,-52.839588,0.034286,0.965714
std,2.945627,0.0,3421.539461,10.718891,17.724343,0.182484,0.182484
min,1.0,1.0,242.0,-45.872649,-67.496036,0.0,0.0
25%,2.0,1.0,5213.5,-34.730633,-58.632039,0.0,1.0
50%,4.0,1.0,9019.0,-34.601654,-58.434434,0.0,1.0
75%,4.0,1.0,9158.5,-34.546212,-58.251974,0.0,1.0
max,11.0,1.0,9977.0,0.0,0.0,1.0,1.0


#### Observaciones




In [54]:
locales_df['latitud'].dropna(inplace=True)
locales_df['longitud'].dropna(inplace=True)

In [56]:
locales_df = locales_df[~(pd.isna(locales_df['latitud']) | pd.isna(locales_df['longitud']))]
locales_df


Unnamed: 0,id_cadena,desc_cadena,id_region,desc_region,idb,desc_local,latitud,longitud,punto_dist,local
2,1,plaza vea,1,buenos aires,9695,acoyte.,0.000000,0.000000,0,1
3,1,plaza vea,1,buenos aires,9699,rivadavia.,0.000000,0.000000,0,1
5,2,disco,1,buenos aires,247,sm 916 gorostiaga,-34.565259,-58.436583,0,1
6,2,disco,1,buenos aires,257,tortuguitas,-34.446043,-58.745098,0,1
8,2,disco,1,buenos aires,740,canning,-34.853221,-58.502247,0,1
...,...,...,...,...,...,...,...,...,...,...
160,9,jumbo,1,buenos aires,5961,av. santa f,-34.577317,-58.428886,0,1
163,9,jumbo,1,buenos aires,9628,san fernando,-34.446703,-58.545037,0,1
167,11,rincon jumbo,1,buenos aires,8262,rincn unicenter ii,0.000000,0.000000,0,1
168,11,rincon jumbo,1,buenos aires,8302,rincn unicenter,0.000000,0.000000,0,1


In [57]:
locales_df.head()

Unnamed: 0,id_cadena,desc_cadena,id_region,desc_region,idb,desc_local,latitud,longitud,punto_dist,local
2,1,plaza vea,1,buenos aires,9695,acoyte.,0.0,0.0,0,1
3,1,plaza vea,1,buenos aires,9699,rivadavia.,0.0,0.0,0,1
5,2,disco,1,buenos aires,247,sm 916 gorostiaga,-34.565259,-58.436583,0,1
6,2,disco,1,buenos aires,257,tortuguitas,-34.446043,-58.745098,0,1
8,2,disco,1,buenos aires,740,canning,-34.853221,-58.502247,0,1


In [62]:
locales_df[locales_df['latitud']>=0.0]

Unnamed: 0,id_cadena,desc_cadena,id_region,desc_region,idb,desc_local,latitud,longitud,punto_dist,local
2,1,plaza vea,1,buenos aires,9695,acoyte.,0.0,0.0,0,1
3,1,plaza vea,1,buenos aires,9699,rivadavia.,0.0,0.0,0,1
16,2,disco,1,buenos aires,9007,pte. pern.,0.0,0.0,0,1
55,2,disco,1,buenos aires,9205,carniceria central.,0.0,0.0,0,1
63,2,disco,1,buenos aires,9639,pinocho vii.,0.0,0.0,0,1
70,4,vea,1,buenos aires,252,miramar,0.0,0.0,0,1
80,4,vea,1,buenos aires,674,resistencia,0.0,0.0,0,1
134,9,jumbo,1,buenos aires,305,deposito cbn group,0.0,0.0,1,0
153,9,jumbo,1,buenos aires,5276,arenales,0.0,0.0,0,1
155,9,jumbo,1,buenos aires,5299,depsito tortuguitas,0.0,0.0,1,0


In [58]:
import folium

# Combine the latitude and longitude columns into a list of locations
locations = [[lat, lon] for lat, lon in zip(locales_df['latitud'], locales_df['longitud'])]

# Create a map centered at the first location in the list
m = folium.Map(location=locations[0], zoom_start=12)

# Add markers to the map for each location in the list
for location in locations:
    folium.Marker(location=location).add_to(m)

# Display the map
m



#### Guardar el dataset limpio

In [18]:
locales_df.to_csv('../clean_data/locales_clean.csv', index=False)