In [None]:
from pathlib import Path # reads paths in the current OS
import pandas as pd
import numpy as np
import utils as ut
import yaml
import json

## Reading

In [None]:
# Directory holding the YAML configuration files.
conf_dir = Path('conf')


def _read_yaml(filename):
    """Parse ``conf/<filename>`` with ``yaml.full_load`` and return the result."""
    # NOTE(review): full_load accepts more YAML tags than safe_load; fine for
    # trusted local config, but consider safe_load if these files are plain data.
    with open(conf_dir / filename) as handle:
        return yaml.full_load(handle)


config       = _read_yaml('paths.yaml')
product_cat  = _read_yaml('product_list.yaml')
payment_cat  = _read_yaml('payment_list.yaml')
sectors_dict = _read_yaml('sector_list.yaml')

# Raw inputs, both read from config['input_path']; missing values become ''.
input_dir    = Path(config['input_path'])
data         = pd.read_json(input_dir / 'db_mesinfo.json', orient='index').fillna('')
locations_df = pd.read_csv(input_dir / 'municipis_merge.csv').fillna('')

print(data.shape)

# Map the original field labels to the short upper-case names used in later cells.
column_names = {
    'url': 'URL',
    'Nom de la persona productora:': 'PRODUCTOR',
    'Marca:': 'MARCA',
    'Municipi:': 'MUNICIPIO',
    'On serveix:': 'DONDE',
    'Productes disponibles:': 'PRODUCTOS',
    'Altres productes alimentaris:': 'OTROS',
    'Possibilitats pagament:': 'PAGO',
    'Fruita*:': 'FRUTA',
    '*': 'NOTAS',
    'Més informació:': 'INFO',
    'Carn:': 'CARNE',
    'Verdura*:': 'VERDURA',
    'Flor i planta ornamental:': 'FLORES',
}
data.rename(columns=column_names, inplace=True)

data.head()

In [None]:
# Stopword collection from the project's utils module; passed to the column
# pre-processing step and to the comarca spelling check in later cells.
stopwords = ut.get_all_stopwords()

## Pre-process columns

In [None]:
# Free-text columns to pre-process.
# NOTE(review): the return value is ignored and later cells read 'PAGO_prep' /
# 'OTROS_prep', so the helper presumably adds '<col>_prep' columns to `data`
# in place — confirm in utils.
text_columns = ['OTROS', 'PAGO']
ut.run_preprocess_on_cols(data, text_columns, stopwords)

## New columns

### Binary variables

In [None]:
# Binary variables: does the producer offer each payment method?
ut.create_binary_var(data, payment_cat, 'PAGO_prep')
# Binary variables: does the producer offer each type of product?
ut.create_binary_var(data, product_cat, 'OTROS_prep')

# In this dataset vegetables, fruit, meat and flowers are reported in their own
# fields rather than in the main OTROS field, so set the corresponding flag to 1
# wherever the dedicated field is non-empty.
dedicated_fields = (
    ('VERDURA', 'vegetables'),
    ('FRUTA', 'fruit'),
    ('CARNE', 'meat'),
    ('FLORES', 'flowers'),
)
for field, flag in dedicated_fields:
    data.loc[data[field] != '', flag] = 1

### Sectors


In [None]:
# Create binary variables representing whether the producer belongs to a specific sector or not
# (according to the products they sell and the sector definitions given in sector_list.yaml).
# `data` is rebound to whatever the helper returns.
data=ut.create_sectors_col(data,sectors_dict)

### Numerical cols

In [None]:
# Rebind `data` to the frame returned by the utils helper.
# NOTE(review): which columns are added, and what `more_data=True` switches on,
# is defined in utils.add_numerical_cols — confirm there.
data = ut.add_numerical_cols(data,more_data=True)

## 42 comarcas en total:   
    
#Alt Camp, Alt Empordà, Alt Penedès, Alt Urgell, Alta Ribagorça, 
#Anoia, Bages,Baix Camp, Baix Ebre, Baix Empordà,
#Baix Llobregat, Baix Penedès, Barcelonès, Berguedà, Cerdanya, 
#Conca de Barberà, Garraf, Garrigues, Garrotxa, Gironès,
#Maresme, Moianès, Montsià, Noguera, Osona / Lluçanès, 
#Pallars Jussà, Pallars Sobirà, Pla d'Urgell, Pla de l'Estany, Priorat, 
#Ribera d'Ebre, Ripollès, Segarra, Segrià, Selva,
#Solsonès, Tarragonès, Terra Alta, Urgell, Vall d'Aran, 
#Vallès Occidental,  Vallès Oriental

##NB: Hay algunos casos donde hay 41 comarcas; más tarde los consideraremos como toda Cataluña

### Comarca of origin (i.e. of the producer)

In [None]:
# Dictionary to translate municipis to comarca
# Lookup table municipi -> comarca, built from the rows that have a municipi name.
has_municipi = locations_df['Municipi'] != ''
mun_to_com_dict = (
    locations_df[has_municipi]
    .set_index('Municipi')['Comarca']
    .to_dict()
)

In [None]:
# Pull the comarca out of MUNICIPIO: take the text preceding the last ')' and,
# within it, the part following the first '('. Rows without a ')' yield ''.
data['comarca_origin'] = (
    data['MUNICIPIO']
    .str.split(')').str.get(-2)
    .str.split('(').str.get(1)
    .fillna('')
)

# Known comarca names, as a set of the column's VALUES. `x in Series` tests the
# Series index, not its values, so the previous `x not in locations_df['Comarca']`
# guard never recognised a name and every value went through the spelling helper.
# The '' placeholder left by fillna('') is excluded so blank values are still
# handed to the helper, as before.
known_comarcas = set(locations_df['Comarca']) - {''}

# Keep exact matches untouched; send anything else through the spelling check.
data['comarca_origin'] = data['comarca_origin'].apply(
    lambda name: name if name in known_comarcas
    else ut.check_comarca_spelling(name, locations_df['Comarca'], stopwords)
)

## Save

In [None]:
# Write the cleaned table next to the inputs, without the intermediate
# pre-processed text columns and without the index.
output_file = Path(config['input_path']) / 'pagesos_clean.csv'
helper_columns = ['PAGO_prep', 'OTROS_prep']
data.drop(columns=helper_columns).to_csv(output_file, index=False)