## Imports

In [None]:
from pathlib import Path # reads paths in the current OS
import pandas as pd
import numpy as np
import yaml
import utils as ut

## Reading

In [None]:
def _load_yaml(name):
    """Parse the YAML file ``conf/<name>`` and return the resulting object."""
    # Pin UTF-8 (the encoding YAML files are normally written in) so that
    # accented keywords are read identically on every platform instead of
    # depending on the locale's default encoding.
    with open(Path('conf') / name, encoding='utf-8') as file:
        # NOTE(review): full_load is kept for compatibility; the files are
        # local and trusted. If they only hold plain lists/dicts/strings,
        # yaml.safe_load would be the stricter choice.
        return yaml.full_load(file)


# Project configuration: paths plus the keyword/category dictionaries that the
# later cells pass to ut.create_binary_var / ut.create_sectors_col.
config = _load_yaml('paths.yaml')
com_comprar = _load_yaml('com_comprar_kw.yaml')
product_cat = _load_yaml('product_list.yaml')
payment_cat = _load_yaml('payment_list.yaml')
sectors_dict = _load_yaml('sector_list.yaml')

# Raw inputs; missing values become empty strings so that the string
# operations applied later never meet NaN.
input_dir = Path(config['input_path'])
data = pd.read_csv(input_dir / "abastiment.csv", sep=",").fillna('')
locations_df = pd.read_csv(input_dir / 'municipis_merge.csv').fillna('')

print(data.shape)

data.head()

In [None]:
# Load the "venda de proximitat" producers file; missing values become ''.
vdp_csv = Path(config['input_path']) / 'Productors_adherits_a_la_venda_de_proximitat.csv'
data_gen = pd.read_csv(vdp_csv).fillna('')

# Expose the commercial brand under the column name 'MARCA'.
data_gen = data_gen.rename(columns={'Marca Comercial': 'MARCA'})

print(data_gen.shape)
data_gen.head()

In [None]:
# Stop-word collection provided by the project's utils module. It is passed to
# the ut.check_comarca_spelling and ut.run_preprocess_on_cols calls below.
stopwords = ut.get_all_stopwords()

In [None]:
# Patterns handed to ut.run_text_locations together with the free-text columns.
# NOTE(review): these look like word stems for delivery-related terms
# (e.g. repartiment, domicili, enviament, recollida, distribució) — confirm.
delivery_patt = ['repart', 'domicil', 'envi', 'recoll', 'dist']

In [None]:
# Free-text columns of `data` that are preprocessed and then passed to
# ut.get_payment_methods and ut.run_text_locations.
imp_cols = ['COM COMPRAR', 'OBSERVACIONS','PRODUCTE(S)']

In [None]:
# Exact-match corrections applied to raw comarca names: each key is a
# misspelling, alias or compound name found in the inputs, each value is the
# comarca name it is replaced with.
com_typos = {
    'Al Urgell': 'Alt Urgell',
    'Bages-Moianès': 'Moianès',
    'Moianes-Bages': 'Moianès',
    'Barcelona': 'Barcelonès',
    'Maresme-Barcelonès': 'Maresme',
    'Tarragona': 'Tarragonès',
    'Baix Montseny': 'Vallès Oriental',
    'Baixa Cerdanya': 'Cerdanya',
    'Vall Aran': "Vall d'Aran",
    'Alt Maresme': 'Maresme',
    'Penedès': 'Alt Penedès',
    "Val D'Aran": "Vall d'Aran",
    # Open question: should Lluçanès be treated as a comarca of its own?
    'Lluçanès': 'Osona',
}

## Pre-process columns

In [None]:
# Normalise the comarca of each producer in data_gen:
#   1. title-case the raw value and apply the exact-match fixes in com_typos;
#   2. send anything that is still not a known comarca through the spelling
#      corrector from utils.
data_gen['comarca_origin'] = data_gen['Comarca'].str.title().replace(com_typos)

# `x in series` tests the Series *index* (row labels), not its values, so the
# membership test must run against the values. A set gives O(1) lookups and
# lets already-valid names skip the spelling corrector as intended.
known_comarques = set(locations_df['Comarca'])
data_gen['comarca_origin'] = data_gen['comarca_origin'].apply(
    lambda x: x if x in known_comarques
    else ut.check_comarca_spelling(x, locations_df['Comarca'], stopwords))

In [None]:
# Normalise the comarca of each producer in data:
#   1. apply the exact-match fixes in com_typos;
#   2. send anything that is still not a known comarca through the spelling
#      corrector from utils.
data['comarca_origin'] = data['COMARCA'].replace(com_typos)

# `x in series` tests the Series *index* (row labels), not its values, so the
# membership test must run against the values. A set gives O(1) lookups and
# lets already-valid names skip the spelling corrector as intended.
known_comarques = set(locations_df['Comarca'])
data['comarca_origin'] = data['comarca_origin'].apply(
    lambda x: x if x in known_comarques
    else ut.check_comarca_spelling(x, locations_df['Comarca'], stopwords))

In [None]:
# Preprocess the product text columns of data_gen; the call is made for its
# effect on the frame (the return value is not used).
# NOTE(review): later cells read 'Productes_prep' / 'Grups Productes_prep', so
# the helper presumably writes its output to '<column>_prep' — confirm in utils.
ut.run_preprocess_on_cols(data_gen,['Productes','Grups Productes'],stopwords)

In [None]:
# Text preprocessing of the abastiment frame: the free-text columns plus the
# normalised comarca. The helper is called for its effect on the frame.
data_text_cols = imp_cols + ['comarca_origin']
ut.run_preprocess_on_cols(data, data_text_cols, stopwords)

# Same treatment for the place-name columns of the locations table.
location_text_cols = ['Municipi', 'Comarca', 'Capital', 'Provincia']
ut.run_preprocess_on_cols(locations_df, location_text_cols, stopwords)

## New columns

### Payment

In [None]:
# Replace `data` with the frame returned by the payment-method extraction,
# which receives the free-text columns listed in imp_cols.
# NOTE(review): the 'PAGO' column used in the binary-variables cell is
# presumably created here — confirm in utils.
data = ut.get_payment_methods(data,imp_cols)

### Locations

In [None]:
# Lookup table municipi -> comarca, built from the rows of locations_df whose
# 'Municipi' is not the empty string.
named_municipis = locations_df[locations_df['Municipi'] != '']
mun_to_com_dict = named_municipis.set_index('Municipi')['Comarca'].to_dict()

In [None]:
# Location extraction over the free-text columns of `data`, using the
# locations table and the delivery patterns. Called for its effect on `data`
# (the return value is not used).
ut.run_text_locations(data, locations_df, imp_cols, delivery_patt)

In [None]:
# Replace `data` with the frame returned by ut.create_donde_col, which is
# given the municipi -> comarca lookup table.
# NOTE(review): presumably adds a column describing where each producer
# sells/delivers — confirm in utils.
data = ut.create_donde_col(data,mun_to_com_dict)

### Binary variables

In [None]:
# Product-category binary variables for data_gen, derived from the
# preprocessed product text ('Productes_prep') and the categories in
# product_cat. Called for its effect on the frame (return value not used).
ut.create_binary_var(data_gen,product_cat,'Productes'+'_prep')

In [None]:
# Binary indicator columns for `data`:
#   - one per payment method, read from the 'PAGO' column;
#   - one per product type, read from the preprocessed product text;
#   - one per "how to buy" keyword group (payment method, contact info, ...),
#     read from the preprocessed 'COM COMPRAR' text.
ut.create_binary_var(data, payment_cat, 'PAGO')
ut.create_binary_var(data, product_cat, 'PRODUCTE(S)_prep')
ut.create_binary_var(data, com_comprar, 'COM COMPRAR_prep')

# Widen the website and social-network flags: a row not yet flagged is set to 1
# when any keyword of the group appears as a whole word in the observations.
observations = data['OBSERVACIONS_prep']
for flag in ('web', 'socialnet'):
    keyword_regex = r'\b' + r'\b|\b'.join(com_comprar[flag]) + r'\b'
    mentioned = observations.str.contains(keyword_regex)
    not_flagged = data[flag] != 1
    data.loc[not_flagged & mentioned, flag] = 1

# Widen the iseco flag using the CCPAE column: 'Sí' or 'En conversió' counts.
ccpae_positive = data['CCPAE'].isin(['Sí', 'En conversió'])
data.loc[(data['iseco'] == 0) & ccpae_positive, 'iseco'] = 1

### Sectors

In [None]:
# Create binary variables representing whether the producer belongs to a
# specific sector or not (according to the products they sell and the
# definition of sectors given in sector_list.yaml).
# Both frames are replaced by the frames returned from the helper.
data_gen=ut.create_sectors_col(data_gen,sectors_dict)
data=ut.create_sectors_col(data,sectors_dict)

### Numerical columns

In [None]:
# Replace data_gen with the frame returned by ut.add_numerical_cols.
# more_data=False here, whereas the abastiment frame is processed with
# more_data=True.
# NOTE(review): the flag presumably selects the reduced set of numeric
# columns — confirm in utils.
data_gen = ut.add_numerical_cols(data_gen,more_data=False)

In [None]:
# Numeric columns for the abastiment frame (more_data=True variant).
data = ut.add_numerical_cols(data, more_data=True)

# A payment-method count of 0 is stored as NaN instead.
# NOTE(review): presumably so "none detected" counts as missing rather than
# as a true zero — confirm.
no_payment_methods = data['n_paym_methods'] == 0
data.loc[no_payment_methods, 'n_paym_methods'] = np.nan

## Save

In [None]:
# Write the cleaned producers frame, without the intermediate preprocessed
# text columns, as vdp_clean.csv.
# NOTE(review): the output goes to config['input_path'] (same folder as the
# raw inputs) — confirm this is intended.
vdp_clean = data_gen.drop(columns=['Productes_prep', 'Grups Productes_prep'])
vdp_clean.to_csv(Path(config['input_path']) / 'vdp_clean.csv', index=False)

In [None]:
# Write the cleaned abastiment frame, without the intermediate preprocessed
# text columns, as abastiment_clean.csv.
# NOTE(review): the output goes to config['input_path'] (same folder as the
# raw inputs) — confirm this is intended.
abastiment_clean = data.drop(
    columns=['PRODUCTE(S)_prep', 'OBSERVACIONS_prep', 'COM COMPRAR_prep'])
abastiment_clean.to_csv(Path(config['input_path']) / 'abastiment_clean.csv', index=False)