## Imports

In [None]:
from pathlib import Path # reads paths in the current OS
import unicodedata
import pandas as pd
import numpy as np
import json
import re
import yaml
import utils as ut
from stop_words import get_stop_words
from unidecode import unidecode

## Reading

In [None]:
# Load the project configuration; config['input_path'] is the folder that
# holds the CSV files read below.
# The YAML files are opened explicitly as UTF-8: without an encoding, open()
# falls back to the platform locale, which can mis-decode accented keywords.
# NOTE(review): these look like local, trusted project files; if they only
# hold plain mappings/lists, yaml.safe_load would be sufficient -- confirm.
with open('paths.yaml', encoding='utf-8') as file:
    config = yaml.full_load(file)

# Missing values become empty strings so that the string operations applied
# to these frames later never meet NaN.
data         = pd.read_csv(Path(config['input_path']) / "abastiment.csv", sep=",").fillna('')
locations_df = pd.read_csv(Path(config['input_path']) /  'municipis_merge.csv').fillna('')

# Keyword lists used further down to build the binary columns:
#   com_comprar -> matched against the 'COM COMPRAR' text
#   product_cat -> matched against the 'PRODUCTE(S)' text
with open('com_comprar_kw.yaml', encoding='utf-8') as file:
    com_comprar = yaml.full_load(file)
with open('product_list.yaml', encoding='utf-8') as file:
    product_cat = yaml.full_load(file)

In [None]:
# Preview the first rows of the producers table that was just loaded.
data.head()

In [None]:
# Preview the first rows of the locations reference table.
locations_df.head()

In [None]:
# Start from the Catalan stop-word list shipped with the stop_words package.
stopwords = get_stop_words('catalan')

# Project-specific additions, appended in place to the list above.
newStopWords = [
    'que', 'des', 'al', 'del', 'ho', 'd', 'l', 'per', 'tambe', 'fins',
    'a', 'cap', 'hi', 'ni', 'no',
]
stopwords += newStopWords

In [None]:
# Word stems passed to ut.get_text_locations below.
# NOTE(review): presumably used to spot delivery-related wording
# ("repartim", "domicili", "enviem", ...) in the free text -- confirm in utils.
delivery_patt = ['repart', 'domicil', 'envi', 'recoll', 'dist']

In [None]:
# Free-text columns that are pre-processed and scanned for location names.
# comarca_new is included because it sometimes contains "repartim a..." text.
cols_to_extract_locs = ['COM COMPRAR', 'OBSERVACIONS', 'comarca_new']

In [None]:
# Exact-value corrections applied to the raw 'COMARCA' column: each key is a
# value found in the data and each value is the comarca name to use instead.
com_typos = {
    'Al Urgell':'Alt Urgell',
    'Bages-Moianès':'Moianès',
    'Moianes-Bages':'Moianès',
    'Barcelona':'Barcelonès',
    'Maresme-Barcelonès':'Maresme',
    'Tarragona':'Tarragonès',
    'Baix Montseny':'Vallès Oriental',
    'Baixa Cerdanya':'Cerdanya',
    'Vall Aran':"Vall d'Aran",
    'Alt Maresme':'Maresme',
    'Penedès':'Alt Penedès',
    'Lluçanès':'Osona', # open question: should Lluçanès be treated as its own comarca?
            }

## Pre-process columns

In [None]:
# Copy 'COMARCA' into 'comarca_new', swapping whole cell values that exactly
# match a key of com_typos for the corrected name.
data['comarca_new'] = data['COMARCA'].replace(com_typos)

In [None]:
# Only values that are not already a known comarca name are sent to the
# spelling check.
# `x in some_series` tests the Series *index* (row labels), not its values,
# so membership is checked against a set built from the values instead.
# The empty string is left out of the set on purpose: blank entries must still
# reach ut.check_comarca_spelling, exactly as they did before.
known_comarques = set(locations_df['Comarca']) - {''}
data['comarca_new'] = data['comarca_new'].apply(
    lambda x: x if x in known_comarques
    else ut.check_comarca_spelling(x, locations_df['Comarca'], stopwords))

In [None]:
## Creating the field 'comarca_origin': a clean version of 'COMARCA'
# Look up the comarca of each row's 'MUNICIPI'. The lookup table is reduced to
# one row per non-empty municipi first: a left merge against blank or repeated
# 'Municipi' keys would otherwise duplicate rows of `data`. keep='last'
# matches how a dict built from the same columns resolves duplicates.
mun_lookup = locations_df.loc[locations_df['Municipi'] != '', ['Municipi', 'Comarca']]
mun_lookup = mun_lookup.drop_duplicates(subset='Municipi', keep='last')
data = pd.merge(data, mun_lookup, how='left', left_on='MUNICIPI', right_on='Municipi')

# Start from the cleaned comarca and, where it carries the 'NOTFOUND' marker
# (presumably emitted by ut.check_comarca_spelling when nothing matches --
# confirm in utils), fall back to the comarca found through the municipi.
data['comarca_origin'] = data['comarca_new']
not_found = data['comarca_origin'].str.contains('NOTFOUND', regex=False, na=False)
data.loc[not_found, 'comarca_origin'] = data.loc[not_found, 'Comarca']

# The merge helper columns are no longer needed.
data = data.drop(columns=['Municipi', 'Comarca'])

# TODO: improve this part using the municipi -> comarca dictionary.

In [None]:
#data[data.COMARCA=='Repartim al Bages, Solsonès, Barcelonès i Berguedà'].head()

In [None]:
# Add a '<column>_prep' companion to every text column that is searched later,
# both in the producers table and in the locations reference table. Each value
# is run through ut.pre_process with the stop-word list and sw=True.
prep_targets = (
    (data, cols_to_extract_locs + ['PRODUCTE(S)']),
    (locations_df, ['Municipi', 'Comarca', 'Capital', 'Provincia']),
)
for frame, columns in prep_targets:
    for column in columns:
        frame[column + '_prep'] = frame[column].apply(
            lambda text: ut.pre_process(text, stopwords, sw=True))

## New columns

### Locations

In [None]:
# For every free-text column, look for each kind of location name.
# NOTE(review): ut.get_text_locations presumably writes its findings into
# data[<level in lower case>] in place -- the 'comarca', 'capital' and
# 'municipi' columns used afterwards are not created anywhere else here.
location_levels = ('Comarca', 'Capital', 'Provincia', 'Municipi')
for text_col in cols_to_extract_locs:
    for level in location_levels:
        ut.get_text_locations(data, level.lower(), text_col, locations_df, level, delivery_patt)

In [None]:
# Municipi -> comarca lookup, built from the rows that actually name a
# municipi. If a municipi appears more than once, its last row wins.
named_municipis = locations_df[locations_df['Municipi'] != '']
mun_to_com_dict = dict(zip(named_municipis['Municipi'], named_municipis['Comarca']))

In [None]:
# Translate the capitals/municipis that were found into their comarca.
# NOTE(review): regex=True makes every municipi name a regular-expression
# pattern that is also replaced inside longer strings -- confirm that names
# containing regex metacharacters or nested names behave as intended.
place_cols = ['capital', 'municipi']
data[place_cols] = data[place_cols].replace(mun_to_com_dict, regex=True)

# Gather everything into one list of names per row: join the three columns
# with commas, trim leading/trailing commas, then split back on commas.
joined_places = data['capital'] + ',' + data['municipi'] + ',' + data['comarca']
data['all_comarques'] = joined_places.str.strip(',').str.split(',')

# Drop the intermediate columns that have been folded into 'all_comarques'.
data = data.drop(columns=['capital', 'municipi', 'comarca_new_prep', 'comarca_new', 'comarca'])

In [None]:
# Collapse each row's list of names into a comma-separated string of unique
# names. Empty tokens (left behind when one of the joined source columns was
# blank) are discarded, and the names are sorted so the output does not depend
# on set iteration order and is reproducible between runs.
data['all_comarques'] = data['all_comarques'].apply(
    lambda names: ','.join(sorted({name for name in names if name})))

# Present a bare 'Catalunya' as 'Tota Catalunya'. regex=True is explicit
# because recent pandas versions treat the pattern as a literal string by
# default, which would make this a silent no-op. The look-behind keeps the
# replacement idempotent when the cell is re-run.
data['all_comarques'] = data['all_comarques'].str.replace(
    r'(?<!Tota )\bCatalunya\b', 'Tota Catalunya', regex=True)

### Number of delivery regions

In [None]:
# Number of comarques each producer delivers to.
# Any mention of 'Catalunya' is counted as the whole territory; otherwise the
# non-empty names in the comma-separated list are counted, so blank tokens and
# an empty list no longer inflate the figure (an empty list now gives 0).
# NOTE(review): 42 is the total used by the original code -- confirm it is
# still the intended number of comarques.
N_COMARQUES_CATALUNYA = 42
data['n_comarcas_delivery'] = data['all_comarques'].apply(
    lambda x: N_COMARQUES_CATALUNYA if 'Catalunya' in x
    else sum(1 for name in x.split(',') if name))


### Binary variables

In [None]:
def _keyword_regex(keywords):
    """Return a regex matching any of *keywords* as a whole word."""
    return r'\b' + r'\b|\b'.join(keywords) + r'\b'


# One 0/1 column per category: product types are searched in the pre-processed
# product text, purchase options (payment method, contact info, ...) in the
# pre-processed 'COM COMPRAR' text.
for categories, text_col in ((product_cat, 'PRODUCTE(S)_prep'),
                             (com_comprar, 'COM COMPRAR_prep')):
    for key, val in categories.items():
        data[key] = 0
        data.loc[data[text_col].str.contains(_keyword_regex(val)), key] = 1

# Websites and social networks are often mentioned in the observations instead,
# so rows still unflagged get a second chance from that column.
for flag in ('web', 'socialnet'):
    in_observations = data['OBSERVACIONS_prep'].str.contains(
        _keyword_regex(com_comprar[flag]))
    data.loc[(data[flag] != 1) & in_observations, flag] = 1

### Number of product types

In [None]:
# Count how many product categories each producer offers, split into the main
# categories and all the remaining ones, by adding up the 0/1 flag columns.
main_prod_cols = ['meat', 'fruit', 'vegetables']
other_prod_cols = [
    'flowers', 'charcuterie', 'legumes', 'mushrooms', 'rice',
    'flour_cereals', 'oil_olives_vinager', 'eggs', 'dairies',
    'herbs_spices', 'hygiene_medicines', 'alcohol',
    'fruit_veggies_products', 'drinks', 'bread_pastries',
    'pasta', 'others',
]
data['n_main_prod'] = sum(data[column] for column in main_prod_cols)
data['n_other_prod'] = sum(data[column] for column in other_prod_cols)
data['n_tot_prod'] = data['n_main_prod'] + data['n_other_prod']

## Save

In [None]:
# Write the enriched table without the pre-processed helper text columns.
# The file goes next to the inputs, in the folder given by config['input_path'].
helper_cols = ['PRODUCTE(S)_prep', 'OBSERVACIONS_prep', 'COM COMPRAR_prep']
output_file = Path(config['input_path']) / 'abastiment_new.csv'
data.drop(columns=helper_cols).to_csv(output_file, index=False)

In [None]:
# Final sanity check: list the resulting columns and preview the first rows.
print(data.columns)
data.head()