## Imports

In [1]:
from pathlib import Path # reads paths in the current OS
import unicodedata
import pandas as pd
import numpy as np
import json
import re
import yaml
import utils as ut
from stop_words import get_stop_words
from unidecode import unidecode



## Reading

In [2]:
def _read_yaml(yaml_path):
    """Parse the YAML document stored at `yaml_path` and return the result."""
    # NOTE(review): `yaml.full_load` accepts more than plain data tags;
    # `yaml.safe_load` is the safer choice if these files could be untrusted.
    with open(yaml_path) as handle:
        return yaml.full_load(handle)


# Project paths; only the 'input_path' entry is read in this cell.
config = _read_yaml('paths.yaml')
input_dir = Path(config['input_path'])

# Both tables are loaded with missing cells replaced by empty strings.
data         = pd.read_csv(input_dir / "abastiment.csv", sep=",").fillna('')
locations_df = pd.read_csv(input_dir / 'municipis_merge.csv').fillna('')

# Keyword catalogues used further down to build the binary indicator columns.
com_comprar = _read_yaml('com_comprar_kw.yaml')
product_cat = _read_yaml('product_list.yaml')
payment_cat = _read_yaml('payment_list.yaml')

In [3]:
# Preview the first rows of the raw input table.
data.head()

Unnamed: 0,PROJECTE,URGENT,PRODUCTE(S),MUNICIPI,COM COMPRAR,OBSERVACIONS,CCPAE,COMARCA
0,3 Fermentats I Un Destí,Sí,1 Pa de Massa Mare Natural de 2kg 2 Vins Natur...,Torelló,Ho repartim a domicili per tot Osona resta de ...,Ens fa il.lusió poder colaborar per oferir-vos...,Sí,Osona
1,30 Cabres,,Formatges de llet crua de cabres que pasturen ...,Bellver de Cerdanya,"Contacta per telefon, mail o xarxes socials (@...",Tots els nostres formatges estan elaborats amb...,,Cerdanya
2,7 de Ribera Hostal-Agrobotiga,Sí,Verdures de producció propia,Móra d'Ebre,A l'agrobotiga dijous i divendres de 10:00 a 1...,www.biosferacomestible.cat,,Ribera d’Ebre
3,A Granel,,"Cosmètica, productes d’higiène i neteja ecològics",Barcelona,Venda directa i en punts de recollida per tota...,,Sí,Barcelona
4,AgroMontserratí - Cooperativa del Montserratí,,"Fruita, verdura, transformats, elaborats, pins...",Olesa de Montserrat,A través del Whatsap i Telèfon - XXXXXXXXX A l...,www.agromontserrati.cat,,Baix Llobregat


In [4]:
# Preview the first rows of the municipi / comarca / capital / provincia table.
locations_df.head()

Unnamed: 0,Municipi,Comarca,Capital,Provincia
0,Abella de la Conca,Pallars Jussà,Tremp,Lleida
1,Castell de Mur,Pallars Jussà,Tremp,Lleida
2,Conca de Dalt,Pallars Jussà,Tremp,Lleida
3,Gavet de la Conca,Pallars Jussà,Tremp,Lleida
4,Isona i Conca Dellà,Pallars Jussà,Tremp,Lleida


In [5]:
# Start from the Catalan stop-word list returned by get_stop_words.
stopwords = get_stop_words('catalan')

# Extra words to be treated as stop words as well.
newStopWords = [
    'que', 'des', 'al', 'del', 'ho', 'd', 'l', 'per', 'tambe', 'fins',
    'a', 'cap', 'hi', 'ni', 'no',
]
# In-place extension of the list (same effect as list.extend).
stopwords += newStopWords

In [6]:
# Word stems handed to ut.get_text_locations as its delivery patterns.
delivery_patt = [
    'repart',
    'domicil',
    'envi',
    'recoll',
    'dist',
]

In [7]:
# Columns scanned for location names. 'comarca_new' is included because some
# of its entries contain "repartim a..." text.
cols_to_extract_locs = [
    'COM COMPRAR',
    'OBSERVACIONS',
    'comarca_new',
]

In [8]:
# Corrections applied to the raw 'COMARCA' values: each key found in the data
# is replaced by the comarca name given as its value.
com_typos = {
    'Al Urgell':          'Alt Urgell',
    'Bages-Moianès':      'Moianès',
    'Moianes-Bages':      'Moianès',
    'Barcelona':          'Barcelonès',
    'Maresme-Barcelonès': 'Maresme',
    'Tarragona':          'Tarragonès',
    'Baix Montseny':      'Vallès Oriental',
    'Baixa Cerdanya':     'Cerdanya',
    'Vall Aran':          "Vall d'Aran",
    'Alt Maresme':        'Maresme',
    'Penedès':            'Alt Penedès',
    'Lluçanès':           'Osona',  # TODO: should Lluçanès count as a comarca of its own?
}

## Pre-process columns

In [9]:
# Replace the 'COMARCA' values listed in `com_typos` by their corrected name;
# values without an entry in the dict are kept as they are.
comarca_fixed = data['COMARCA'].replace(to_replace=com_typos)
data['comarca_new'] = comarca_fixed

In [10]:
# Run the spelling helper only on names that are not already a known comarca.
# NOTE: `x in series` tests the Series *index labels*, not its values, so the
# previous `x not in locations_df['Comarca']` guard was true for every name and
# the helper also ran on exact matches. Membership is now checked against the
# actual values; '' is excluded so blank entries still reach the helper.
known_comarcas = set(locations_df['Comarca']) - {''}
data['comarca_new'] = data['comarca_new'].apply(
    lambda x: x if x in known_comarcas
    else ut.check_comarca_spelling(x, locations_df['Comarca'], stopwords))

Not found: Repartim al Bages, Solsonès, Barcelonès i Berguedà


In [11]:
## Creating the field 'comarca_origin': a clean version of 'COMARCA'
# The comarca of each project's municipi is looked up through a dict rather
# than a left merge. The merge also matched the blank 'Municipi' entries of
# `locations_df` against rows whose 'MUNICIPI' is blank (both tables are loaded
# with fillna('')), which assigns a wrong comarca and can duplicate rows of
# `data` when the key is not unique.
municipi_to_comarca = (locations_df[locations_df['Municipi'] != '']
                       .set_index('Municipi')['Comarca'].to_dict())
comarca_from_municipi = data['MUNICIPI'].map(municipi_to_comarca)

# Keep the spell-checked comarca unless it carries the 'NOTFOUND' marker; in
# that case fall back to the comarca of the municipi (NaN when the municipi is
# not listed in `locations_df`).
not_found = data['comarca_new'].str.contains('NOTFOUND')
data['comarca_origin'] = data['comarca_new'].where(~not_found, comarca_from_municipi)

In [12]:
#data[data.COMARCA=='Repartim al Bages, Solsonès, Barcelonès i Berguedà'].head()

In [13]:
def _clean_text(text):
    """Run ut.pre_process on `text` with the notebook's stop-word list (sw=True)."""
    return ut.pre_process(text, stopwords, sw=True)


# Add a '<column>_prep' version of every free-text column of `data`...
text_columns = cols_to_extract_locs + ['PRODUCTE(S)']
for col in text_columns:
    data[col + '_prep'] = data[col].apply(_clean_text)

# ...and of every location-name column of `locations_df`.
location_columns = ['Municipi', 'Comarca', 'Capital', 'Provincia']
for col in location_columns:
    locations_df[col + '_prep'] = locations_df[col].apply(_clean_text)

## New columns

### Payment

In [14]:
# 'PAGO' collects, as a comma-separated list, the payment methods mentioned in
# the pre-processed free-text columns.
data['PAGO'] = ''
# label written to 'PAGO' -> word searched for in the text
pago = {'efectiu':'efectiu', 
        'bizum':'bizum', 
        'transferencia previa':'transferencia', 
        'targeta':'targeta'}
for c in ['COM COMPRAR_prep', 'OBSERVACIONS_prep']:
    for k,v in pago.items():
        # Rows whose text contains the search word as a whole word.
        mentioned = data[c].str.contains(r'\b'+v+r'\b')
        # Skip rows that already carry this label, so a method mentioned in
        # both columns is listed once instead of twice.
        already = data['PAGO'].str.split(',').apply(lambda labels: k in labels)
        to_add = mentioned & ~already
        data.loc[to_add,'PAGO'] = data.loc[to_add,'PAGO'] +','+ k
# Every label was appended with a leading comma; remove the outer ones.
data['PAGO'] = data['PAGO'].str.strip(',')

### Locations

In [15]:
# Location levels searched for in each free-text field.
location_levels = ['Comarca', 'Capital', 'Provincia', 'Municipi']

for data_field in cols_to_extract_locs:
    for level in location_levels:
        # The return value is not used; later cells read data['capital'],
        # data['municipi'] and data['comarca'], so the helper presumably writes
        # the lower-cased level name as a column of `data` (confirm in utils).
        ut.get_text_locations(data, level.lower(), data_field, locations_df, level, delivery_patt)

In [16]:
# Lookup from municipi name to its comarca, built only from the rows of
# `locations_df` that have a non-empty 'Municipi'.
named_municipis = locations_df[locations_df['Municipi'] != '']
mun_to_com_dict = named_municipis.set_index('Municipi')['Comarca'].to_dict()

In [17]:
# Translate the municipi names found in 'capital' and 'municipi' into comarca names.
# NOTE(review): with regex=True every dict key is used as a regular expression
# and matched as a substring, so a short municipi name can also rewrite part
# of a longer one.
town_cols = ['capital', 'municipi']
data[town_cols] = data[town_cols].replace(mun_to_com_dict, regex=True)

# 'DONDE': the three location columns joined by commas, outer commas removed,
# then split into a list of names.
joined_locations = data['capital'] + ',' + data['municipi'] + ',' + data['comarca']
data['DONDE'] = joined_locations.str.strip(',').str.split(',')

# The intermediate location columns are not needed from here on.
data = data.drop(columns=['capital', 'municipi', 'comarca_new_prep', 'comarca_new', 'comarca'])

In [18]:
def _join_unique_regions(regions):
    """Join region names with commas, dropping blanks and repeated names.

    First-seen order is kept. The previous `set()`-based join produced an
    ordering that can change between runs, and kept the empty tokens that come
    from 'A,,B'-style strings, which inflate any comma-based count of regions.
    """
    return ','.join(dict.fromkeys(name for name in regions if name))


data['DONDE'] = data['DONDE'].apply(_join_unique_regions)
# regex=True is passed explicitly: the pattern relies on \b word boundaries and
# recent pandas versions treat the pattern as a literal string by default.
data['DONDE'] = data['DONDE'].str.replace(r'\bCatalunya\b', 'Tota Catalunya', regex=True)

### Number of delivery regions

In [19]:
def _count_delivery_regions(donde):
    """Return the number of comma-separated entries in `donde`, or 42 if it mentions 'Catalunya'."""
    if 'Catalunya' in donde:
        # 42 is presumably the total number of comarques -- TODO confirm.
        return 42
    return donde.count(',') + 1


# Number of comarcas each project delivers to.
data['n_comarcas_delivery'] = data['DONDE'].apply(_count_delivery_regions)


### Binary variables

In [20]:
def _whole_word_regex(keywords):
    """Return a regex matching any of `keywords` delimited by word boundaries."""
    return r'\b' + r'\b|\b'.join(keywords) + r'\b'


def _flag_keywords(catalog, text_col):
    """Add one 0/1 column to `data` per key of `catalog`.

    A row gets 1 when its `text_col` value matches any keyword listed under
    that key, and 0 otherwise.
    """
    for key, val in catalog.items():
        data[key] = 0
        data.loc[data[text_col].str.contains(_whole_word_regex(val)), key] = 1


# Payment methods, looked up in the 'PAGO' labels.
_flag_keywords(payment_cat, 'PAGO')
# Product types, looked up in the pre-processed product description.
_flag_keywords(product_cat, 'PRODUCTE(S)_prep')
# Purchase / contact channels, looked up in the pre-processed 'COM COMPRAR' text.
_flag_keywords(com_comprar, 'COM COMPRAR_prep')

# Websites and social networks are also searched for in 'OBSERVACIONS'; only
# rows not flagged yet are updated.
for channel in ('web', 'socialnet'):
    seen_in_obs = data['OBSERVACIONS_prep'].str.contains(_whole_word_regex(com_comprar[channel]))
    data.loc[(data[channel] != 1) & seen_in_obs, channel] = 1

### Number of product types

In [21]:
# Number of product types sold, counted from the 0/1 product columns.
main_prod_cols = ['meat', 'fruit', 'vegetables']
other_prod_cols = [
    'flowers', 'charcuterie', 'legumes', 'mushrooms', 'rice',
    'flour_cereals', 'oil_olives_vinager', 'eggs', 'dairies',
    'herbs_spices', 'hygiene_medicines', 'alcohol',
    'fruit_veggies_products', 'drinks', 'bread_pastries',
    'pasta', 'others',
]

data['n_main_prod'] = data[main_prod_cols].sum(axis=1)
data['n_other_prod'] = data[other_prod_cols].sum(axis=1)
# Total = main + other product types.
data['n_tot_prod'] = data[['n_main_prod', 'n_other_prod']].sum(axis=1)

### Number of payment methods

In [22]:
# Number of payment methods, counted from the 0/1 payment columns.
paym_cols = ['paym_bizum', 'paym_cash', 'paym_card', 'paym_transf']
data['n_paym_methods'] = data[paym_cols].sum(axis=1)

## Save

In [23]:
# Write the enriched table without the intermediate pre-processed text columns.
# Note that the result is written to the configured *input* folder.
helper_text_cols = ['PRODUCTE(S)_prep', 'OBSERVACIONS_prep', 'COM COMPRAR_prep']
output_file = Path(config['input_path']) / 'abastiment_new.csv'
data.drop(columns=helper_text_cols).to_csv(output_file, index=False)

In [24]:
# Print the final list of columns, then preview the first rows of the result.
print(data.columns)
data.head()

Index(['PROJECTE', 'URGENT', 'PRODUCTE(S)', 'MUNICIPI', 'COM COMPRAR',
       'OBSERVACIONS', 'CCPAE', 'COMARCA', 'comarca_origin',
       'COM COMPRAR_prep', 'OBSERVACIONS_prep', 'PRODUCTE(S)_prep', 'PAGO',
       'provincia', 'DONDE', 'n_comarcas_delivery', 'paym_card', 'paym_cash',
       'paym_bizum', 'paym_transf', 'fruit', 'vegetables', 'legumes', 'meat',
       'charcuterie', 'mushrooms', 'rice', 'flour_cereals',
       'oil_olives_vinager', 'eggs', 'dairies', 'herbs_spices',
       'hygiene_medicines', 'alcohol', 'fruit_veggies_products', 'drinks',
       'flowers', 'bread_pastries', 'pasta', 'others', 'iseco', 'delivery',
       'mail', 'web', 'pickup', 'shop', 'market', 'phone', 'orders',
       'whatsapp', 'socialnet', 'n_main_prod', 'n_other_prod', 'n_tot_prod',
       'n_paym_methods'],
      dtype='object')


Unnamed: 0,PROJECTE,URGENT,PRODUCTE(S),MUNICIPI,COM COMPRAR,OBSERVACIONS,CCPAE,COMARCA,comarca_origin,COM COMPRAR_prep,...,shop,market,phone,orders,whatsapp,socialnet,n_main_prod,n_other_prod,n_tot_prod,n_paym_methods
0,3 Fermentats I Un Destí,Sí,1 Pa de Massa Mare Natural de 2kg 2 Vins Natur...,Torelló,Ho repartim a domicili per tot Osona resta de ...,Ens fa il.lusió poder colaborar per oferir-vos...,Sí,Osona,Osona,repartim domicili osona resta catalunya consul...,...,0,0,0,0,0,0,0,3,3,0
1,30 Cabres,,Formatges de llet crua de cabres que pasturen ...,Bellver de Cerdanya,"Contacta per telefon, mail o xarxes socials (@...",Tots els nostres formatges estan elaborats amb...,,Cerdanya,Cerdanya,contacta telefon mail xarxes socials cabres mi...,...,0,0,1,0,0,1,0,1,1,0
2,7 de Ribera Hostal-Agrobotiga,Sí,Verdures de producció propia,Móra d'Ebre,A l'agrobotiga dijous i divendres de 10:00 a 1...,www.biosferacomestible.cat,,Ribera d’Ebre,Ribera d'Ebre,agrobotiga dijous divendres dijous tarde repar...,...,1,0,0,0,0,0,1,0,1,0
3,A Granel,,"Cosmètica, productes d’higiène i neteja ecològics",Barcelona,Venda directa i en punts de recollida per tota...,,Sí,Barcelona,Barcelonès,venda directa punts recollida tota catalunya d...,...,0,0,0,0,0,0,0,1,1,0
4,AgroMontserratí - Cooperativa del Montserratí,,"Fruita, verdura, transformats, elaborats, pins...",Olesa de Montserrat,A través del Whatsap i Telèfon - XXXXXXXXX A l...,www.agromontserrati.cat,,Baix Llobregat,Baix Llobregat,traves whatsap telefon xxxxxxxxx agrobotiga c ...,...,1,0,1,0,1,0,2,2,4,0
