## Imports

In [1]:
from pathlib import Path # reads paths in the current OS
import unicodedata
import pandas as pd
import numpy as np
import json
import re
import utils as ut
from stop_words import get_stop_words
from unidecode import unidecode

## Reading

In [2]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so its handle is closed as soon
    as parsing finishes, instead of staying open until garbage collection.
    """
    with open(path) as handle:
        return json.load(handle)


# Project configuration; only the 'input_path' key is read in this notebook.
config       = _load_json('paths.cfg')
input_dir    = Path(config['input_path'])

# Source tables. Missing values become '' so every cell can be handled as text.
data         = pd.read_csv(input_dir / "abastiment.csv", sep=",").fillna('')
locations_df = pd.read_csv(input_dir / 'municipis_merge.csv').fillna('')

# Keyword dictionaries (column name -> list of patterns) used further down to
# build the 0/1 columns from 'COM COMPRAR' and 'PRODUCTE(S)' respectively.
com_comprar  = _load_json('com_comprar_kw.cfg')
product_cat  = _load_json('product_list.cfg')

In [3]:
# Preview the first rows of `data` (read from abastiment.csv) as a sanity check.
data.head()

Unnamed: 0,PROJECTE,URGENT,PRODUCTE(S),MUNICIPI,COM COMPRAR,OBSERVACIONS,CCPAE,COMARCA
0,3 Fermentats I Un Destí,Sí,1 Pa de Massa Mare Natural de 2kg 2 Vins Natur...,Torelló,Ho repartim a domicili per tot Osona resta de ...,Ens fa il.lusió poder colaborar per oferir-vos...,Sí,Osona
1,30 Cabres,,Formatges de llet crua de cabres que pasturen ...,Bellver de Cerdanya,"Contacta per telefon, mail o xarxes socials (@...",Tots els nostres formatges estan elaborats amb...,,Cerdanya
2,7 de Ribera Hostal-Agrobotiga,Sí,Verdures de producció propia,Móra d'Ebre,A l'agrobotiga dijous i divendres de 10:00 a 1...,www.biosferacomestible.cat,,Ribera d’Ebre
3,A Granel,,"Cosmètica, productes d’higiène i neteja ecològics",Barcelona,Venda directa i en punts de recollida per tota...,,Sí,Barcelona
4,AgroMontserratí - Cooperativa del Montserratí,,"Fruita, verdura, transformats, elaborats, pins...",Olesa de Montserrat,A través del Whatsap i Telèfon - XXXXXXXXX A l...,www.agromontserrati.cat,,Baix Llobregat


In [4]:
# Preview the first rows of `locations_df` (read from municipis_merge.csv).
locations_df.head()

Unnamed: 0,Municipi,Comarca,Capital,Provincia
0,Abella de la Conca,Pallars Jussà,Tremp,Lleida
1,Castell de Mur,Pallars Jussà,Tremp,Lleida
2,Conca de Dalt,Pallars Jussà,Tremp,Lleida
3,Gavet de la Conca,Pallars Jussà,Tremp,Lleida
4,Isona i Conca Dellà,Pallars Jussà,Tremp,Lleida


In [5]:
# Extra words to ignore on top of the library's Catalan stop-word list.
newStopWords = ['que','des', 'al', 'del', 'ho', 'd', 'l','per','tambe', 'fins',
               'a', 'cap', 'hi', 'ni', 'no']

# Build a NEW list instead of extending the returned one in place: the object
# handed back by get_stop_words may be shared (the library caches its lists),
# so an in-place extend would append the extra words again on every re-run of
# this cell and alter what later get_stop_words('catalan') calls return.
stopwords = list(get_stop_words('catalan')) + newStopWords

In [6]:
# Word stems passed as the last argument of every ut.get_text_locations call below.
# NOTE(review): presumably they mark delivery / pick-up wording in the free text
# (repartir, domicili, enviar, recollir, distribuir) — confirm against utils.
delivery_patt = ['repart', 'domicil', 'envi', 'recoll', 'dist']

## Pre-process columns

In [7]:
# Free-text columns of the main table: clean them with ut.pre_process, passing
# the stop-word list and sw=True.
free_text_cols = ('COM COMPRAR', 'PRODUCTE(S)', 'OBSERVACIONS')
for text_col in free_text_cols:
    cleaned = data[text_col].apply(lambda text: ut.pre_process(text, stopwords, sw=True))
    data[text_col] = cleaned

# The municipality column goes through ut.pre_process with its default arguments
# (no stop-word list is passed here).
data['MUNICIPI'] = data['MUNICIPI'].apply(lambda name: ut.pre_process(name))

# Every administrative level of the locations table gets the same treatment as
# the free-text columns, so names in both tables are cleaned identically.
admin_levels = ('Municipi', 'Comarca', 'Capital', 'Provincia')
for level in admin_levels:
    cleaned = locations_df[level].apply(lambda name: ut.pre_process(name, stopwords, sw=True))
    locations_df[level] = cleaned

In [8]:
# Accent-free COMARCA values that need correcting -> the name to use instead.
com_typos = {
    'Al Urgell': 'Alt Urgell',
    'Bages-Moianes': 'Moianes',
    'Moianes-Bages': 'Moianes',
    'Barcelona': 'Barcelones',
    'Maresme-Barcelones': 'Maresme',
    'Tarragona': 'Tarragones',
    'Baix Montseny': 'Valles Oriental',
    'Baixa Cerdanya': 'Cerdanya',
}

# Step 1: decompose accented characters (NFD) so each accent becomes a separate
# combining code point.
comarca_decomposed = data['COMARCA'].apply(lambda name: unicodedata.normalize('NFD', name))
# Step 2: remove those combining marks (U+0300..U+036F), leaving the bare letters.
comarca_plain = comarca_decomposed.apply(lambda name: re.sub("[\u0300-\u036f]", "", name))
# Step 3: whole-value replacement of the known variants listed above.
data['comarca_new'] = comarca_plain.replace(com_typos)

## New columns

In [9]:
# Pool the distinct names of every administrative level (municipality, comarca,
# capital, province) into one list, in that order, skipping empty names.
locations = [
    place
    for level in ('Municipi', 'Comarca', 'Capital', 'Provincia')
    for place in locations_df[level].unique().tolist()
    if place != ''
]

### Locations

In [10]:
# Look for place names in the free-text 'COM COMPRAR' field. The helper's return
# value is not used, so it presumably writes the named column into `data` itself
# (confirm in utils).
# First against the pooled list of all known place names...
ut.get_text_locations(data, 'locations', 'COM COMPRAR', locations, delivery_patt)

# ...then once per administrative level, each into its own column.
# NOTE(review): unlike `locations`, these per-level lists are not filtered for
# '' — check whether an empty candidate can occur and how the helper treats it.
for target_col, level in (('comarca', 'Comarca'),
                          ('capital', 'Capital'),
                          ('provincia', 'Provincia')):
    level_names = locations_df[level].unique().tolist()
    ut.get_text_locations(data, target_col, 'COM COMPRAR', level_names, delivery_patt)

### Binary variables

In [11]:
# One 0/1 column per product category: 1 when 'PRODUCTE(S)' matches any of the
# category's keywords (the keywords are OR-ed into a single pattern).
for category, keywords in product_cat.items():
    matches = data['PRODUCTE(S)'].str.contains('|'.join(keywords))
    data[category] = 0
    data.loc[matches, category] = 1

# Same idea for the keyword groups of com_comprar (payment methods, contact
# info, ...), searched in 'COM COMPRAR'.
for flag, keywords in com_comprar.items():
    matches = data['COM COMPRAR'].str.contains('|'.join(keywords))
    data[flag] = 0
    data.loc[matches, flag] = 1

# Websites and social networks are often mentioned in the observations instead,
# so rows still at 0 get a second chance from the 'OBSERVACIONS' column.
for flag in ('web', 'socialnet'):
    in_observations = data['OBSERVACIONS'].str.contains('|'.join(com_comprar[flag]))
    not_flagged_yet = data[flag] != 1
    data.loc[not_flagged_yet & in_observations, flag] = 1

### Number of product types

In [12]:
# Count how many product categories each row is flagged with, by adding up the
# corresponding 0/1 columns.
main_categories = ['meat', 'fruit', 'vegetables']
other_categories = [
    'flowers', 'legumes', 'mushrooms', 'rice',
    'flour_cereals', 'oil_olives_vinager', 'eggs', 'dairies',
    'herbs_spices', 'hygiene_medicines', 'alcohol',
    'fruit_veggies_products', 'drinks', 'bread_pastries',
    'pasta', 'others',
]

# Builtin sum() chains the same element-wise `+` over the listed columns.
data['n_prod_princ'] = sum(data[name] for name in main_categories)
data['n_prod_others'] = sum(data[name] for name in other_categories)
# Total = main categories + all the others.
data['n_prod_tot'] = data['n_prod_princ'] + data['n_prod_others']

## Save

In [13]:
# Write the enriched table next to the input files, without the index column.
output_file = Path(config['input_path']) / 'abastiment_new.csv'
data.to_csv(output_file, index=False)