In [1]:
import requests
import pandas as pd
import os
import numpy as np

from matplotlib import pyplot as plt
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

import seaborn as sns
# Configure seaborn in one call. `sns.set()` resets the plotting context to its
# default, so running it after `sns.set_context('poster')` would discard the
# poster context; passing context, style and rc together keeps all three.
sns.set(context='poster', style='whitegrid', rc={'figure.figsize': (16., 9.)})

import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime

In [2]:
# URL of the data portal's API documentation page (kept as a reference;
# nothing in the visible cells reads this name).
API = "https://agridata.ec.europa.eu/extensions/DataPortal/API_Documentation.html"

In [3]:
# Product groups to query; each entry is interpolated into the request URL
# as a path segment by the download loop.
foods = [
    "beef",
    "pigmeat",
    "sheepAndGoat",
    "rawMilk",
    "dairy",
    "fruitAndVegetable",
    "cereal",
    "rice",
    "oilseeds",
    "sugar",
    "oliveOil",
    "wine",
]

In [4]:
# Download the Spanish price series of every product group in `foods` and
# flatten the JSON records into parallel column lists.
REQUEST_TIMEOUT = 60  # seconds; prevents the cell from hanging on an unresponsive server

date = []
price = []
prod = []
var = []
veg = []
# Every raw record from every endpoint. Accumulated (rather than rebound per
# product group) so that a later dump of `resp` covers all groups, not just the last one.
resp = []
for food in foods:
    url = f"https://www.ec.europa.eu/agrifood/api/{food}/prices?memberStateCodes=ES&beginDate=01/01/2001"
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of trying to parse an error body as data.
    response.raise_for_status()
    records = response.json()
    resp.extend(records)

    for record in records:
        # Rice doesn't have beginDate but has ym which means year month
        date.append(record['beginDate'] if 'beginDate' in record else record['ym'])
        # Drop the first character of the price string and switch a decimal comma
        # to a decimal point (presumably a leading currency symbol — TODO confirm).
        price.append(float(record['price'][1:].replace(",", ".")))
        # Not every endpoint reports these two fields; "unknown" marks their absence.
        var.append(record.get('productName', "unknown"))
        veg.append(record.get('product', 'unknown'))
        prod.append(food)

# Assemble the column dict once, after all product groups have been read.
data = {
    'product': prod,
    'date': date,
    'price': price,
    'variety': var,
    'veg': veg,
}

In [5]:
# Write the raw API payload held in `resp` to a timestamped CSV.
# NOTE(review): the dump contains exactly what `resp` holds when this cell runs;
# if the download loop rebinds `resp` on every iteration, that is only the
# last product group fetched.
raw_dir = "../mydata/alldata/api_raw"
os.makedirs(raw_dir, exist_ok=True)  # creates the folder only when it is missing
resp = pd.DataFrame(resp)
# "YYYY-MM-DD HH_MM_SS": second precision, with underscores instead of colons.
now = datetime.now().strftime("%Y-%m-%d %H_%M_%S")
resp.to_csv(f"{raw_dir}/api_raw_{now}.csv")

In [6]:
# Turn the column dict into a DataFrame indexed by, and sorted on, the date.
data = pd.DataFrame(data)
# NOTE(review): day-first parsing assumes the API returns dd/mm/yyyy strings, the
# same layout the request uses for beginDate=01/01/2001 — confirm against a payload
# sample. Without it, ambiguous dates such as 12/07/2021 are read month-first.
data['date'] = pd.to_datetime(data['date'], dayfirst=True)
# Keep `date` as a regular column as well as the index; sort chronologically.
data = data.set_index('date', drop=False).sort_index()

In [7]:
# Pick the most specific label available for each row — variety first, then
# veg, then the product group — and normalise it to lower case with
# underscores instead of spaces.
data['product'] = [
    (
        variety_name if variety_name != 'unknown'
        else veg_name if veg_name != 'unknown'
        else group_name
    ).replace(" ", "_").lower()
    for variety_name, veg_name, group_name in zip(
        data['variety'], data['veg'], data['product']
    )
]

In [8]:
# Treat a price of exactly zero as missing, then drop every row that has a
# missing value in any column.
data.loc[data['price'] == 0, 'price'] = np.nan
data.dropna(how='any', inplace=True)

In [9]:
# Lookup table from the normalised English product label (lower case,
# underscores) to a simplified Spanish keyword used for later searches.
names = {
    'abricots': 'albaricoque',
    'apples': 'manzana',
    'asparagus': 'espárrago',
    'beans': 'judías',
    'beef': 'ternera',
    'butter': 'mantequilla',
    'cabbages': 'col',
    'carrots': 'zanahoria',
    'cauliflowers': 'coliflor',
    'cherries': 'cereza',
    'clementines': 'clementina',
    'courgettes': 'calabacín',
    'crude_olive-pomace_oil_(from_5_to_10%)': 'aceite de orujo de oliva crudo de 5 a 10%',
    # NOTE(review): the key is an oil but the label reads "soy sauce" — confirm intended keyword.
    'crude_soya_bean_oil': 'salsa de soja',
    'crude_sunflower_oil': 'aceite de girasol',
    'cucumbers': 'pepino',
    'durum_wheat': 'harina de trigo candeal',
    'edam': 'queso edam',
    'egg_plants,_aubergines': 'berenjena',
    'emmental': 'queso emmental',
    'extra_virgin_olive_oil_(up_to_0.8%)': 'aceite de oliva virgen extra (hasta 0.8%)',
    'feed_barley': 'cebada',
    'garlic': 'ajo',
    'lampante_olive_oil_(2%)': 'aceite de oliva virgen lampante',
    'leeks': 'puerro',
    'lemons': 'limón',
    'lettuces': 'lechuga',
    'maize': 'maíz',
    'malting_barley': 'malta',
    'mandarins': 'mandarina común',
    'melons': 'melón',
    'milling_wheat': 'harina de trigo',
    'mushrooms,_cultivated': 'champiñón',
    'nectarines': 'nectarina',
    'olive-pomace_oil_(up_to_1%)': 'aceite de orujo de oliva (hasta 1%)',
    'onions': 'cebolla',
    'oranges': 'naranja',
    'organic_raw_milk': 'leche entera orgánica',
    'peaches': 'melocotón',
    'pears': 'pera',
    'peppers': 'pimiento',
    'pigmeat': 'cerdo',
    'plums': 'ciruela',
    'rapeseed': 'colza',
    'raw_milk': 'leche entera',
    # NOTE(review): the next two keys say 0.3% while their labels say 3% — confirm which is meant.
    'refined_olive-pomace_oil_(up_to_0.3%)': 'aceite de orujo de oliva refinado (hasta 3%)',
    'refined_olive_oil_(up_to_0.3%)': 'aceite de oliva refinado (hasta 3%)',
    'rice': 'arroz',
    'satsumas': 'mandarina satsuma',
    'sheepandgoat': 'cabra y oveja',
    # NOTE(review): presumably "smp" is skimmed milk powder; the label reads "semi-skimmed" — confirm.
    'smp': 'leche semidesnatada en polvo',
    'soya_meal': 'harina de soja',
    'strawberries': 'fresa',
    'sugar': 'azúcar',
    'sunflower_seed': 'pipas de girasol',
    'sunflower_seed_meal': 'harina de pipas de girasol',
    'table_grapes': 'uva',
    'tomatoes': 'tomate',
    'virgin_olive_oil_(up_to_2%)': 'aceite de oliva virgen (hasta 2%)',
    'water_melons': 'sandía',
    'wheypowder': 'suero de leche',
    'wine': 'vino',
    'wmp': 'leche entera en polvo',
}

In [10]:
# Replace each normalised product label with its Spanish keyword from `names`.
# Labels that are not keys of `names` come back from `Series.map` as NaN.
data['product'] = data['product'].map(names)

In [11]:
# Display the frame after translation (pandas truncates the repr for long frames).
data

Unnamed: 0_level_0,product,date,price,variety,veg
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-01-01,arroz,2001-01-01,300.44,unknown,unknown
2001-01-01,ternera,2001-01-01,122.90,unknown,unknown
2001-01-01,cerdo,2001-01-01,156.08,unknown,unknown
2001-01-01,cerdo,2001-01-01,156.08,unknown,unknown
2001-01-01,ternera,2001-01-01,277.25,unknown,unknown
...,...,...,...,...,...
2021-12-07,aceite de oliva virgen (hasta 2%),2021-12-07,317.50,unknown,Virgin olive oil (up to 2%)
2021-12-07,aceite de oliva refinado (hasta 3%),2021-12-07,296.50,unknown,Refined olive oil (up to 0.3%)
2021-12-07,aceite de oliva virgen lampante,2021-12-07,293.50,unknown,Lampante olive oil (2%)
2021-12-07,cerdo,2021-12-07,173.48,unknown,unknown


In [12]:
# Remove the helper columns, leaving `product` and `price`; the date is still
# available as the index.
data.drop(columns=['date', 'variety', 'veg'], inplace=True)

In [13]:
# Blank out zero prices and drop every row with a missing value.
# NOTE(review): this repeats the earlier zero-price clean-up; at this point the
# `dropna` also removes rows whose product label had no entry in `names`
# (the translation step leaves those as NaN).
data.loc[data['price'] == 0, 'price'] = np.nan
data.dropna(how='any', inplace=True)

In [14]:
# Distinct product labels in sorted order. This rebinds `foods`, which earlier
# held the API product groups.
foods = sorted(set(data['product']))

In [15]:
# Print the number of rows available for each product, in sorted label order.
for label in sorted(set(data['product'])):
    row_count = int((data['product'] == label).sum())
    print(f"{label} has {row_count} rows")

aceite de girasol has 93 rows
aceite de oliva refinado (hasta 3%) has 2267 rows
aceite de oliva virgen (hasta 2%) has 4337 rows
aceite de oliva virgen extra (hasta 0.8%) has 3754 rows
aceite de oliva virgen lampante has 3748 rows
aceite de orujo de oliva (hasta 1%) has 6 rows
aceite de orujo de oliva crudo de 5 a 10% has 1966 rows
aceite de orujo de oliva refinado (hasta 3%) has 1958 rows
ajo has 1652 rows
albaricoque has 202 rows
arroz has 3269 rows
azúcar has 180 rows
berenjena has 765 rows
cabra y oveja has 724 rows
calabacín has 769 rows
cebada has 435 rows
cebolla has 545 rows
cerdo has 4042 rows
cereza has 176 rows
champiñón has 841 rows
ciruela has 222 rows
clementina has 376 rows
col has 41 rows
coliflor has 742 rows
colza has 47 rows
espárrago has 111 rows
fresa has 294 rows
harina de pipas de girasol has 46 rows
harina de soja has 43 rows
harina de trigo has 440 rows
harina de trigo candeal has 355 rows
judías has 720 rows
leche entera has 203 rows
leche entera en polvo has 1

In [16]:
# Dropping data with less than 50 records.
# The threshold is evaluated against the current row counts, so the rule keeps
# holding when the downloaded data changes.
MIN_RECORDS = 50

# Products this cell has always excluded by name; kept so that these exclusions
# do not depend on the current row counts.
ALWAYS_DROPPED = {
    'leche entera en polvo',
    'leche entera orgánica',
    'aceite de orujo de oliva (hasta 1%)',
    'col',
    'colza',
    'harina de soja',
    'salsa de soja',
}

# Number of rows of each row's own product, aligned with `data`.
rows_per_product = data['product'].map(data['product'].value_counts())
excluded = data['product'].isin(ALWAYS_DROPPED) | (rows_per_product < MIN_RECORDS)

# Blank the label of excluded rows and let `dropna` remove them.
data.loc[excluded, 'product'] = np.nan
data.dropna(how='any', inplace=True)

In [19]:
# Show the last five rows of the cleaned frame.
data.tail()

Unnamed: 0_level_0,product,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-07,aceite de oliva virgen (hasta 2%),317.5
2021-12-07,aceite de oliva refinado (hasta 3%),296.5
2021-12-07,aceite de oliva virgen lampante,293.5
2021-12-07,cerdo,173.48
2021-12-07,pipas de girasol,533.7


In [18]:
# Save the cleaned frame twice: a timestamped snapshot, and a fixed-name
# `data.csv` that is overwritten on every run.
clean_dir = "../mydata/alldata/cleandata"
os.makedirs(clean_dir, exist_ok=True)  # creates the folder only when it is missing
# "YYYY-MM-DD HH_MM_SS": second precision, with underscores instead of colons.
now = datetime.now().strftime("%Y-%m-%d %H_%M_%S")
data.to_csv(f"{clean_dir}/data_{now}.csv")
data.to_csv(f"{clean_dir}/data.csv")