In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers
from tensorflow.keras.callbacks import EarlyStopping

import nltk
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from collections import Counter

from gensim.models import Word2Vec
import gensim.downloader as api

import numpy as np

import json
import os
from os.path import join, exists, dirname

# Fetch the perceptron tagger data (presumably what `pos_tag` needs); the call
# returns True and is a no-op when the package is already up to date.
# NOTE(review): `word_tokenize`, `stopwords.words('english')` and
# `pos_tag(..., tagset='universal')` are used further down and presumably need
# their own NLTK resources, which are not downloaded here - confirm on a fresh machine.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/george/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Show the names of the pretrained models offered by the gensim downloader.
print(list(api.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [6]:
def get_words(folder=None, tokenizer=None):
    """Collect the tokens of every known text field in the JSON index files.

    Each ``<index>.json`` file in ``folder`` is expected to hold a list of
    records (dicts). For the indices listed in ``text_keys`` the configured
    fields of every record are tokenised and appended to one flat list.

    Files without an entry in ``text_keys`` (for example ``synonyms.json`` or
    any other helper file stored in the same folder) are skipped instead of
    raising ``KeyError``; so are non-JSON files and records lacking a field.

    Args:
        folder: Directory containing the ``<index>.json`` files. Defaults to
            ``data.json`` inside the current working directory.
        tokenizer: Callable mapping a string to an iterable of tokens.
            Defaults to ``nltk.word_tokenize``.

    Returns:
        A list of tokens, or ``None`` (after printing a message) when the
        folder is missing or empty.
    """
    text_keys = {
        'eurlex': ['title'],
        'consultations': ['title', 'topics'],
        'twitter_query': ['text_en', 'user_desc_en'],
        'twitter_politicians': ['text_en', 'user_desc_en'],
        'twitter_press': ['text_en', 'user_desc_en']
    }

    if folder is None:
        folder = join(os.path.abspath(''), 'data.json')
    if tokenizer is None:
        tokenizer = word_tokenize

    # isdir (rather than exists) so that a regular file with this name is
    # reported the same way instead of crashing in listdir.
    if not os.path.isdir(folder):
        print('Folder not found.')
        return

    # Sorted so the token order does not depend on the file system.
    files = sorted(os.listdir(folder))
    if len(files) == 0:  # if no files
        print('No files found.')
        return

    words = []

    for filename in files:
        index, extension = os.path.splitext(filename)
        # Only parse files we know how to read; this also skips synonyms.json
        # without loading it first.
        if extension != '.json' or index not in text_keys:
            continue

        with open(join(folder, filename), 'r', encoding='utf-8') as file:
            records = json.load(file)

        for record in records:
            for key in text_keys[index]:
                text = record.get(key)  # tolerate records lacking the field
                if text is not None:
                    words.extend(tokenizer(text))
    return words


In [7]:
def get_synonyms(words, folder=None):
    """Build a ``{noun: [similar words]}`` table and save it as JSON.

    Steps:
      1. Drop English stop words and keep only tokens tagged ``NOUN``.
      2. Keep the nouns whose frequency in ``words`` is strictly above the
         count found at the 20 % position of the sorted counts.
      3. Look up the nearest neighbours of each remaining noun in the
         pretrained ``word2vec-google-news-300`` vectors.
      4. Write the table to ``synonyms.json`` inside ``folder``.

    Args:
        words: Iterable of tokens (duplicates carry the frequency signal).
        folder: Directory that receives ``synonyms.json``. Defaults to
            ``data.json`` inside the current working directory.

    Returns:
        dict mapping each retained noun to the list of similar words
        returned by the embedding model. Nouns the model does not know are
        left out.
    """
    low_frequency_quantile = 0.2

    if folder is None:
        folder = join(os.path.abspath(''), 'data.json')

    words = list(words)
    unique_words = set(words) - set(stopwords.words('english'))
    # pos_tag expects a sequence of tokens; sorting makes the run repeatable.
    tagged = pos_tag(sorted(unique_words), tagset='universal')
    words_to_keep = {word for word, tag in tagged if tag == 'NOUN'}

    # .items() is required here: iterating a Counter yields keys only.
    word_counts = {
        word: count
        for word, count in Counter(words).items()
        if word in words_to_keep
    }

    frequent_words = []
    if word_counts:  # guard: indexing an empty list would raise IndexError
        ordered_counts = sorted(word_counts.values())
        min_count = ordered_counts[int(len(ordered_counts) * low_frequency_quantile)]
        frequent_words = sorted(
            word for word, count in word_counts.items() if count > min_count
        )

    synonyms = {}
    if frequent_words:
        # Large download/load, so only do it when there is something to look up.
        word2vec_transfer = api.load('word2vec-google-news-300')
        for word in frequent_words:
            # similar_by_word raises KeyError for out-of-vocabulary words.
            if word not in word2vec_transfer:
                continue
            neighbours = word2vec_transfer.similar_by_word(word)
            synonyms[word] = [str(neighbour) for neighbour, _score in neighbours]

    os.makedirs(folder, exist_ok=True)
    with open(join(folder, 'synonyms.json'), 'w', encoding='utf-8') as file:
        json.dump(synonyms, file)

    return synonyms

In [9]:
# Tokenise the text fields of the index files in the data folder.
# `get_words` prints a message and returns None when the folder is missing or empty.
words = get_words()
words

['Organic',
 'products',
 '-',
 'Annual',
 'reporting',
 'on',
 'controls',
 'and',
 'labelling',
 'Agriculture',
 'and',
 'rural',
 'development',
 'Conversion',
 'to',
 'a',
 'Farm',
 'Sustainability',
 'Data',
 'Network',
 '(',
 'FSDN',
 ')',
 'Agriculture',
 'and',
 'rural',
 'development',
 'Organic',
 'food',
 ':',
 'certificate',
 'for',
 'operators',
 'located',
 'in',
 'third',
 'countries',
 'and',
 'list',
 'of',
 'control',
 'authorities',
 'Agriculture',
 'and',
 'rural',
 'development',
 'Information',
 'and',
 'promotion',
 'measures',
 'for',
 'agricultural',
 'and',
 'food',
 'products',
 'in',
 'the',
 'internal',
 'market',
 'and',
 'in',
 'non-EU',
 'countries',
 'Agriculture',
 'and',
 'rural',
 'development',
 'EU',
 'accession',
 'to',
 'the',
 'Geneva',
 'Act',
 'of',
 'Lisbon',
 'agreement',
 'on',
 'Appellations',
 'of',
 'Origin',
 'and',
 'Geographical',
 'Indications',
 'Agriculture',
 'and',
 'rural',
 'development',
 'Revision',
 'of',
 'EU',
 'marketing'

In [None]:
# Build the synonym table from the collected tokens; `get_synonyms` also
# writes the table to `synonyms.json` before returning it.
synonyms = get_synonyms(words)
synonyms

# Consultations: where is the `status` field?

In [6]:
# Peek at the first consultation record to see which fields it carries.
with open(join('data.json', 'consultations.json')) as file:
    json_ = json.loads(file.read())
json_[0]

{'id': 12513,
 'start_timestamp': 1620828496.0,
 'end_timestamp': 1623275999.0,
 'title': 'Organic products - Annual reporting on controls and labelling',
 'topics': 'Agriculture and rural development',
 'type_of_act': 'REG_IMPL',
 'start_date': '2021/05/12',
 'end_date': '2021/06/09',
 'status': 'OPEN',
 'link': 'https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12513-Organic-products---Annual-reporting-on-controls-and-labelling_en'}

In [28]:
import requests

# Query the `twitter_politicians` index of the search server for a sample term.
url = 'http://35.223.18.2/indexes/twitter_politicians/search'

# The API key is a credential and must not be stored in the notebook: read it
# from the environment instead. NOTE(review): the key that used to be
# hardcoded here has been committed and should be rotated.
key = os.environ.get('MEILI_API_KEY')
if not key:
    raise RuntimeError('Set the MEILI_API_KEY environment variable before running this cell.')

params = {'q': 'agriculture'}
headers = {'X-Meili-API-Key': key}

# A timeout keeps the kernel from hanging forever if the server is unreachable;
# raise_for_status surfaces HTTP errors instead of trying to parse an error page.
response = requests.get(url, params=params, headers=headers, timeout=10)
response.raise_for_status()
r = response.json()
r

{'hits': [{'id': 1400110452332449792,
   'timestamp': 1622640119.0,
   'user': 'yjadot',
   'text': 'L’agriculture biologique doit être au cœur de la politique agricole et l’argent de la PAC doit servir à la transition des fermes vers la bio.\nJe suis intervenu cet après-midi au rassemblement de la @fnab_bio  #bio #pac  ',
   'text_en': 'Organic farming must be at the heart of agricultural policy and the money from the CAP must be used for the transition of farms to organic. I spoke this afternoon at the gathering of the @fnab_bio #bio #pac',
   'date': '2021/06/02 15:21:59',
   'lang': 'fr',
   'iso_lang': None,
   'user_verified': True,
   'followers_count': 69395,
   'user_loc': 'Strasbourg, Bruxelles.',
   'user_desc': 'Eurodéputé écologiste 🌍 - @euroecolos - Photos et story sur https://t.co/ljE0qrtnjV',
   'user_desc_en': 'Ecologist MEP 🌍 - @euroecolos - Photos and story on https://t.co/ljE0qrtnjV',
   'user_image': 'http://pbs.twimg.com/profile_images/1096356624254582784/4x

In [22]:
# Load `conc.json` and report how many top-level entries it contains.
with open(join('data.json', 'conc.json')) as file:
    json_ = json.loads(file.read())
len(json_)

27369