In [3]:
import spacy 
import pyArango
import os
from os import path
import time
import glob
import pandas as pd

In [4]:
def get_text(path):
    """Read a UTF-8 text file and return its content on a single line.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the text file to read.

    Returns
    -------
    str
        The file content with every newline character replaced by a space.
    """
    # The ``with`` block closes the file on exit, so no explicit close() is
    # needed (the former one sat after ``return`` and could never run).
    with open(path, encoding='utf8') as f:
        return f.read().replace('\n', ' ')

In [5]:
# Current working directory of the kernel; shown as the cell output.
dir_path = os.getcwd()
dir_path

'/home/paul/projects/text explorer'

In [6]:
# Absolute path of 'extractedText.txt', located one level above dir_path.
textpath = os.path.abspath(
    os.path.join(dir_path, '..', 'extractedText.txt')
)

In [7]:
# Collect every .txt file of the corpus directory (pattern written directly;
# formatting a constant extension into it added nothing).
# NOTE(review): the directory is an absolute, machine-specific path — consider
# moving it to a configurable constant.
files = glob.glob('/home/paul/projects/text_for_app/*.txt')
files

['/home/paul/projects/text_for_app/jean_blog.txt',
 '/home/paul/projects/text_for_app/emploi étudiant et inégalités sociales.txt']

In [8]:
def get_filename_from_path(path):
    """Return the last component of ``path`` once the path is normalised."""
    normalised = os.path.normpath(path)
    # Whatever follows the final separator is the file name.
    _, _, filename = normalised.rpartition(os.sep)
    return filename

In [9]:
# One row per corpus file: full path, bare file name and a sequential number.
documents = pd.DataFrame({
    'filepath': files,
    'doc_name': list(map(get_filename_from_path, files)),
    'doc_number': list(range(len(files))),
})

In [10]:
# Display the documents table (filepath, doc_name, doc_number).
documents

Unnamed: 0,filepath,doc_name,doc_number
0,/home/paul/projects/text_for_app/jean_blog.txt,jean_blog.txt,0
1,/home/paul/projects/text_for_app/emploi étudia...,emploi étudiant et inégalités sociales.txt,1


In [11]:
# Load the spaCy pipeline named 'fr_core_news_lg'; it is used below to process the documents.
nlp = spacy.load('fr_core_news_lg')

In [12]:
def create_dependancy_df_list(processed_text):
    """Build one dependency table per sentence of a processed document.

    Punctuation, stop-word and whitespace tokens are left out. A sentence in
    which no token remains produces no table.

    Parameters
    ----------
    processed_text : object
        Processed document exposing its sentences through ``.sents``. Each
        sentence iterates over tokens providing ``is_punct``, ``is_stop``,
        ``is_space``, ``text``, ``dep_`` and ``head`` (itself providing
        ``text`` and ``pos_``).

    Returns
    -------
    list of pandas.DataFrame
        One frame per retained sentence, with the columns ``token``, ``dep``,
        ``head_text`` and ``head_pos``.
    """
    df_list = []
    for sentence in processed_text.sents:
        kept_tokens = [
            token for token in sentence
            if not token.is_punct and not token.is_stop and not token.is_space
        ]
        # Nothing left after filtering: skip the sentence instead of building
        # an empty frame only to discard it.
        if not kept_tokens:
            continue
        df_list.append(pd.DataFrame({
            'token': [token.text for token in kept_tokens],
            'dep': [token.dep_ for token in kept_tokens],
            'head_text': [token.head.text for token in kept_tokens],
            'head_pos': [token.head.pos_ for token in kept_tokens],
        }))
    return df_list

In [13]:
# Run the spaCy pipeline on the text of the documents labelled 0 and 1.
file_1_processed, file_2_processed = (
    nlp(get_text(documents.loc[row, 'filepath'])) for row in (0, 1)
)

In [14]:
# Preview the dependency table of the first retained sentence of document 1.
create_dependancy_df_list(file_1_processed)[0]

Unnamed: 0,token,dep,head_text,head_pos
0,faire,xcomp,vais,VERB
1,inutilement,advmod,faire,VERB
2,durer,xcomp,faire,VERB
3,suspense,obj,durer,VERB
4,réponse,nsubj,oui,ADV
5,oui,parataxis,vais,VERB


In [15]:
from spacy import displacy

In [16]:
# Take the first sentence only; there is no need to materialise every
# sentence of the document into a list first.
sentence = next(iter(file_1_processed.sents))

In [17]:
# Render the selected sentence with displaCy, using its default options.
displacy.render(sentence)

In [18]:
def get_vocab_table(processed_text):
    """Tabulate the lower-cased token / lemma pairs of a processed text.

    Tokens flagged as punctuation, stop word, whitespace or digit are
    ignored. The remaining ones give one row each, in reading order.

    Parameters
    ----------
    processed_text : iterable
        Tokens providing ``is_punct``, ``is_stop``, ``is_space``,
        ``is_digit``, ``text`` and ``lemma_``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``token`` and ``lemma``.
    """
    kept = [
        tok for tok in processed_text
        if not (tok.is_punct or tok.is_stop or tok.is_space or tok.is_digit)
    ]
    return pd.DataFrame({
        'token': [tok.text.lower() for tok in kept],
        'lemma': [tok.lemma_.lower() for tok in kept],
    })

In [19]:
# Vocabulary table (token / lemma pairs) of each processed document.
doc_1_vocab, doc_2_vocab = map(
    get_vocab_table, (file_1_processed, file_2_processed)
)

Faire une liste des valeurs uniques :
- construire un index pour chaque correspondance lemme / token ;
- construire un dictionnaire contenant le lemme et la liste des index des tokens correspondants ;
- faire un système d'indexation capable de prendre en compte de nouveaux documents : 0 lemme n 0 token n

In [20]:
# Start from the vocabulary of document 2 (plain assignment: same object, no copy is made).
index_table_vocab = doc_2_vocab

In [21]:
# Keep one row per distinct (token, lemma) pair and renumber the rows from 0.
index_table_vocab = index_table_vocab.drop_duplicates(ignore_index=True)

In [22]:
# Display the vocabulary table.
index_table_vocab

Unnamed: 0,token,lemma
0,cairn,cairn
1,ree,ree
2,matieres,matiere
3,reflexion,reflexion
4,emploi,emploi
...,...,...
2827,florence,florence
2828,lefresne,lefresne
2829,vecteurs,vecteur
2830,structurelle,structurel


In [23]:
# Count the rows of the vocabulary table per lemma.
index_table_vocab['lemma'].value_counts()

exercer      7
faire        7
permettre    6
occuper      5
devenir      5
            ..
3'6          1
ev           1
s@r          1
8'02         1
juillet      1
Name: lemma, Length: 2450, dtype: int64

In [24]:
# Distinct lemmas, in order of first appearance.
unique_lemmas = index_table_vocab['lemma'].unique()

In [25]:
# One {'lemma', 'tokens'} record per distinct lemma, where 'tokens' holds the
# rows of the vocabulary table carrying that lemma (original row labels kept).
# A single groupby pass replaces one full boolean-mask scan per lemma;
# sort=False keeps the lemmas in order of first appearance.
lemma_tokens_dictionnaries = [
    {'lemma': lemma, 'tokens': lemma_rows}
    for lemma, lemma_rows in index_table_vocab.groupby('lemma', sort=False)
]

In [125]:
# NOTE(review): unfinished cell — the inner loop had no body, which is a
# SyntaxError. An explicit placeholder keeps the notebook runnable until the
# per-token processing is written.
for lemma in unique_lemmas:
    tokens_of_lemma = index_table_vocab[index_table_vocab['lemma'] == lemma]['token']
    for token in tokens_of_lemma:
        pass  # TODO: decide what to do with each token of the lemma

In [125]:
from arango import ArangoClient

In [27]:
# Initialize the client for ArangoDB.
client = ArangoClient(hosts="http://localhost:8529")

# Connect to "_system" database as root user.
# The credentials can be overridden through the ARANGO_USERNAME and
# ARANGO_PASSWORD environment variables so that real secrets need not be
# written in the notebook; the fallbacks keep the previous local values.
sys_db = client.db(
    "_system",
    username=os.environ.get("ARANGO_USERNAME", "root"),
    password=os.environ.get("ARANGO_PASSWORD", "passwd"),
)

In [26]:
# Row labels of every vocabulary entry whose lemma is 'exercer'.
exemple_index = index_table_vocab.loc[index_table_vocab['lemma'].eq('exercer')].index
exemple_index

Int64Index([311, 340, 532, 613, 760, 1333, 1635], dtype='int64')

In [129]:
# Tokens found at the row labels collected in exemple_index.
index_table_vocab.loc[exemple_index, 'token']

In [138]:
# Same selection in one step: tokens of the rows whose lemma is 'exercer'.
index_table_vocab.loc[index_table_vocab['lemma'] == 'exercer', 'token']

311      exercée
340      exercer
532       exercé
613     exercent
760     exercées
1333     exercés
1635      exerce
Name: token, dtype: object

Utiliser le premier index pour le lemme, puis boucler sur les autres pour ajouter dans la base plusieurs tokens pour un même lemme.

Comment ajouter le vocabulaire unique au fur et à mesure de l'ajout de documents :

In [128]:
# Stack both vocabularies and keep the first row met for each lemma.
pd.concat([doc_1_vocab, doc_2_vocab]).drop_duplicates(subset=['lemma'], keep='first')

Unnamed: 0,token,lemma
0,faire,faire
1,inutilement,inutilement
2,durer,durer
3,suspense,suspense
4,réponse,réponse
...,...,...
5457,florence,florence
5458,lefresne,lefresne
5460,vecteurs,vecteur
5462,structurelle,structurel


In [165]:
pip install python-arango --upgrade

In [2]:
from arango import ArangoClient

In [140]:
from pyArango.connection import *
from pyArango.collection import Collection, Field, Edges
from pyArango.graph import Graph, EdgeDefinition

Connexion à la base de données

In [141]:
# Open the pyArango connection (no server URL is passed, so the library
# default applies). The credentials can be overridden through the
# ARANGO_USERNAME and ARANGO_PASSWORD environment variables so that real
# secrets need not be written in the notebook; the fallbacks keep the previous
# local values.
conn = Connection(username=os.environ.get('ARANGO_USERNAME', 'root'),
                  password=os.environ.get('ARANGO_PASSWORD', 'root'))

Test d'existence de la base de données et création si besoin

In [142]:
# Create the working database only when the connection does not list it yet.
if 'text_graph_test' not in conn.databases:
    conn.createDatabase(name='text_graph_test')

In [143]:
# Handle on the 'text_graph_test' database, used by the cells below.
db = conn['text_graph_test']

Construction des classes contenant les informations des collections 

In [150]:
# Document collection class for tokens; it declares a single 'token' field.
# NOTE(review): presumably pyArango ties this class to the collection of the
# same name ('tokens') — confirm before renaming.
class tokens(Collection):
    _fields = {
        'token':Field()
    }

In [151]:
# Document collection class for lemmas; it declares a single 'lemma' field.
# NOTE(review): presumably pyArango ties this class to the collection of the
# same name ('lemmas') — confirm before renaming.
class lemmas(Collection):
    _fields = {
        'lemma':Field()
    }

In [152]:
# Edge collection class named 'contracts_to'; it declares no field of its own.
class contracts_to(Edges):
    _fields = {}

Construction du graphe

In [153]:
# Graph definition: 'contracts_to' edges go from 'tokens' to 'lemmas', and no
# orphaned collection is declared.
class first_graph(Graph):
    _edgeDefinitions = [EdgeDefinition('contracts_to',fromCollections=['tokens'], toCollections=['lemmas'])]
    _orphanedCollections = []

In [154]:
# Reuse the graph when the database already lists it; create it otherwise.
if 'first_graph' in db.graphs:
    graph = db.graphs['first_graph']
else:
    graph = db.createGraph('first_graph')

Vérification de l'existence des collections et création en base de données si besoin

In [163]:
# Create each vertex collection that the database does not list yet.
# Every name is checked on its own, so a database holding only one of the two
# collections gets just the missing one (the former combined test asked to
# create both again in that situation).
for collection_name in ('tokens', 'lemmas'):
    if collection_name not in db.collections:
        db.createCollection(collection_name)

Remplissage des tables et lien entre tokens et lemmas

## Écrit de cette manière, on ajoute un lemme autant de fois qu'il est trouvé dans le dataframe
- Il faut trouver comment unir deux tables aux nombres d'entrées différents

In [157]:
# Create one 'tokens' vertex and one 'lemmas' vertex per vocabulary row, in
# row order, so that both lists stay aligned position by position.
# The columns are iterated directly: looking rows up by label with a
# positional counter only works while the index is exactly 0..n-1.
tokens_for_insertion = []
lemmas_for_insertion = []
for token, lemma in zip(doc_1_vocab['token'], doc_1_vocab['lemma']):
    tokens_for_insertion.append(graph.createVertex('tokens', {'token': token}))
    lemmas_for_insertion.append(graph.createVertex('lemmas', {'lemma': lemma}))

CreationError: Unable to create vertice, collection or view not found: tokens. Errors: {'code': 404, 'error': True, 'errorMessage': 'collection or view not found: tokens', 'errorNum': 1203}

In [43]:
# Add one 'contracts_to' edge from each token vertex to the lemma vertex
# stored at the same position.
for token_vertex, lemma_vertex in zip(tokens_for_insertion, lemmas_for_insertion):
    graph.link('contracts_to', token_vertex, lemma_vertex, {})

## Recherche d'alternative sur comment insérer les données 

In [None]:
# Insert each lemma once, then attach every token of that lemma to it.
# (The draft stopped mid-statement: the inner loop body was not indented and
# createVertex was never called.)
for lemma in unique_lemmas:
    tokens_of_lemma = index_table_vocab.loc[index_table_vocab['lemma'] == lemma, 'token']
    lemma_vertex = graph.createVertex('lemmas', {'lemma': lemma})
    for token in tokens_of_lemma:
        token_vertex = graph.createVertex('tokens', {'token': token})
        # Edge direction follows the graph definition: token -> lemma.
        graph.link('contracts_to', token_vertex, lemma_vertex, {})

In [164]:
# List the collections currently known to the database handle.
db.collections

{'contracts_to': ArangoDB collection name: contracts_to, id: 1044948, type: edge, status: loaded,
 '_frontend': ArangoDB collection name: _frontend, id: 951801, type: document, status: loaded,
 '_appbundles': ArangoDB collection name: _appbundles, id: 951798, type: document, status: loaded,
 '_apps': ArangoDB collection name: _apps, id: 951795, type: document, status: loaded,
 '_jobs': ArangoDB collection name: _jobs, id: 951792, type: document, status: loaded,
 '_queues': ArangoDB collection name: _queues, id: 951789, type: document, status: loaded,
 '_aqlfunctions': ArangoDB collection name: _aqlfunctions, id: 951786, type: document, status: loaded,
 '_analyzers': ArangoDB collection name: _analyzers, id: 951783, type: document, status: loaded,
 '_fishbowl': ArangoDB collection name: _fishbowl, id: 1047954, type: document, status: loaded,
 '_graphs': ArangoDB collection name: _graphs, id: 951780, type: document, status: loaded}

Requête depuis la base de données

In [None]:
# AQL query returning the tokens linked to the lemma "durer".
# 'contracts_to' edges go from tokens to lemmas, so the tokens are reached by
# an INBOUND traversal that starts from the matching lemma vertex. The draft
# query had no start vertex, no edge collection and no RETURN, and filtered on
# "lemmas" instead of the 'lemma' field.
aql = '''FOR lemma IN lemmas
             FILTER lemma.lemma == "durer"
             FOR token IN 1..1 INBOUND lemma contracts_to
                 RETURN token.token'''

In [26]:
# Work on a copy: a plain assignment would make both names point to the same
# frame, so columns added to the test table would also appear in doc_1_vocab.
test_table_insertion = doc_1_vocab.copy()

In [27]:
# Number the rows 0..n-1 in a dedicated 'index_token' column.
test_table_insertion['index_token'] = list(range(len(test_table_insertion)))

Signature: graph.createVertex(collectionName, docAttributes, waitForSync=False)
Docstring: adds a vertex to the graph and returns it
File:      ~/anaconda3/lib/python3.8/site-packages/pyArango/graph.py
Type:      method
