In [1]:
import spacy 
import pyArango
import os
from os import path
import time
import glob
import pandas as pd

In [2]:
def get_text(path):
    """Read a UTF-8 text file and return its content on a single line.

    Args:
        path: Path of the text file to read.

    Returns:
        The file content as a string, with every newline character
        replaced by a single space.
    """
    # The context manager closes the file when the block exits, so no
    # explicit close() call is needed (the former one was unreachable).
    with open(path, encoding='utf8') as f:
        return f.read().replace('\n', ' ')

In [3]:
# Current working directory of the kernel; used below to build `textpath`.
dir_path = os.getcwd()
dir_path

'/home/paul/projects/arangoDB_test'

In [4]:
# Absolute path of 'extractedText.txt' located one level above `dir_path`.
textpath = os.path.abspath(
    os.path.join(dir_path, os.pardir, 'extractedText.txt')
)

In [5]:
# Every .txt file found in the text folder (order is whatever glob returns).
files = glob.glob('/home/paul/projects/text_for_app/*.txt')
files

['/home/paul/projects/text_for_app/jean_blog.txt',
 '/home/paul/projects/text_for_app/emploi étudiant et inégalités sociales.txt']

In [6]:
def get_filename_from_path(path):
    """Return the last component of `path` once the path is normalised.

    Args:
        path: File path, absolute or relative.

    Returns:
        The text that follows the last `os.sep` in the normalised path
        (the whole normalised path when it contains no separator).
    """
    normalised = os.path.normpath(path)
    _head, _sep, last_component = normalised.rpartition(os.sep)
    return last_component

In [7]:
# One row per input file: its full path, its file name and a sequential number.
documents = pd.DataFrame({
    'filepath': files,
    'doc_name': list(map(get_filename_from_path, files)),
    'doc_number': list(range(len(files))),
})

In [8]:
documents

Unnamed: 0,filepath,doc_name,doc_number
0,/home/paul/projects/text_for_app/jean_blog.txt,jean_blog.txt,0
1,/home/paul/projects/text_for_app/emploi étudia...,emploi étudiant et inégalités sociales.txt,1


In [9]:
# Load the spaCy pipeline that is used below to process the document text.
nlp = spacy.load('fr_core_news_lg')

In [10]:
def create_dependancy_df_list(processed_text):
    """Build one dependency table per sentence of a processed text.

    Tokens flagged as punctuation, stop word or whitespace are left out.
    Sentences with no remaining token produce no table.

    Args:
        processed_text: Object exposing a `sents` iterable of sentences, each
            sentence being an iterable of tokens with the attributes `text`,
            `dep_`, `head`, `is_punct`, `is_stop` and `is_space`
            (here: the result of calling `nlp` on a text).

    Returns:
        A list of DataFrames, one per non-empty sentence, with the columns
        'token', 'dep', 'head_text' and 'head_pos'.
    """
    columns = ['token', 'dep', 'head_text', 'head_pos']
    df_list = []
    for sentence in processed_text.sents:
        # One tuple per kept token keeps the four columns aligned by construction.
        rows = [
            (token.text, token.dep_, token.head.text, token.head.pos_)
            for token in sentence
            if not (token.is_punct or token.is_stop or token.is_space)
        ]
        if rows:
            df_list.append(pd.DataFrame(rows, columns=columns))
    return df_list

In [11]:
# Read the first listed document and run it through the spaCy pipeline.
first_document_text = get_text(documents.loc[0, 'filepath'])
file_1_processed = nlp(first_document_text)

In [12]:
create_dependancy_df_list(file_1_processed)[5]

Unnamed: 0,token,dep,head_text,head_pos
0,oui,advmod,est,VERB
1,oui,dep,oui,ADV
2,taille,obl:arg,est,VERB
3,aberration,conj,est,VERB
4,perverse,amod,aberration,NOUN
5,aime,ROOT,aime,VERB
6,beaucoup,advmod,aime,VERB
7,porter,xcomp,aime,VERB


In [33]:
from spacy import displacy

In [34]:
# First sentence of the processed document (all sentences are materialised first).
sentence = list(file_1_processed.sents)[0]

In [35]:
# Visualise the selected sentence with displaCy.
displacy.render(sentence)

In [16]:
# Keep only the tokens that are not punctuation, stop words or whitespace,
# then list their lower-cased surface form and lemma side by side.
kept_tokens = [
    tok for tok in file_1_processed
    if not (tok.is_punct or tok.is_stop or tok.is_space)
]
tokens = [tok.text.lower() for tok in kept_tokens]
lemmas = [tok.lemma_.lower() for tok in kept_tokens]
pd.DataFrame({'token': tokens,
              'lemma': lemmas})

Unnamed: 0,token,lemma
0,faire,faire
1,inutilement,inutilement
2,durer,durer
3,suspense,suspense
4,réponse,réponse
...,...,...
107,évoluer,évoluer
108,cesse,cesse
109,jamais,jamais
110,étonner,étonner


In [17]:
def get_vocab_table(processed_text):
    """Build the token/lemma vocabulary table of a processed text.

    Tokens flagged as punctuation, stop word or whitespace are left out.
    Both the token text and its lemma are lower-cased.

    Args:
        processed_text: Iterable of tokens exposing `text`, `lemma_`,
            `is_punct`, `is_stop` and `is_space` (here: the result of
            calling `nlp` on a text).

    Returns:
        A DataFrame with the columns 'token' and 'lemma', one row per kept
        token, in document order.
    """
    # Iterate over the argument itself, not over a notebook-level variable,
    # so the function works for any document passed to it.
    rows = [
        (token.text.lower(), token.lemma_.lower())
        for token in processed_text
        if not (token.is_punct or token.is_stop or token.is_space)
    ]
    vocab_table = pd.DataFrame(rows, columns=['token', 'lemma'])
    return vocab_table

In [18]:
# Vocabulary table (token / lemma) of the first processed document.
doc_1_vocab = get_vocab_table(file_1_processed)
doc_1_vocab

Unnamed: 0,token,lemma
0,faire,faire
1,inutilement,inutilement
2,durer,durer
3,suspense,suspense
4,réponse,réponse
...,...,...
107,évoluer,évoluer
108,cesse,cesse
109,jamais,jamais
110,étonner,étonner


In [19]:
doc_1_vocab['lemma'].value_counts()

vêtement     5
style        3
oui          3
hideux       2
valeur       2
            ..
pied         1
pantalon     1
moudre       1
beaucoup     1
intéresse    1
Name: lemma, Length: 93, dtype: int64

In [20]:
from pyArango.connection import *
from pyArango.collection import Collection, Field, Edges
from pyArango.graph import Graph, EdgeDefinition

Connexion à la base de données

In [21]:
# Credentials are read from the environment so that real secrets never have to
# be written in the notebook; 'root'/'root' remains the fallback for a local
# test instance.
conn = Connection(username=os.environ.get('ARANGO_USERNAME', 'root'),
                  password=os.environ.get('ARANGO_PASSWORD', 'root'))

Test d'existence de la base de données et création si besoin

In [22]:
# Create the test database only when the server does not list it yet.
if 'text_graph_test' not in conn.databases:
    conn.createDatabase(name='text_graph_test')

In [23]:
# Handle on the 'text_graph_test' database used by every cell below.
db = conn['text_graph_test']

Construction des classes contenant les informations des collections 

In [29]:
class tokens(Collection):
    """Vertex collection whose documents carry a single `token` field.

    NOTE(review): this class name is also used for a plain list earlier in the
    notebook; defining the class rebinds the name `tokens`.
    """
    _fields = {
        'token':Field()
    }

In [30]:
class lemmas(Collection):
    """Vertex collection whose documents carry a single `lemma` field.

    NOTE(review): this class name is also used for a plain list earlier in the
    notebook; defining the class rebinds the name `lemmas`.
    """
    _fields = {
        'lemma':Field()
    }

In [31]:
class contracts_to(Edges):
    """Edge collection without any declared field of its own."""
    _fields = {}

Construction du graphe

In [32]:
class first_graph(Graph):
    """Graph made of one edge definition: 'contracts_to', from 'tokens' to 'lemmas'."""
    _edgeDefinitions = [EdgeDefinition('contracts_to',fromCollections=['tokens'], toCollections=['lemmas'])]
    # No vertex collection is declared outside of the edge definition above.
    _orphanedCollections = []

In [30]:
# Reuse the graph when the database already has it, otherwise create it.
graph = (
    db.graphs['first_graph']
    if 'first_graph' in db.graphs
    else db.createGraph('first_graph')
)

Vérification de l'existence des collections et création en base de données si besoin

In [41]:
# Create each vertex collection only when it is missing. Checking them one by
# one avoids re-creating a collection that already exists when only the other
# one is absent (the former combined test `not A & B` did exactly that).
for collection_name in ('tokens', 'lemmas'):
    if collection_name not in db.collections:
        db.createCollection(collection_name)

Remplissage des tables et lien entre tokens et lemmas

## Écrit de cette manière, chaque lemme est inséré autant de fois qu'il apparaît dans le dataframe
- Il faut trouver comment relier deux tables dont le nombre d'entrées diffère

In [42]:
# Create one 'tokens' vertex and one 'lemmas' vertex per row of the vocabulary
# table; both lists stay aligned row by row.
tokens_for_insertion = []
lemmas_for_insertion = []
for token_text, lemma_text in zip(doc_1_vocab['token'], doc_1_vocab['lemma']):
    token_vertex = graph.createVertex('tokens', {'token': token_text})
    tokens_for_insertion.append(token_vertex)
    lemma_vertex = graph.createVertex('lemmas', {'lemma': lemma_text})
    lemmas_for_insertion.append(lemma_vertex)

In [43]:
# Link every token vertex to the lemma vertex created for the same row.
for position, token_vertex in enumerate(tokens_for_insertion):
    lemma_vertex = lemmas_for_insertion[position]
    graph.link('contracts_to', token_vertex, lemma_vertex, {})

## Recherche d'une alternative pour l'insertion des données

Requête depuis la base de données

In [None]:
# AQL query returning every token linked to the lemma "durer".
# Edges go from 'tokens' to 'lemmas', so the traversal starts at the lemma
# vertex and follows 'contracts_to' edges in the INBOUND direction.
aql = '''FOR lemma IN lemmas
             FILTER lemma.lemma == "durer"
             FOR token IN 1..1 INBOUND lemma contracts_to
                 RETURN token'''

In [26]:
# Work on a copy so that columns added to the test table do not also appear
# in `doc_1_vocab`.
test_table_insertion = doc_1_vocab.copy()

In [27]:
# Number the rows of the test table from 0 in a dedicated column.
row_count = len(test_table_insertion)
test_table_insertion['index_token'] = list(range(row_count))

Signature: graph.createVertex(collectionName, docAttributes, waitForSync=False)
Docstring: adds a vertex to the graph and returns it
File:      ~/anaconda3/lib/python3.8/site-packages/pyArango/graph.py
Type:      method
