In [1]:
import os

import numpy as np
import pandas as pd
from arango import ArangoClient

# Database connection.
# Host and credentials are read from the environment so that secrets do not
# have to live in the notebook; the fallbacks are the original local
# development values, so an unconfigured local setup keeps working.
ARANGO_HOST = os.environ.get("ARANGO_HOST", "http://localhost:8529")
ARANGO_USER = os.environ.get("ARANGO_USER", "root")
ARANGO_PASSWORD = os.environ.get("ARANGO_PASSWORD", "root")

client = ArangoClient(hosts=ARANGO_HOST)
sys_db = client.db('_system', username=ARANGO_USER, password=ARANGO_PASSWORD)
db = client.db("text", username=ARANGO_USER, password=ARANGO_PASSWORD)

## graphe de similitudes

In [2]:
# Single-column frame holding the `_id` of every document of the
# `sentences` collection.
sentences_ID_df = pd.DataFrame(
    [sentence_id for sentence_id in db.aql.execute('''
    FOR doc in sentences
    return doc._id
    ''')]
)
sentences_ID_df

Unnamed: 0,0
0,sentences/doc0sent0
1,sentences/doc0sent1
2,sentences/doc0sent10
3,sentences/doc0sent100
4,sentences/doc0sent1000
...,...
22946,sentences/doc3sent95
22947,sentences/doc3sent96
22948,sentences/doc3sent97
22949,sentences/doc3sent98


In [3]:
# Quick look at the first identifier (first row, first column).
sentences_ID_df.iloc[0,0]

'sentences/doc0sent0'

In [4]:
# One row per distinct (sentence, lemma) pair, obtained by following the
# inbound `is_from` edges of type 'lemmaToSent' of every sentence.
lemma_rows = pd.DataFrame(list(db.aql.execute('''for start_vertex in sentences
        for v, e in inbound start_vertex is_from
        filter e.type == 'lemmaToSent'
        collect sent = e._to, lemmas = v.lemma into groups ={
        "sentence" : e._to,
        "lemma" : v.lemma
        }
        return {"sentence":sent,
                "lemma":lemmas}
                ''')))

# Series indexed by sentence id whose value is the space-joined lemmas of
# that sentence.
lemmas_from_sentences = lemma_rows.groupby('sentence')['lemma'].apply(' '.join)
lemmas_from_sentences

sentence
sentences/doc0sent0                                                     30
sentences/doc0sent1                                                     -on
sentences/doc0sent10      1(2 1(4 and centre control correct credential ...
sentences/doc0sent1000                                           and social
sentences/doc0sent1001                           and based be been involved
                                                ...                        
sentences/doc3sent93                          cned dynamique employeur fois
sentences/doc3sent94                                                  doute
sentences/doc3sent96                                               fonction
sentences/doc3sent97      activité attribut cadre dela enseignement entr...
sentences/doc3sent98       destin donner douvrier enseigner formation forme
Name: lemma, Length: 18693, dtype: object

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import minmax_scale
import networkx as nx
import plotly.graph_objects as go
import plotly.io as pio
# Make 'browser' the default plotly renderer for the figures of this notebook.
pio.renderers.default='browser'


In [6]:
# Bag-of-words model over the per-sentence lemma strings; min_df=5 ignores
# lemmas that appear in fewer than 5 sentences.
vectorizer = CountVectorizer(min_df=5)

# Sparse count matrix: one row per sentence, one column per retained lemma.
termDocMatrix  = vectorizer.fit_transform(lemmas_from_sentences)

In [10]:
# Term-by-term co-occurrence matrix: product of the transposed
# term/document matrix with itself.
coOccurenceMatrix = termDocMatrix.T.dot(termDocMatrix)
# NOTE: the diagonal (a term paired with itself) is NOT removed here; the
# corresponding self-loops are dropped later, once the graph is built.
coOccurenceMatrix

In [None]:

# Build the graph from the co-occurrence matrix: one node per matrix
# row/column, one weighted edge per non-zero entry.
G = nx.from_scipy_sparse_array(coOccurenceMatrix,
                                parallel_edges=False)

# Drop the edges that connect a node to itself.
G.remove_edges_from(nx.selfloop_edges(G))

# Node positions computed with the Fruchterman-Reingold force-directed
# layout. The seed is fixed so that re-running the notebook yields the same
# picture.
LAYOUT_SEED = 42
FRL = nx.drawing.layout.fruchterman_reingold_layout(G, seed=LAYOUT_SEED)

# Alternative layout (Kamada-Kawai):
# KMK = nx.drawing.layout.kamada_kawai_layout(G)

# Store each node's coordinates in its 'pos' attribute.
nx.set_node_attributes(G, FRL, 'pos')
    


def make_edge(x, y, width,scaledWidth):
    """Create the plotly trace that draws a single edge.

    Args:
        x: x coordinates of the segment, in the form (x0, x1, None).
        y: y coordinates of the segment, in the form (y0, y1, None).
        width: width of the line.
        scaledWidth: value used as the opacity of the trace.

    Returns:
        A go.Scatter in 'lines' mode joining the two points, grey and
        without hover information.
    """
    line_style = dict(width=width, color='#888')
    trace = go.Scatter(
        x=x,
        y=y,
        line=line_style,
        hoverinfo='none',
        mode='lines',
        opacity=scaledWidth,
    )
    return trace




# Position pair (source, target) of every edge, in graph iteration order.
edge_endpoints = [
    (G.nodes()[source]['pos'], G.nodes()[target]['pos'])
    for source, target in G.edges()
]

# Coordinates of each segment; the trailing None closes the segment.
xTupleList = [(start[0], end[0], None) for start, end in edge_endpoints]
yTupleList = [(start[1], end[1], None) for start, end in edge_endpoints]

# Raw edge weights and the same weights rescaled with min-max scaling.
widthList = np.array([G.edges[edge]['weight'] for edge in G.edges()])
scaledWidthList = minmax_scale(widthList)

# One trace per edge: line width is half the weight, opacity is the
# rescaled weight.
edge_trace = [
    make_edge(xs, ys, line_width, opacity)
    for xs, ys, line_width, opacity in zip(
        xTupleList, yTupleList, widthList*0.5, scaledWidthList
    )
]




# Node coordinates, in graph iteration order.
node_x = [G.nodes[node]['pos'][0] for node in G.nodes()]
node_y = [G.nodes[node]['pos'][1] for node in G.nodes()]

# Graph node i corresponds to column i of the term/document matrix.
# `vectorizer.vocabulary_` maps term -> column index, but iterating over the
# dict does not follow column order, so the terms are explicitly sorted by
# their column index before being used as labels.
terms_by_column = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Total count of each term over all sentences (column sums), which is what
# the colour bar ("Nombre d'occurences") is meant to show.
term_occurrences = np.asarray(termDocMatrix.sum(axis=0)).ravel()

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    text=[terms_by_column[node] for node in G.nodes()],
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        color=[int(term_occurrences[node]) for node in G.nodes()],
        size=10,
        colorbar=dict(
            thickness=15,
            # `title.side` replaces the deprecated `titleside` attribute.
            title=dict(text="Nombre d'occurences", side='right'),
            xanchor='left'
        ),
        line_width=2))


# Transparent backgrounds, no grid lines and no zero lines on either axis.
hidden_axis = dict(showgrid=False, zeroline=False)
layout = go.Layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis),
)

fig = go.Figure(layout=layout)

# Edges first, then the nodes so that markers are drawn on top of the lines.
fig.add_traces([*edge_trace, node_trace])

# Hide the legend; as the last expression, this also displays the figure.
fig.update_layout(showlegend=False)


## graphe syntagmatique

sur la base d'un lemme, obtenir :
- les tokens associés
- les relations syntagmatiques de ces tokens
- les tokens à l'issue de ces relations
- la source de ces relations (pour l'instant le document mais se donner l'option d'ajouter d'autres variables à l'avenir)

faire une requête de tous les tokens liés à un lemme avec un groupement par phrase
- récupérer le doc depuis l'id de la phrase

In [58]:
# Look up the document(s) of the `tokens` collection whose `token` field
# equals "durée".
list(db.collection('tokens').find({'token':"durée"}))

[{'_key': 'token1769',
  '_id': 'tokens/token1769',
  '_rev': '_eMbiN6W-A-',
  'token': 'durée'}]

In [38]:
token_of_interest = 'tokens/token2'

In [51]:
# Every `syntagmatic_link` edge that starts from or arrives at the token of
# interest (depth 1, any direction). The start vertex is passed as a bind
# variable instead of being spliced into the query text.
pd.DataFrame(list(db.aql.execute(
    '''
    FOR doc, connection IN 1..1 ANY @start_token syntagmatic_link
        RETURN connection
    ''',
    bind_vars={'start_token': token_of_interest},
)))

Unnamed: 0,_key,_id,_from,_to,_rev,dep_relation,from_sentence_number,head_pos_tag
0,1318149,syntagmatic_link/1318149,tokens/token2,tokens/token3,_eMbiOAu--D,obj,doc0sent0,VERB
1,1318146,syntagmatic_link/1318146,tokens/token0,tokens/token2,_eMbiOAu--A,xcomp,doc0sent0,VERB


## concordancier

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [219]:
words_of_interest = ['point','vue']

In [238]:
def search_tokens(tokens_list,doclist=None):
    """Rank the sentences that contain at least one of the given tokens.

    Sentences are scored by the cosine similarity, in TF-IDF space, between
    the sentence and the pseudo-sentence made of the searched tokens.

    Args:
        tokens_list: list of token strings to look up (exact match on the
            `token` field of the `tokens` collection).
        doclist: optional collection of document names; when non-empty, only
            sentences coming from these documents are returned.

    Returns:
        A DataFrame with columns 'sentence', 'from_doc' and 'similarity',
        sorted by decreasing similarity. It is empty when none of the tokens
        exists in the corpus.
    """
    # Values are passed as bind variables rather than interpolated into the
    # query text, so quotes in a token cannot break (or alter) the query.
    tokens_ids = list(db.aql.execute(
        '''
        FOR token IN tokens
            FILTER token.token IN @selection
            RETURN token._id
        ''',
        bind_vars={'selection': list(tokens_list)}))

    # A single depth-2 traversal per token yields the sentence (intermediate
    # vertex) together with its document (end vertex), so the two can never
    # be paired incorrectly.
    rows = []
    for token_id in tokens_ids:
        rows.extend(db.aql.execute(
            '''
            FOR doc, edge, path IN 2..2 OUTBOUND @start is_from
                RETURN {"sentence": path.vertices[1].content,
                        "from_doc": doc.doc_name}
            ''',
            bind_vars={'start': token_id}))

    if not rows:
        return pd.DataFrame(columns=['sentence', 'from_doc', 'similarity'])

    df_recherche = pd.DataFrame(rows, columns=['sentence', 'from_doc']).drop_duplicates()

    # The query is appended as a pseudo-sentence ('comparateur') so that it
    # is vectorised in the same TF-IDF space as the corpus sentences.
    sentences_with_comparator = pd.concat(
        [df_recherche,
         pd.DataFrame({'sentence': [' '.join(tokens_list)],
                       'from_doc': ['comparateur']})]
    ).reset_index(drop=True)

    vector_text = TfidfVectorizer().fit_transform(sentences_with_comparator['sentence'])
    # The comparator is the last row: compare it with every row.
    similarity = cosine_similarity(vector_text[vector_text.shape[0]-1,:], vector_text)
    sentences_with_comparator['similarity'] = similarity[0]

    ranked = (sentences_with_comparator
              .sort_values('similarity', ascending=False)
              .reset_index(drop=True))
    final_result = ranked[ranked['from_doc'] != 'comparateur']

    if doclist is not None and len(doclist) > 0:
        return final_result[final_result['from_doc'].isin(doclist)]
    return final_result

### text version

In [305]:
def search_tokens(tokens_list,doclist=None):
    """Return, as text, the ranked sentences containing any of the tokens.

    Sentences are scored by the cosine similarity, in TF-IDF space, between
    the sentence and the pseudo-sentence made of the searched tokens.

    Args:
        tokens_list: list of token strings to look up (exact match on the
            `token` field of the `tokens` collection).
        doclist: optional collection of document names; when non-empty, only
            sentences coming from these documents are kept.

    Returns:
        A single string with one formatted entry per sentence (source
        document, similarity score, sentence), by decreasing similarity.
        The string is empty when nothing is found.
    """
    # Values are passed as bind variables rather than interpolated into the
    # query text, so quotes in a token cannot break (or alter) the query.
    tokens_ids = list(db.aql.execute(
        '''
        FOR token IN tokens
            FILTER token.token IN @selection
            RETURN token._id
        ''',
        bind_vars={'selection': list(tokens_list)}))

    # A single depth-2 traversal per token yields the sentence (intermediate
    # vertex) together with its document (end vertex), so the two can never
    # be paired incorrectly.
    rows = []
    for token_id in tokens_ids:
        rows.extend(db.aql.execute(
            '''
            FOR doc, edge, path IN 2..2 OUTBOUND @start is_from
                RETURN {"sentence": path.vertices[1].content,
                        "from_doc": doc.doc_name}
            ''',
            bind_vars={'start': token_id}))

    if not rows:
        # No known token: nothing to format.
        return ''

    df_recherche = pd.DataFrame(rows, columns=['sentence', 'from_doc']).drop_duplicates()

    # The query is appended as a pseudo-sentence ('comparateur') so that it
    # is vectorised in the same TF-IDF space as the corpus sentences.
    sentences_with_comparator = pd.concat(
        [df_recherche,
         pd.DataFrame({'sentence': [' '.join(tokens_list)],
                       'from_doc': ['comparateur']})]
    ).reset_index(drop=True)

    vector_text = TfidfVectorizer().fit_transform(sentences_with_comparator['sentence'])
    # The comparator is the last row: compare it with every row.
    similarity = cosine_similarity(vector_text[vector_text.shape[0]-1,:], vector_text)
    sentences_with_comparator['similarity'] = similarity[0]

    ranked = (sentences_with_comparator
              .sort_values('similarity', ascending=False)
              .reset_index(drop=True))
    final_result = ranked[ranked['from_doc'] != 'comparateur']

    if doclist is not None and len(doclist) > 0:
        final_result = final_result[final_result['from_doc'].isin(doclist)]

    list_sentences = [
        f'Tiré du document : {doc} | Prévalence dans la phrase : {score} \n __ \n \n {sentence} \n __ \n \n'
        for sentence, doc, score in zip(final_result['sentence'],
                                        final_result['from_doc'],
                                        final_result['similarity'])
    ]
    return ' '.join(list_sentences)
    

In [306]:
print(search_tokens(['sociale']))

Tiré du document : lomonaco 1.txt | Prévalence dans la phrase : 0.20122345559870214 
 __ 
 
 Traité de psychologie sociale 
 __ 
 
 Tiré du document : girandola 3.txt | Prévalence dans la phrase : 0.19400130669475918 
 __ 
 
 Intégration sociale Resultats 
 __ 
 
 Tiré du document : girandola 2.txt | Prévalence dans la phrase : 0.16654433087118806 
 __ 
 
 Plus grande valeur sociale 
 __ 
 
 Tiré du document : girandola 3.txt | Prévalence dans la phrase : 0.15871111879061645 
 __ 
 
 Les explications internes permettent une valorisation sociale. 
 __ 
 
 Tiré du document : girandola 2.txt | Prévalence dans la phrase : 0.14558004690914536 
 __ 
 
 La norme d’internalité est une norme sociale. 
 __ 
 
 Tiré du document : différentielle.txt | Prévalence dans la phrase : 0.13138607847157155 
 __ 
 
  Intelligence : caractéristiques d’adaptation (environnementale, sociale…) 
 __ 
 
 Tiré du document : girandola 3.txt | Prévalence dans la phrase : 0.12738816870944822 
 __ 
 
 Jugés plus posi

In [239]:
wtv = search_tokens(['sociale'])
wtv

Unnamed: 0,sentence,from_doc,similarity
1,Traité de psychologie sociale,lomonaco 1.txt,0.201223
2,Intégration sociale Resultats,girandola 3.txt,0.194001
3,Plus grande valeur sociale,girandola 2.txt,0.166544
4,Les explications internes permettent une valor...,girandola 3.txt,0.158711
5,La norme d’internalité est une norme sociale.,girandola 2.txt,0.14558
6,Intelligence : caractéristiques d’adaptation ...,différentielle.txt,0.131386
7,Jugés plus positivement sur l’intégration sociale,girandola 3.txt,0.127388
8,Le groupe est un objet essentiel de la psychol...,girandola 2.txt,0.11962
9,Explication causale : un des domaines de prédi...,lomonaco 1.txt,0.106923
10,Elle est très importante en psychologie social...,girandola 1.txt,0.101711


In [262]:
words_of_interest = ['face','tâche']

In [263]:
# Index labels of the rows of `wtv` whose sentence contains every word of
# interest (case-insensitive substring test).
match_index = [
    index
    for index, sentence in wtv['sentence'].items()
    if all(word.lower() in sentence.lower() for word in words_of_interest)
]

In [265]:
match_index

[24]

In [307]:
def search_tokens(tokens_list,doclist=None):
    """Rank the sentences that contain all of the given tokens.

    Candidate sentences are those linked to at least one of the tokens; they
    are scored by the cosine similarity, in TF-IDF space, with the
    pseudo-sentence made of the searched tokens, then restricted to the
    sentences in which every searched word appears (case-insensitive
    substring test).

    Args:
        tokens_list: list of token strings to look up (exact match on the
            `token` field of the `tokens` collection).
        doclist: optional collection of document names; when non-empty, only
            sentences coming from these documents are returned.

    Returns:
        A DataFrame with columns 'sentence', 'from_doc' and 'similarity',
        sorted by decreasing similarity. It is empty when none of the tokens
        exists in the corpus or when no sentence contains all the words.
    """
    # Values are passed as bind variables rather than interpolated into the
    # query text, so quotes in a token cannot break (or alter) the query.
    tokens_ids = list(db.aql.execute(
        '''
        FOR token IN tokens
            FILTER token.token IN @selection
            RETURN token._id
        ''',
        bind_vars={'selection': list(tokens_list)}))

    # A single depth-2 traversal per token yields the sentence (intermediate
    # vertex) together with its document (end vertex), so the two can never
    # be paired incorrectly.
    rows = []
    for token_id in tokens_ids:
        rows.extend(db.aql.execute(
            '''
            FOR doc, edge, path IN 2..2 OUTBOUND @start is_from
                RETURN {"sentence": path.vertices[1].content,
                        "from_doc": doc.doc_name}
            ''',
            bind_vars={'start': token_id}))

    if not rows:
        return pd.DataFrame(columns=['sentence', 'from_doc', 'similarity'])

    df_recherche = pd.DataFrame(rows, columns=['sentence', 'from_doc']).drop_duplicates()

    # The query is appended as a pseudo-sentence ('comparateur') so that it
    # is vectorised in the same TF-IDF space as the corpus sentences.
    sentences_with_comparator = pd.concat(
        [df_recherche,
         pd.DataFrame({'sentence': [' '.join(tokens_list)],
                       'from_doc': ['comparateur']})]
    ).reset_index(drop=True)

    vector_text = TfidfVectorizer().fit_transform(sentences_with_comparator['sentence'])
    # The comparator is the last row: compare it with every row.
    similarity = cosine_similarity(vector_text[vector_text.shape[0]-1,:], vector_text)
    sentences_with_comparator['similarity'] = similarity[0]

    ranked = (sentences_with_comparator
              .sort_values('similarity', ascending=False)
              .reset_index(drop=True))
    sentences = ranked[ranked['from_doc'] != 'comparateur']

    # Keep only the sentences in which every searched word appears.
    wanted = [word.lower() for word in tokens_list]
    match_index = [
        index
        for sentence, index in zip(sentences['sentence'], sentences.index)
        if all(word in sentence.lower() for word in wanted)
    ]
    final_result = sentences.loc[match_index,:]

    if doclist is not None and len(doclist) > 0:
        return final_result[final_result['from_doc'].isin(doclist)]
    return final_result

In [313]:
df_results = search_tokens(['sociale'],['girandola 1.txt','girandola 2.txt'])
df_results

Unnamed: 0,sentence,from_doc,similarity
3,Plus grande valeur sociale,girandola 2.txt,0.166544
5,La norme d’internalité est une norme sociale.,girandola 2.txt,0.14558
8,Le groupe est un objet essentiel de la psychol...,girandola 2.txt,0.11962
10,Elle est très importante en psychologie social...,girandola 1.txt,0.101711
11,Permet de produire une image positive en situa...,girandola 2.txt,0.099689
12,Plusieurs auteurs considèrent que l’erreur fon...,girandola 2.txt,0.098912
14,La norme repose toujours sur une attribution d...,girandola 2.txt,0.090464
17,"La norme sociale d’internalité est valorisant,...",girandola 2.txt,0.084735
19,La norme fait l’objet d’un apprentissage socia...,girandola 2.txt,0.082896
25,Norme d’internalité : la valorisation sociale ...,girandola 2.txt,0.064599


## fonction trop grande

In [314]:

def search_tokens(tokens_list,doclist=None):
    """Rank the sentences that contain all of the given tokens.

    Candidate sentences are those linked to at least one of the tokens; they
    are scored by the cosine similarity, in TF-IDF space, with the
    pseudo-sentence made of the searched tokens, then restricted to the
    sentences in which every searched word appears (case-insensitive
    substring test).

    Args:
        tokens_list: list of token strings to look up (exact match on the
            `token` field of the `tokens` collection).
        doclist: optional collection of document names; when non-empty, only
            sentences coming from these documents are returned.

    Returns:
        None when none of the tokens exists in the corpus. Otherwise a
        DataFrame with columns 'sentence', 'from_doc' and 'similarity',
        sorted by decreasing similarity; it is empty when no sentence
        contains all the words (an empty frame rather than a string, so
        callers can always test `.shape`).
    """
    # Values are passed as bind variables rather than interpolated into the
    # query text, so quotes in a token cannot break (or alter) the query.
    tokens_ids = list(db.aql.execute(
        '''
        FOR token IN tokens
            FILTER token.token IN @selection
            RETURN token._id
        ''',
        bind_vars={'selection': list(tokens_list)}))

    if not tokens_ids:
        # Unknown word(s): callers rely on None for this case.
        return None

    # Sentences are gathered for ALL matching tokens before ranking. A single
    # depth-2 traversal per token yields the sentence (intermediate vertex)
    # together with its document (end vertex).
    rows = []
    for token_id in tokens_ids:
        rows.extend(db.aql.execute(
            '''
            FOR doc, edge, path IN 2..2 OUTBOUND @start is_from
                RETURN {"sentence": path.vertices[1].content,
                        "from_doc": doc.doc_name}
            ''',
            bind_vars={'start': token_id}))

    if not rows:
        return pd.DataFrame(columns=['sentence', 'from_doc', 'similarity'])

    df_recherche = pd.DataFrame(rows, columns=['sentence', 'from_doc']).drop_duplicates()

    # The query is appended as a pseudo-sentence ('comparateur') so that it
    # is vectorised in the same TF-IDF space as the corpus sentences.
    sentences_with_comparator = pd.concat(
        [df_recherche,
         pd.DataFrame({'sentence': [' '.join(tokens_list)],
                       'from_doc': ['comparateur']})]
    ).reset_index(drop=True)

    vector_text = TfidfVectorizer().fit_transform(sentences_with_comparator['sentence'])
    # The comparator is the last row: compare it with every row.
    similarity = cosine_similarity(vector_text[vector_text.shape[0]-1,:], vector_text)
    sentences_with_comparator['similarity'] = similarity[0]

    ranked = (sentences_with_comparator
              .sort_values('similarity', ascending=False)
              .reset_index(drop=True))
    sentences = ranked[ranked['from_doc'] != 'comparateur']

    # Keep only the sentences in which every searched word appears.
    wanted = [word.lower() for word in tokens_list]
    match_index = [
        index
        for sentence, index in zip(sentences['sentence'], sentences.index)
        if all(word in sentence.lower() for word in wanted)
    ]
    final_result = sentences.loc[match_index,:]

    if doclist is not None and len(doclist) > 0:
        return final_result[final_result['from_doc'].isin(doclist)]
    return final_result

In [318]:
type(search_tokens(['attribn'],['lomonaco 2.txt']))

NoneType

# todo :
construire une requête qui récupère seulement les textes où tous les mots sont présents, et pas seulement l'un d'entre eux.

In [None]:
def search(liste_words,black_liste_words=()):
    """Filter the indicator names by keywords, ignoring case.

    Args:
        liste_words: keywords that must ALL appear in a name.
        black_liste_words: optional keywords; a name containing ANY of them
            is discarded.

    Returns:
        The names of the module-level `indicNames` sequence that satisfy
        both conditions, in their original order.
    """
    # Names are compared in lower case, so the keywords must be lower-cased
    # as well; otherwise a keyword containing a capital could never match.
    wanted = [word.lower() for word in liste_words]
    unwanted = [word.lower() for word in black_liste_words]

    filtered_list = []
    for name in indicNames:
        lowered = name.lower()
        if all(word in lowered for word in wanted) and \
                not any(word in lowered for word in unwanted):
            filtered_list.append(name)
    return filtered_list

In [296]:

def search(input1,input2):
    """Run a search and return the text to display.

    Args:
        input1: search terms as one whitespace-separated string, or None.
        input2: selection of documents passed to `filter_on_docs` to
            restrict the results, or None for no restriction.

    Returns:
        The formatted results produced by `df_to_text`, or a message
        explaining why there is nothing to show.
    """
    # No usable search term yet (None or blank), whatever the document
    # selection is: nothing to search for.
    if input1 is None or not input1.split():
        return 'En attente de termes de recherche.'

    # split() without argument ignores repeated spaces, so no empty token
    # is sent to the search.
    df_search = search_tokens(input1.split())
    if df_search is None:
        return "Mot non trouvé dans le corpus."
    if df_search.shape[0] == 0:
        return "Un des mots recherchés est absent du corpus."

    if input2 is None:
        return df_to_text(df_search)

    df_search_filtered = filter_on_docs(df_search,input2)
    if df_search_filtered is None:
        return 'Terme absent dans les documents sélectionnés.'
    return df_to_text(df_search_filtered)
    

['hello', 'boi']