In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
!pip install pyTigerGraph

Collecting pyTigerGraph
  Downloading pyTigerGraph-1.6.2-py3-none-any.whl (267 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting validators (from pyTigerGraph)
  Downloading validators-0.33.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: validators, pyTigerGraph
Successfully installed pyTigerGraph-1.6.2 validators-0.33.0


In [None]:
import time
import random as rand
from pathlib import Path, PurePosixPath
import pyTigerGraph as tg
import os
from shutil import copyfile

In [None]:
# Print the version of the Python interpreter running this kernel.
from platform import python_version
print(python_version())

3.10.12


In [None]:
# TigerGraph connection settings. Each value can be overridden through an
# environment variable so that real credentials never need to be written into
# the notebook; the fallbacks are the values this notebook originally used.
hostName = os.environ.get("TG_HOST", "localhost")         # TG server hostname
userName = os.environ.get("TG_USERNAME", "tigergraph")    # TG user name
passWord = os.environ.get("TG_PASSWORD", "tigergraph")    # TG password

In [None]:
# Query tuning parameters.
topK = 10                                         # Number of highest scoring drugs
numDevices = 1                                    # Number of FPGA devices to distribute the queries to

In [None]:
# Root directory of the locally installed repository
localRepoLocation = Path("/opt/xilinx/apps")
# Location of the drug-similarity example, relative to the root (layout used when running from the GitHub repo)
exampleLocation = Path("graphanalytics/integration/Tigergraph-3.x/1.2/examples/drug_similarity/")
# Directory holding the example's query files
queryFileLocation = localRepoLocation.joinpath(exampleLocation, "query")

In [None]:
# Read the LEX table from the working directory.
lexique = pd.read_csv('LEX.csv')

In [None]:
# Summarize the columns, non-null counts and dtypes of the LEX table.
lexique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Source  410 non-null    object
dtypes: object(1)
memory usage: 3.3+ KB


In [None]:
# (rows, columns) of the LEX table.
lexique.shape

(410, 1)

In [None]:
# Read the MRCONSO table from the working directory.
mrconso = pd.read_csv('MRCONSO.csv')

In [None]:
# (rows, columns) of the MRCONSO table.
mrconso.shape

(410, 1)

In [None]:
# Install spaCy
!pip install -U spacy

# Download the English model
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Step 0: import the library that provides the algorithm
#!pip install gensim if it is not already installed
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from gensim import models
from gensim.models import Phrases
from gensim import corpora

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import pandas as pd
import numpy as np
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the two term tables; the code below reads the 'Source' column of LEX
# and the 'Target' column of MRCONSO.
lex_df = pd.read_csv('LEX.csv')
mrconso_df = pd.read_csv('MRCONSO.csv')

# Normalize raw text before any linguistic processing
def clean_text(text):
    """Lower-case ``text``, blank out non-word characters and squeeze whitespace.

    Every character matched by ``\\W`` is replaced by a space, runs of
    whitespace are collapsed to a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    without_symbols = re.sub(r'\W', ' ', lowered)
    return re.sub(r'\s+', ' ', without_symbols).strip()

# Stopword removal
# language -> set of stopwords. Filled on first use so the NLTK corpus is read
# once per language instead of once per processed row.
_STOPWORD_SETS = {}


def remove_stopwords(text, language='english'):
    """Drop every whitespace-separated word of ``text`` that is a stopword.

    Args:
        text: The string to filter; it is split on whitespace.
        language: Name of the NLTK stopword list to use.

    Returns:
        The remaining words joined by single spaces (an empty string when
        nothing is left).
    """
    stop_words = _STOPWORD_SETS.get(language)
    if stop_words is None:
        stop_words = _STOPWORD_SETS[language] = set(stopwords.words(language))
    return ' '.join(word for word in text.split() if word not in stop_words)

# Load the spaCy pipeline that lemmatize_text runs every text through.
# NOTE(review): en_core_web_lg is spaCy's general-purpose English model, not a
# biomedical one - confirm this is the intended model for medical text.
nlp = spacy.load('en_core_web_lg')

# Reduce every token to its lemma
def lemmatize_text(text):
    """Run ``text`` through the ``nlp`` pipeline and join the token lemmas.

    Args:
        text: The string to lemmatize.

    Returns:
        The ``lemma_`` of each token, joined by single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Full preprocessing pipeline
def preprocess_text(text):
    """Apply cleaning, stopword removal and lemmatization, in that order.

    Args:
        text: The raw string to preprocess.

    Returns:
        The result of ``lemmatize_text(remove_stopwords(clean_text(text)))``.
    """
    return lemmatize_text(remove_stopwords(clean_text(text)))

# Preprocess the LEX 'Source' column and keep the cleaned strings as a plain list
lex_df['Cleaned'] = lex_df['Source'].apply(preprocess_text)
combined_lex = lex_df['Cleaned'].tolist()

# Same treatment for the MRCONSO 'Target' column
mrconso_df['Cleaned'] = mrconso_df['Target'].apply(preprocess_text)
cleaned_mrconso = mrconso_df['Cleaned'].tolist()

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Assumes lex_df and mrconso_df were already defined and preprocessed by the
# preprocessing cell ("Script 1").

# Load the tokenizer and the model from the same checkpoint
KRISSBERT_CHECKPOINT = 'microsoft/BiomedNLP-KRISSBERT-PubMed-UMLS-EL'
tokenizer = AutoTokenizer.from_pretrained(KRISSBERT_CHECKPOINT)
model = AutoModel.from_pretrained(KRISSBERT_CHECKPOINT)

# Text encoding helper
def encode_textss(texts, batch_size=32):
    """Encode texts into one mean-pooled embedding per text.

    The texts are tokenized and run through ``model`` in mini-batches. Each
    embedding is the average of the last hidden states of the text's real
    tokens: padding positions are excluded via the attention mask, so the
    result for a text does not depend on which other texts share its batch.

    Args:
        texts: A single string or a sequence of strings.
        batch_size: Maximum number of texts sent through the model at once.

    Returns:
        A tensor with one row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    if isinstance(texts, str):
        texts = [texts]  # a bare string means "one text", not a sequence of characters
    else:
        texts = list(texts)
    if not texts:
        raise ValueError("encode_textss needs at least one text to encode")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()  # switch the model to evaluation mode
    pooled_batches = []
    with torch.no_grad():  # no gradients needed: saves memory and time
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512)
            hidden = model(**encoded_input).last_hidden_state
            # 1.0 for real tokens, 0.0 for padding; broadcast over the hidden dimension
            mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled_batches, dim=0)

# Encode both term lists with the same encoder
lex_embeddingss = encode_textss(combined_lex)
mrconso_embeddingss = encode_textss(cleaned_mrconso)

# The comparison is positional (row i of LEX against row i of MRCONSO), so a
# differing row count means the two tables are misaligned.
if len(lex_embeddingss) != len(mrconso_embeddingss):
    raise ValueError(
        f"Row count mismatch: {len(lex_embeddingss)} LEX embeddings vs "
        f"{len(mrconso_embeddingss)} MRCONSO embeddings"
    )

# Row-by-row cosine similarity, computed for all rows in one vectorized call
cosine_similaritiess = torch.nn.functional.cosine_similarity(
    lex_embeddingss, mrconso_embeddingss, dim=1
).tolist()

# Display the results
for lex_term, matched_term, cos_sim in zip(lex_df['Source'], mrconso_df['Target'], cosine_similaritiess):
    print(f"{lex_term} si allinea a {matched_term} con una similarità cosinus di {cos_sim:.2f}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In the absence of improvement or if symptoms persist, seek medical advice. si allinea a In the absence of improvement or if symptoms persist beyond 7 days of treatment, seek medical advice. con una similarità cosinus di 0.99
In the absence of improvement or if symptoms persist, seek medical advice. si allinea a In the absence of improvement or if symptoms persist, seek medical advice. con una similarità cosinus di 1.00
Additionally, the patient will be warned of the necessity of a rapid consultation in the event of any abnormal vaginal bleeding. si allinea a In case of abnormal vaginal bleeding, it is important to consult your doctor as soon as possible. con una similarità cosinus di 0.98
The administration of paraffin oil to young children, debilitated persons, bedridden individuals, or those with swallowing difficulties should be cautious due to the risk of bronchial aspiration and lipoid pneumonia. si allinea a The administration of paraffin oil in young children, bedridden patients