In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the CSV file into a DataFrame.
# The path is a raw string: in a normal string literal the backslash pairs
# \P, \M, \d and \m are invalid escape sequences, which newer Python versions
# warn about. The resulting path value is unchanged.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
file_path = r'G:\Praktikum\MLE\Master_mle\data\mqp.csv'
df = pd.read_csv(file_path, names=['claster', 'lable', 'phrase', 'target'])

df.head()


Unnamed: 0,claster,lable,phrase,target
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0


In [3]:
# info() prints its summary itself and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048 entries, 0 to 3047
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   claster  3048 non-null   int64 
 1   lable    3048 non-null   object
 2   phrase   3048 non-null   object
 3   target   3048 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 95.4+ KB


None

In [4]:
# Create a TF-IDF vectorizer with the library's default settings
vectorizer = TfidfVectorizer()

# Fit the vectorizer on every phrase and encode the phrases in the same pass;
# the rows of `phrase_vectors` follow the row order of `df`
phrase_vectors = vectorizer.fit_transform(df.loc[:, 'phrase'].tolist())

# Function to find the n_neighbors most similar phrases (5 by default)
def find_nearest_neighbors(query, phrase_vectors, data, n_neighbors=5):
    """Return the phrases in ``data`` that are most similar to ``query``.

    The query is encoded with the notebook-level ``vectorizer`` (it must
    already be fitted) and compared with every row of ``phrase_vectors``
    using cosine similarity.

    Args:
        query: Text to search for.
        phrase_vectors: Matrix of encoded phrases, one row per row of
            ``data`` and in the same order.
        data: DataFrame with a ``phrase`` column; rows are looked up by
            position.
        n_neighbors: Maximum number of phrases to return.

    Returns:
        A list of at most ``n_neighbors`` phrases, most similar first.

    Raises:
        ValueError: If ``n_neighbors`` is negative.
    """
    if n_neighbors < 0:
        raise ValueError(f"n_neighbors must be non-negative, got {n_neighbors}")
    if n_neighbors == 0:
        # Previously `argsort()[-0:]` selected the whole array, so asking for
        # zero neighbours returned every phrase.
        return []

    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, phrase_vectors).flatten()
    # argsort is ascending: reverse it, then keep the leading n_neighbors
    nearest_indices = similarities.argsort()[::-1][:n_neighbors]
    nearest_phrases = data['phrase'].iloc[nearest_indices].tolist()
    return nearest_phrases

# Try the search with a sample question
query = "What are normal levels of ANA in a healthy person?"
nearest_neighbors = find_nearest_neighbors(
    query=query,
    phrase_vectors=phrase_vectors,
    data=df,
)

nearest_neighbors

['What are normal levels of ANA in a healthy person?',
 'I have abnormal levels of T3 and normal levels of TSH/T4. What are my treatment options if I am currently pregnant?',
 'What are some conditions that can cause abnormal levels of T3 but normal levels of T4 and TSH? Can pregnancy also cause it?',
 'My vitamin D levels is 29. The doctor advised me to take 2000ius/day and maximum of sun in Fl, so that normal levels of 70-80 can be achieved. Could you tell me, how long would it take to reach the normal levels? ',
 'What does a value of ANA 1:160 and RNP 3 mean?']

In [6]:

# NOTE(review): this repeats the vectorizer setup already done in an earlier cell.
# Create a TF-IDF vectorizer with the library's default settings
vectorizer = TfidfVectorizer()

# Fit the vectorizer on every phrase and encode the phrases in the same pass;
# the rows of `phrase_vectors` follow the row order of `df`
phrase_vectors = vectorizer.fit_transform(df.loc[:, 'phrase'].tolist())

# Function to find the n_neighbors most similar phrases (5 by default)
# NOTE(review): a function with this name is also defined in an earlier cell;
# this later definition is the one in effect once the cell has run.
def find_nearest_neighbors(query, phrase_vectors, data, n_neighbors=5):
    """Return the phrases in ``data`` that are most similar to ``query``.

    The query is encoded with the notebook-level ``vectorizer`` (it must
    already be fitted) and compared with every row of ``phrase_vectors``
    using cosine similarity.

    Args:
        query: Text to search for.
        phrase_vectors: Matrix of encoded phrases, one row per row of
            ``data`` and in the same order.
        data: DataFrame with a ``phrase`` column; rows are looked up by
            position.
        n_neighbors: Maximum number of phrases to return.

    Returns:
        A list of at most ``n_neighbors`` phrases, most similar first.

    Raises:
        ValueError: If ``n_neighbors`` is negative.
    """
    if n_neighbors < 0:
        raise ValueError(f"n_neighbors must be non-negative, got {n_neighbors}")
    if n_neighbors == 0:
        # Previously `argsort()[-0:]` selected the whole array, so asking for
        # zero neighbours returned every phrase.
        return []

    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, phrase_vectors).flatten()
    # argsort is ascending: reverse it, then keep the leading n_neighbors
    nearest_indices = similarities.argsort()[::-1][:n_neighbors]
    nearest_phrases = data['phrase'].iloc[nearest_indices].tolist()
    return nearest_phrases

# Try the search with a sample question
query = "What are normal levels of ANA in a healthy person?"
nearest_neighbors = find_nearest_neighbors(
    query=query,
    phrase_vectors=phrase_vectors,
    data=df,
)

nearest_neighbors

['What are normal levels of ANA in a healthy person?',
 'I have abnormal levels of T3 and normal levels of TSH/T4. What are my treatment options if I am currently pregnant?',
 'What are some conditions that can cause abnormal levels of T3 but normal levels of T4 and TSH? Can pregnancy also cause it?',
 'My vitamin D levels is 29. The doctor advised me to take 2000ius/day and maximum of sun in Fl, so that normal levels of 70-80 can be achieved. Could you tell me, how long would it take to reach the normal levels? ',
 'What does a value of ANA 1:160 and RNP 3 mean?']

In [7]:
# Function to attach similarity scores to a copy of the DataFrame
def add_similarity_scores(query, phrase_vectors, data):
    """Return ``data`` with a ``similarity`` column scored against ``query``.

    The query is encoded with the notebook-level ``vectorizer`` (it must
    already be fitted) and compared with every row of ``phrase_vectors``
    using cosine similarity.

    Args:
        query: Text to score the phrases against.
        phrase_vectors: Matrix of encoded phrases, one row per row of
            ``data`` and in the same order.
        data: DataFrame to annotate. It is left unmodified.

    Returns:
        A new DataFrame with the columns of ``data`` plus ``similarity``,
        holding each row's cosine similarity to the query.
    """
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, phrase_vectors).flatten()
    # assign() builds a new frame instead of writing into the caller's, so
    # frames displayed by earlier cells stay accurate and re-running is safe.
    return data.assign(similarity=similarities)

# Score every phrase against a sample question
query = "What are normal levels of ANA in a healthy person?"
data_with_similarity = add_similarity_scores(
    query=query,
    phrase_vectors=phrase_vectors,
    data=df,
)

data_with_similarity.head()

Unnamed: 0,claster,lable,phrase,target,similarity
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1,0.017511
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0,0.017564
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1,0.0
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0,0.020084
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0,0.0
