# 0. Importing libraries

In [None]:
!pip install autocorrect



In [None]:
import pandas as pd
import numpy as np

import string as st
import re

from sklearn.linear_model import LogisticRegression

from nltk import PorterStemmer, WordNetLemmatizer
import nltk
from autocorrect import Speller 

# Fetch the NLTK resources needed at runtime: the English stop-word list
# (read in remove_stopwords) and WordNet (used by WordNetLemmatizer).
# 'punkt' is downloaded as well, although no NLTK tokenizer is called in the
# cells visible in this notebook.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Text Cleaning

In [None]:
# Load the raw (text, query) pairs; the file uses ';' as field separator.
raw_df = pd.read_csv("raw_dataset_extended.csv", sep=';')
# Display the last rows as a quick check that the file was parsed as expected.
raw_df.tail()

Unnamed: 0,text,query
357,Urate renal clearance/1.73 sq M in Urine and S...,urate in urine
358,Urate/Creatinine [Molar ratio] in Urine,urate in urine
359,Urate/Creatinine [Mass Ratio] in Urine,urate in urine
360,Urate/Creatinine [Mass Ratio] in 24 hour Urine,urate in urine
361,Urate/Creatinine [Molar ratio] in 24 hour Urine,urate in urine


In [None]:
len(raw_df)

362

Initial preprocessing of raw text

In [None]:
# Substitutions applied to the preprocessed document text: each key is the
# pattern to look for and each value the text that replaces it (the dict is
# passed to Series.replace with regex=True after the text has been cleaned).
replacements = {
    'leukocyte': 'white blood cell',
}

In [None]:
def remove_punct(text):
    """Return `text` with every ASCII punctuation character turned into a space.

    Punctuation is replaced (not deleted) so that words separated only by a
    symbol, e.g. "Urate/Creatinine", remain separate words.
    """
    # One-to-one translation table: each character of string.punctuation -> " ".
    punct_to_space = str.maketrans(st.punctuation, " " * len(st.punctuation))
    return text.translate(punct_to_space)

def tokenize(text):
    """Split `text` on whitespace and return the lower-cased tokens.

    str.split() with no separator is used instead of a regular-expression
    split because it discards leading/trailing whitespace and never yields
    empty strings. remove_punct() turns punctuation into spaces, so text that
    starts or ends with a symbol (e.g. "... [Mass Ratio]") would otherwise
    produce a spurious '' token.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order; [] for empty or
        whitespace-only input.
    """
    return [token.lower() for token in text.split()]

def remove_stopwords(text):
    """Drop English stop words from a list of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens (already lower-cased by tokenize) to filter.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    # Load the stop-word list once per call and keep it in a set: the previous
    # version re-read the corpus list and scanned it linearly for every token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in stop_words]

def replace(text):
    """Unimplemented placeholder: ignores `text` and always returns None.

    NOTE(review): this function is not called by preprocess_element or by any
    other cell visible in this notebook - confirm before relying on it.
    """
    # TODO: implement or remove; the body has never done any work.
    return None

def lemmatize(text):
    """Return the WordNet lemma of every token in `text`, preserving order.

    `text` is an iterable of tokens; a new list is returned.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))

def return_sentences(tokens):
    """Join the tokens back into one string, separated by single spaces."""
    return " ".join(tokens)

def preprocess_element(text):
    """Run the full cleaning pipeline on one raw string.

    Steps, in order: punctuation -> spaces, tokenize + lower-case, drop
    stop words, lemmatize, and join the tokens back into a single string.
    """
    without_punct = remove_punct(text)
    tokens = tokenize(without_punct)
    content_tokens = remove_stopwords(tokens)
    lemmas = lemmatize(content_tokens)
    return return_sentences(lemmas)

# Clean the document texts, then apply the synonym substitutions to them.
raw_df['text'] = raw_df['text'].apply(preprocess_element)
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form operates on an intermediate object,
# which raises a chained-assignment warning in recent pandas and does not
# update raw_df at all when Copy-on-Write is enabled.
raw_df['text'] = raw_df['text'].replace(replacements, regex=True)
# Queries go through the same cleaning (the substitutions are only applied to
# the document text, as before).
raw_df['query'] = raw_df['query'].apply(preprocess_element)
raw_df.head()

Unnamed: 0,text,query
0,c reactive protein mass volume serum plasma,glucose blood
1,bicarbonate mole volume blood,glucose blood
2,rh type blood,glucose blood
3,trimethoprim sulfamethoxazole susceptibility,glucose blood
4,bilirubin total mass volume serum plasma,glucose blood


# 2. Dataset Generation

Preliminary calculations

In [None]:
queries = np.unique(raw_df['query'])
terms = ['glucose', 'blood', 'bilirubin', 'plasma', 'white', 'cell', 'count', 'urine', 'urate']
threshold = 20 # There are 68 documents for each query

# The number of documents containing each term is precomputed.
# A term is counted only when it appears as a whole token of the document:
# a plain `term in doc` test on the string also matches substrings (e.g.
# 'urate' inside 'saturated'), which disagrees with the token-based counts
# used by calculate_row and by the dataset generation.
terms_frec = {term: 0 for term in terms}
for doc in raw_df['text']:
    doc_tokens = set(doc.split())
    for term in terms:
        if term in doc_tokens:
            terms_frec[term] += 1

print(terms_frec)

# The number of documents is precomputed
num_docs = len(raw_df)

{'glucose': 129, 'blood': 48, 'bilirubin': 21, 'plasma': 92, 'white': 6, 'cell': 6, 'count': 0, 'urine': 163, 'urate': 49}


In [None]:
grouped_raw_df = raw_df.groupby('query')

# Label the first `threshold` rows of every query group as relevant (1) and
# the remaining rows as not relevant (0). cumcount() is 0-based, hence the +1
# to obtain the 1-based position of each row inside its group.
raw_df['relevance'] = (grouped_raw_df.cumcount() + 1).le(threshold).astype('int64')

In [None]:
raw_df.head(70)

Unnamed: 0,text,query,relevance
0,c reactive protein mass volume serum plasma,glucose blood,1
1,bicarbonate mole volume blood,glucose blood,1
2,rh type blood,glucose blood,1
3,trimethoprim sulfamethoxazole susceptibility,glucose blood,1
4,bilirubin total mass volume serum plasma,glucose blood,1
...,...,...,...
65,calcium mole volume corrected albumin serum pl...,glucose blood,0
66,antibiotic susceptibility,glucose blood,0
67,blood product unit id,bilirubin plasma,1
68,alanine aminotransferase enzymatic activity vo...,bilirubin plasma,1


In [None]:
def calculate_row(doc, query, term):
    """Compute the six log-scaled features of one (document, query, term) triple.

    Reads the module-level `terms_frec` (documents containing each term) and
    `num_docs` (total number of documents).

    Parameters
    ----------
    doc : str
        Preprocessed document text; tokens are separated by whitespace.
    query : str
        Preprocessed query text; tokens are separated by whitespace.
    term : str
        Term to describe; must be a key of `terms_frec`.

    Returns
    -------
    list of numpy.float64
        Natural logarithms of, in this order:
        qaf  - occurrences of `term` among the query tokens,
        qrf  - qaf divided by the number of query tokens,
        daf  - occurrences of `term` among the document tokens,
        drf  - daf divided by the number of document tokens,
        idf  - terms_frec[term] / num_docs (despite the name this is the plain
               document-frequency ratio, not its inverse),
        rfad - terms_frec[term] divided by the sum of all values in terms_frec.
        A feature whose raw value is 0 comes back as -inf; a warning is issued.

    Raises
    ------
    KeyError
        If `term` is not a key of `terms_frec`.
    """
    import warnings

    # Arrays (rather than lists) make `== term` an element-wise comparison.
    doc_arr = np.array(doc.split())
    query_arr = np.array(query.split())

    qaf = np.sum(query_arr == term)
    qrf = qaf/len(query_arr)

    daf = np.sum(doc_arr == term)
    drf = daf/len(doc_arr)

    idf = terms_frec[term]/num_docs

    # NOTE(review): the original author was unsure about this definition.
    rfad = terms_frec[term]/np.sum(list(terms_frec.values()))

    names = ['qaf', 'qrf', 'daf', 'drf', 'idf', 'rfad']
    values = [qaf, qrf, daf, drf, idf, rfad]

    # Replace the former debug print with one explicit, contextual warning:
    # log(0) is -inf, which is almost certainly not a usable feature value.
    zero_features = [name for name, value in zip(names, values) if value == 0]
    if zero_features:
        warnings.warn(
            "calculate_row: {} equal to 0 for term {!r} (doc {!r}, query {!r}); "
            "the corresponding log features are -inf".format(
                ', '.join(zero_features), term, doc, query),
            RuntimeWarning,
            stacklevel=2,
        )

    # The warning above already reports zeros, so silence numpy's generic
    # "divide by zero encountered in log" message.
    with np.errstate(divide='ignore'):
        return list(np.log(values))

In [None]:
calculate_row('bicarbonate moles volume blood', 'glucose in blood', 'blood')

[0.0,
 -1.0986122886681098,
 0.0,
 -1.3862943611198906,
 -2.0204432009178803,
 -2.3710222545472743]

Generation of dataset

In [None]:
# Build one row per (document, query, term) where the query term occurs as a
# whole token of the document.
data = []
# Columns are selected by label: positional access such as row[1][0] on a
# label-indexed Series is deprecated in recent pandas.
for doc, query, relevance in raw_df[['text', 'query', 'relevance']].itertuples(index=False, name=None):
  # Tokenize the document once per row; str.split() never yields empty tokens.
  doc_tokens = set(doc.split())

  for term in query.split():
    if term in doc_tokens:
      data.append([doc,query,term,relevance])

dataset = pd.DataFrame(data, columns = ['document','query','term','relevance'])
dataset.head()

Unnamed: 0,document,query,term,relevance
0,bicarbonate mole volume blood,glucose blood,blood,1
1,rh type blood,glucose blood,blood,1
2,blood group antibody screen presence serum plasma,glucose blood,blood,1
3,glucose mole volume urine,glucose blood,glucose,1
4,lymphocyte volume blood,glucose blood,blood,1


In [None]:
# Turn every (document, query, term) row into its six log features followed
# by the relevance label.
data = []

# Columns are selected by label: positional access such as row[1][0] on a
# label-indexed Series is deprecated in recent pandas.
for doc, query, term, relevance in dataset[['document', 'query', 'term', 'relevance']].itertuples(index=False, name=None):
  data.append(calculate_row(doc,query,term)+[relevance])

dataset_attributes = pd.DataFrame(data, columns = ['qaf','qrf','daf','drf','idf','rfad','relevance'])
dataset_attributes.head()

Unnamed: 0,qaf,qrf,daf,drf,idf,rfad,relevance
0,0.0,-0.693147,0.0,-1.386294,-2.020443,-2.371022,1
1,0.0,-0.693147,0.0,-1.098612,-2.020443,-2.371022,1
2,0.0,-0.693147,0.0,-1.94591,-2.020443,-2.371022,1
3,0.0,-0.693147,0.0,-1.386294,-1.031832,-1.382411,1
4,0.0,-0.693147,0.0,-1.098612,-2.020443,-2.371022,1


In [None]:
dataset_attributes.relevance.value_counts()

0    296
1    101
Name: relevance, dtype: int64

In [None]:
# Alias, not a copy: `df` and `dataset_attributes` refer to the same DataFrame,
# so a change made through one name is visible through the other.
df = dataset_attributes

In [None]:
# Disabled code kept as a string literal (it is never executed; the cell only
# echoes the text). It pairs every document with every query in `queries` and
# every term in `terms`, and would rebind `df` to a frame indexed by
# (Doc, Query, Term). NOTE(review): looks like an earlier approach - confirm
# whether it can be deleted.
'''list_rows = []
indices = []

for row in raw_df.iterrows():
    doc = row[1][0]
    true_query = row[1][1]
    relevant = row[1][2]
    for query in queries:
        for term in terms:
            if term in re.split('\s+',doc) and term in re.split('\s+',query):
                list_rows.append(calculate_row(doc,query,term) + [relevant])
                indices.append((doc,query,term))


index = pd.MultiIndex.from_tuples(indices, names=["Doc", "Query", "Term"])
df = pd.DataFrame(list_rows, columns=['qaf','qrf','daf','drf','idf','rfad','relevance'],index=index)
df.head()'''

'list_rows = []\nindices = []\n\nfor row in raw_df.iterrows():\n    doc = row[1][0]\n    true_query = row[1][1]\n    relevant = row[1][2]\n    for query in queries:\n        for term in terms:\n            if term in re.split(\'\\s+\',doc) and term in re.split(\'\\s+\',query):\n                list_rows.append(calculate_row(doc,query,term) + [relevant])\n                indices.append((doc,query,term))\n\n\nindex = pd.MultiIndex.from_tuples(indices, names=["Doc", "Query", "Term"])\ndf = pd.DataFrame(list_rows, columns=[\'qaf\',\'qrf\',\'daf\',\'drf\',\'idf\',\'rfad\',\'relevance\'],index=index)\ndf.head()'

In [None]:
print(df)

     qaf       qrf  daf       drf       idf      rfad  relevance
0    0.0 -0.693147  0.0 -1.386294 -2.020443 -2.371022          1
1    0.0 -0.693147  0.0 -1.098612 -2.020443 -2.371022          1
2    0.0 -0.693147  0.0 -1.945910 -2.020443 -2.371022          1
3    0.0 -0.693147  0.0 -1.386294 -1.031832 -1.382411          1
4    0.0 -0.693147  0.0 -1.098612 -2.020443 -2.371022          1
..   ...       ...  ...       ...       ...       ...        ...
392  0.0 -0.693147  0.0 -1.609438 -0.797894 -1.148473          0
393  0.0 -0.693147  0.0 -1.945910 -1.999824 -2.350403          0
394  0.0 -0.693147  0.0 -1.945910 -0.797894 -1.148473          0
395  0.0 -0.693147  0.0 -1.945910 -1.999824 -2.350403          0
396  0.0 -0.693147  0.0 -1.945910 -0.797894 -1.148473          0

[397 rows x 7 columns]


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   qaf        397 non-null    float64
 1   qrf        397 non-null    float64
 2   daf        397 non-null    float64
 3   drf        397 non-null    float64
 4   idf        397 non-null    float64
 5   rfad       397 non-null    float64
 6   relevance  397 non-null    int64  
dtypes: float64(6), int64(1)
memory usage: 21.8 KB


In [None]:
# Minimum of every column, labelled by column name. This replaces a list
# comprehension that was used only for its print() side effects and therefore
# also displayed a meaningless [None, None, ...] result.
df.min()

0.0
-1.3862943611198906
0.0
-2.5649493574615367
-4.099884742597716
-4.45046379622711
0


[None, None, None, None, None, None, None]

# 3. Model implementation

In [None]:
# Feature matrix: every column except the target; label vector: 'relevance'.
X = df.drop(columns='relevance').to_numpy()
y = df['relevance'].to_numpy()

# Fixed random_state keeps the fit reproducible across runs.
regressor = LogisticRegression(random_state = 3)
clf = regressor.fit(X,y)

In [None]:
print('Intercept: ', clf.intercept_, '\nCoefficients: ',clf.coef_)

Intercept:  [2.80467817] 
Coefficients:  [[ 0.          0.81844694 -1.60958123  1.81480656 -0.08577098 -0.08586791]]


In [None]:
# Prior probability of relevance, computed here as `threshold` divided by the
# number of distinct preprocessed document texts.
prior = threshold/raw_df['text'].nunique()
print("Prior:", prior)
# Log-odds of the prior. predict_relevance subtracts it from every per-term
# score and adds it back once after the terms have been summed.
logprior = np.log(prior/(1-prior))
print("Log of prior:", logprior)

Prior: 0.08849557522123894
Log of prior: -2.33214389523559


In [None]:
def predict_relevance(clf,doc,query,terms):
    """Estimate the probability that `doc` is relevant to `query`.

    Both texts are cleaned with preprocess_element. For every term of `terms`
    that is a token of both the query and the document, the fitted model's
    log-odds (intercept + coefficients . features from calculate_row) minus the
    module-level `logprior` is accumulated; `logprior` is then added back once
    and the total is passed through the logistic function.

    Parameters
    ----------
    clf : fitted estimator exposing `intercept_` and `coef_`
        The trained logistic-regression model.
    doc : str
        Raw document text.
    query : str
        Raw query text.
    terms : iterable of str
        Candidate terms; each must be a key of `terms_frec`.

    Returns
    -------
    float
        Probability in [0, 1]. When no term is shared, this is the logistic
        function of `logprior` alone.
    """
    doc = preprocess_element(doc)
    query = preprocess_element(query)

    # Match whole tokens. A substring test on the strings (`term in doc`) can
    # select a term that is not an actual token, in which case calculate_row
    # counts zero occurrences and returns log(0) = -inf features.
    doc_tokens = set(doc.split())
    query_tokens = set(query.split())

    sumlogs = np.array([0.0])
    for term in terms:
        if term in query_tokens and term in doc_tokens:
            sumlogs += (clf.intercept_ + np.dot(clf.coef_,calculate_row(doc,query,term))) - logprior
    # sumlogs is a 1-element numpy array; .item() below extracts the scalar.
    sumlogs = sumlogs + logprior
    return np.ndarray.item(1/(1+np.exp(-sumlogs)))

In [None]:
predict_relevance(clf,"Urate [Mass/volume] in 2 hour Urine","urate in urine",terms)

0.6992864560973644

# 4. Dataset output

In [None]:
# Write the feature table to disk; with the default arguments pandas also
# writes the row index as the first (unnamed) column.
df.to_csv('traindataset.csv')