# IREI: Search Engines & Real World Data
### Víctor Morcuende Castell and Guillermo Nájera Lavid
#### Course 2022-2023

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import ndcg_score

In [2]:
# Load data
glucose_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "glucose in blood")
bilirubin_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "bilirubin in plasma")
wbc_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "White blood cells count")
glucose_target = glucose_df.pop('relevant')
bilirubin_target = bilirubin_df.pop('relevant')
wbc_target = wbc_df.pop('relevant')

In [3]:
# We eliminate irrelevant information
glucose_df.drop('comments', axis=1)
bilirubin_df.drop('comments', axis=1)
wbc_df.drop('comments', axis=1)

Unnamed: 0,loinc_num,long_common_name,component,system,property
0,33870-7,Bilirubin.total [Presence] in Unspecified spec...,Bilirubin,XXX,PrThr
1,29265-6,Calcium [Moles/volume] corrected for albumin i...,Calcium^^corrected for albumin,Ser/Plas,SCnc
2,14423-8,Bilirubin.total [Mass/volume] in Synovial fluid,Bilirubin,Synv fld,MCnc
3,23658-8,Other Antibiotic [Susceptibility],Antibiotic XXX,Isolate,Susc
4,19000-9,Vancomycin [Susceptibility],Vancomycin,Isolate,Susc
...,...,...,...,...,...
62,26484-6,Monocytes [#/volume] in Blood,Monocytes,Bld,NCnc
63,1250-0,Major crossmatch [interpretation],Major crossmatch,Ser/Plas,Imp
64,18864-9,Ampicillin [Susceptibility],Ampicillin,Isolate,Susc
65,1742-6,Alanine aminotransferase [Enzymatic activity/v...,Alanine aminotransferase,Ser/Plas,CCnc


In [4]:
# Using OneHotEncoder
ohe = preprocessing.OneHotEncoder(sparse_output=False)
glucose_df = pd.DataFrame(ohe.fit_transform(glucose_df),
                          columns=ohe.get_feature_names_out(glucose_df.columns.tolist()),
                          index=glucose_df.index)
bilirubin_df = pd.DataFrame(ohe.fit_transform(bilirubin_df),
                            columns=ohe.get_feature_names_out(bilirubin_df.columns.tolist()),
                            index=bilirubin_df.index)
wbc_df = pd.DataFrame(ohe.fit_transform(wbc_df),
                      columns=ohe.get_feature_names_out(wbc_df.columns.tolist()),
                      index=wbc_df.index)

WE TRAIN AN ADARANK MODEL USING THE METHOD IMPORTED FROM SKLEARN

Query: Glucose in Blood

In [5]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(glucose_df, glucose_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break


Model NDCG score: 0.9406070950390077
Iterations needed to achieve a score above 0.9: 8


Query: Bilirubin in Plasma

In [6]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(bilirubin_df, bilirubin_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.923928388719314
Iterations needed to achieve a score above 0.9: 1


Query: White Blood Cells Count

In [7]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(wbc_df, wbc_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.923928388719314
Iterations needed to achieve a score above 0.9: 64


#### EXTENDING THE DATASETS

In [8]:
# Load the new data
extended_glucose_df = pd.read_excel('extended_loinc_dataset-v2.xlsx', sheet_name = "glucose in blood")
extended_bilirubin_df = pd.read_excel('extended_loinc_dataset-v2.xlsx', sheet_name = "bilirubin in plasma")
extended_wbc_df = pd.read_excel('extended_loinc_dataset-v2.xlsx', sheet_name = "White blood cells count")
extended_glucose_target = extended_glucose_df.pop('relevant')
extended_bilirubin_target = extended_bilirubin_df.pop('relevant')
extended_wbc_target = extended_wbc_df.pop('relevant')

In [9]:
# We eliminate irrelevant information
extended_glucose_df.drop('comments', axis=1)
extended_bilirubin_df.drop('comments', axis=1)
extended_wbc_df.drop('comments', axis=1)

Unnamed: 0,loinc_num,long_common_name,component,system,property
0,33870-7,Bilirubin.total [Presence] in Unspecified spec...,Bilirubin,XXX,PrThr
1,29265-6,Calcium [Moles/volume] corrected for albumin i...,Calcium^^corrected for albumin,Ser/Plas,SCnc
2,14423-8,Bilirubin.total [Mass/volume] in Synovial fluid,Bilirubin,Synv fld,MCnc
3,23658-8,Other Antibiotic [Susceptibility],Antibiotic XXX,Isolate,Susc
4,19000-9,Vancomycin [Susceptibility],Vancomycin,Isolate,Susc
...,...,...,...,...,...
82,14385-9,Mononuclear cells/100 leukocytes in Gastric fl...,Mononuclear cells/100 leukocytes,Gast fld,NFr
83,74396-3,Large unstained cells/100 leukocytes in Cord b...,Large unstained cells/100 leukocytes,BldCo,NFr
84,31160-5,Polymorphonuclear cells/100 leukocytes in Bloo...,Polymorphonuclear cells/100 leukocytes,Bld,NFr
85,40650-4,Mesothelial cells/100 leukocytes in Blood by M...,Mesothelial cells/100 leukocytes,Bld,NFr


In [10]:
# Using OneHotEncoder
ohe = preprocessing.OneHotEncoder(sparse_output=False)
extended_glucose_df = pd.DataFrame(ohe.fit_transform(extended_glucose_df), 
                                   columns=ohe.get_feature_names_out(extended_glucose_df.columns.tolist()),
                                   index=extended_glucose_df.index)
extended_bilirubin_df = pd.DataFrame(ohe.fit_transform(extended_bilirubin_df), 
                                     columns=ohe.get_feature_names_out(extended_bilirubin_df.columns.tolist()),
                                     index=extended_bilirubin_df.index)
extended_wbc_df = pd.DataFrame(ohe.fit_transform(extended_wbc_df), 
                               columns=ohe.get_feature_names_out(extended_wbc_df.columns.tolist()),
                               index=extended_wbc_df.index)

WE TRAIN AN ADARANK MODEL USING THE METHOD IMPORTED FROM SKLEARN

Query: Glucose in Blood

In [11]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(extended_glucose_df, extended_glucose_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.9573320761578596
Iterations needed to achieve a score above 0.9: 1


Query: Bilirubin in Plasma

In [12]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(extended_bilirubin_df, extended_bilirubin_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.9655899797903305
Iterations needed to achieve a score above 0.9: 1


Query: White Blood Cells Count

In [13]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(extended_wbc_df, extended_wbc_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.9564129441125335
Iterations needed to achieve a score above 0.9: 1
