# IREI: Search Engines & Real World Data
### Víctor Morcuende Castell and Guillermo Nájera Lavid
#### Course 2022-2023

In [624]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import ndcg_score


In [625]:
# Load data
glucose_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "glucose in blood")
bilirubin_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "bilirubin in plasma")
wbc_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "White blood cells count")
glucose_target = glucose_df.pop('relevant')
bilirubin_target = bilirubin_df.pop('relevant')
wbc_target = wbc_df.pop('relevant')

In [626]:
# We eliminate irrelevant information
glucose_df.drop('comments', axis=1)
bilirubin_df.drop('comments', axis=1)
wbc_df.drop('comments', axis=1)

Unnamed: 0,loinc_num,long_common_name,component,system,property
0,33870-7,Bilirubin.total [Presence] in Unspecified spec...,Bilirubin,XXX,PrThr
1,29265-6,Calcium [Moles/volume] corrected for albumin i...,Calcium^^corrected for albumin,Ser/Plas,SCnc
2,14423-8,Bilirubin.total [Mass/volume] in Synovial fluid,Bilirubin,Synv fld,MCnc
3,23658-8,Other Antibiotic [Susceptibility],Antibiotic XXX,Isolate,Susc
4,19000-9,Vancomycin [Susceptibility],Vancomycin,Isolate,Susc
...,...,...,...,...,...
62,26484-6,Monocytes [#/volume] in Blood,Monocytes,Bld,NCnc
63,1250-0,Major crossmatch [interpretation],Major crossmatch,Ser/Plas,Imp
64,18864-9,Ampicillin [Susceptibility],Ampicillin,Isolate,Susc
65,1742-6,Alanine aminotransferase [Enzymatic activity/v...,Alanine aminotransferase,Ser/Plas,CCnc


In [627]:
# Using OneHotEncoder
ohe = preprocessing.OneHotEncoder(sparse=False)
glucose_df = pd.DataFrame(ohe.fit_transform(glucose_df), columns=ohe.get_feature_names_out(glucose_df.columns.tolist()),index=glucose_df.index)
bilirubin_df = pd.DataFrame(ohe.fit_transform(bilirubin_df), columns=ohe.get_feature_names_out(bilirubin_df.columns.tolist()),index=bilirubin_df.index)
wbc_df = pd.DataFrame(ohe.fit_transform(wbc_df), columns=ohe.get_feature_names_out(wbc_df.columns.tolist()),index=wbc_df.index)

WE TRAIN AN ADARANK MODEL USING THE METHOD IMPORTED FROM SKLEARN

GLUCOSE IN BLOOD

In [628]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(glucose_df, glucose_target, test_size=0.2)
    base_estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(base_estimator=base_estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break


Model NDCG score: 0.923928388719314
Iterations needed to achieve a score above 0.9: 1


BILIRUBIN IN PLASMA

In [629]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(bilirubin_df, bilirubin_target, test_size=0.2)
    base_estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(base_estimator=base_estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.9021529740555758
Iterations needed to achieve a score above 0.9: 6


WHITE BLOOD CELLS COUNT

In [630]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(wbc_df, wbc_target, test_size=0.2)
    base_estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(base_estimator=base_estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 1.0
Iterations needed to achieve a score above 0.9: 67
