In [22]:
import tqdm
import pandas as pd
from glob import glob
from utils.trec_wrapper import parse_run, calculate_scaling_factor
from trectools import TrecQrel
import os

In [23]:
# Ablation run files to evaluate, and the qrels file handed to parse_run.
# glob() yields matches in arbitrary order, so sort them to keep the layout of
# the resulting frames reproducible between executions.
RUN_GLOB = "../outputs/ablations.txt_*"
files = sorted(glob(RUN_GLOB))
qrel = "../assets/qrels-clinical_trials.txt"

if not files:
    # Fail fast: otherwise an empty run table would be stored under the key
    # None below and the problem would only surface further down the notebook.
    raise FileNotFoundError(f"no run files match {RUN_GLOB}")

# Metric name -> list of depths to evaluate that metric at.
metrics = {
    'recall': [1000]
}

# Metric label reported by parse_run -> {run name -> parsed_run.run}.
all_runs = {}

for met, depths in metrics.items():
    for depth in depths:
        runs = {}
        metric = None
        for fp in files:
            parsed_run = parse_run(fp, qrel, metric=met, depth=depth)

            # Run name is the file name with '.txt' removed,
            # e.g. "ablations.txt_7" -> "ablations_7".
            name = os.path.basename(fp).replace('.txt', '')
            runs[name] = parsed_run.run
            # All files are parsed with the same metric/depth; the label from
            # the last parsed file is the one used as the key below.
            metric = parsed_run.metric

        all_runs[metric] = runs

# Rows of `df` are indexed by (metric label, run name); melting its transpose
# gives one row per (index entry, metric label, run name) with the original
# index exposed as the `topic_num` column.
df = pd.concat({k: pd.DataFrame(v).T for k, v in all_runs.items()}, axis=0)
df_melt = pd.melt(df.T, ignore_index=False).reset_index().rename(columns={"index": "topic_num"})
df_melt

Unnamed: 0,topic_num,variable_0,variable_1,value
0,20141,recall@1000,ablations_0,
1,20142,recall@1000,ablations_0,
2,20143,recall@1000,ablations_0,
3,20144,recall@1000,ablations_0,
4,20145,recall@1000,ablations_0,
...,...,...,...,...
5954,201526,recall@1000,ablations_99,
5955,201527,recall@1000,ablations_99,
5956,201528,recall@1000,ablations_99,
5957,201529,recall@1000,ablations_99,


In [24]:
# Field names in ablation order: the run named "ablations_<i>" is relabelled
# with mappings[i] further down. The literals are written without the stray
# runs of spaces that a line-wrapped paste had left inside many of the names
# (e.g. 'BriefTitl    e'), so the list is correct as soon as this cell runs.
mappings = [
    'HasExpandedAccess',
    'BriefSummary.Textblock',
    'CompletionDate.Type',
    'OversightInfo.Text',
    'OverallContactBackup.PhoneExt',
    'RemovedCountries.Text',
    'SecondaryOutcome',
    'Sponsors.LeadSponsor.Text',
    'BriefTitle',
    'IDInfo.NctID',
    'IDInfo.SecondaryID',
    'OverallContactBackup.Phone',
    'Eligibility.StudyPop.Textblock',
    'DetailedDescription.Textblock',
    'Eligibility.MinimumAge',
    'Sponsors.Collaborator',
    'Reference',
    'Eligibility.Criteria.Textblock',
    'XMLName.Space',
    'Rank',
    'OverallStatus',
    'InterventionBrowse.Text',
    'Eligibility.Text',
    'Intervention',
    'BiospecDescr.Textblock',
    'ResponsibleParty.NameTitle',
    'NumberOfArms',
    'ResponsibleParty.ResponsiblePartyType',
    'IsSection801',
    'Acronym',
    'Eligibility.MaximumAge',
    'DetailedDescription.Text',
    'StudyDesign',
    'OtherOutcome',
    'VerificationDate',
    'ConditionBrowse.MeshTerm',
    'Enrollment.Text',
    'IDInfo.Text',
    'ConditionBrowse.Text',
    'FirstreceivedDate',
    'NumberOfGroups',
    'OversightInfo.HasDmc',
    'PrimaryCompletionDate.Text',
    'ResultsReference',
    'Eligibility.StudyPop.Text',
    'IsFdaRegulated',
    'WhyStopped',
    'ArmGroup',
    'OverallContact.LastName',
    'Phase',
    'RemovedCountries.Country',
    'InterventionBrowse.MeshTerm',
    'Eligibility.HealthyVolunteers',
    'Location',
    'OfficialTitle',
    'OverallContact.Email',
    'RequiredHeader.Text',
    'RequiredHeader.URL',
    'LocationCountries.Country',
    'OverallContact.PhoneExt',
    'Condition',
    'PrimaryOutcome',
    'LocationCountries.Text',
    'BiospecDescr.Text',
    'IDInfo.OrgStudyID',
    'Link',
    'OverallContact.Phone',
    'Source',
    'ResponsibleParty.InvestigatorAffiliation',
    'StudyType',
    'FirstreceivedResultsDate',
    'Enrollment.Type',
    'Eligibility.Gender',
    'OverallContactBackup.LastName',
    'Keyword',
    'BiospecRetention',
    'CompletionDate.Text',
    'OverallContact.Text',
    'RequiredHeader.DownloadDate',
    'Sponsors.Text',
    'Text',
    'Eligibility.SamplingMethod',
    'LastchangedDate',
    'ResponsibleParty.InvestigatorFullName',
    'StartDate',
    'RequiredHeader.LinkText',
    'OverallOfficial',
    'Sponsors.LeadSponsor.AgencyClass',
    'OverallContactBackup.Text',
    'Eligibility.Criteria.Text',
    'XMLName.Local',
    'OversightInfo.Authority',
    'PrimaryCompletionDate.Type',
    'ResponsibleParty.Organization',
    'IDInfo.NctAlias',
    'ResponsibleParty.Text',
    'TargetDuration',
    'Sponsors.LeadSponsor.Agency',
    'BriefSummary.Text',
    'OverallContactBackup.Email',
    'ResponsibleParty.InvestigatorTitle',
]


In [25]:
# Drop every space character from each field name.
mappings = [field_name.replace(' ', '') for field_name in mappings]

In [26]:
# Relabel each run with the field name it stands for: the integer after the
# last underscore of the run name (e.g. "ablations_7" -> 7) indexes `mappings`.
# A single pass over the column replaces the row-by-row iterrows()/.loc writes;
# a non-numeric suffix still raises ValueError and an out-of-range index still
# raises IndexError.
field_names = [
    mappings[int(run_name.split('_')[-1])]
    for run_name in df_melt['variable_1']
]

# Missing values are replaced by 0 so that they can be filtered out with a
# simple `value > 0` comparison.
df_melt = df_melt.assign(variable_1=field_names).fillna(0)
df_melt

Unnamed: 0,topic_num,variable_0,variable_1,value
0,20141,recall@1000,HasExpandedAccess,0.0
1,20142,recall@1000,HasExpandedAccess,0.0
2,20143,recall@1000,HasExpandedAccess,0.0
3,20144,recall@1000,HasExpandedAccess,0.0
4,20145,recall@1000,HasExpandedAccess,0.0
...,...,...,...,...
5954,201526,recall@1000,OverallContactBackup.Email,0.0
5955,201527,recall@1000,OverallContactBackup.Email,0.0
5956,201528,recall@1000,OverallContactBackup.Email,0.0
5957,201529,recall@1000,OverallContactBackup.Email,0.0


In [27]:
# All rows ordered by value, highest first.
df = df_melt.sort_values(by='value', ascending=False).fillna(0)

# Mean over the rows with a strictly positive value, per field name.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer drops
# the string column `variable_0` silently and raises TypeError instead.
# NOTE(review): `topic_num` is averaged as well because it is numeric; that
# column of the result does not look meaningful -- confirm before relying on it.
df_flat = (
    df[df['value'] > 0.0]
    .groupby(['variable_1'])
    .mean(numeric_only=True)
    .sort_values(by='value', ascending=False)
)
df_flat

Unnamed: 0_level_0,topic_num,value
variable_1,Unnamed: 1_level_1,Unnamed: 2_level_1
Eligibility.Gender,201526.0,0.25
LocationCountries.Country,128934.8,0.238196
DetailedDescription.Textblock,150224.5,0.228902
BriefSummary.Textblock,139554.585366,0.213836
ConditionBrowse.MeshTerm,153750.157895,0.201614
Eligibility.Criteria.Textblock,148984.157895,0.18338
InterventionBrowse.MeshTerm,97866.714286,0.182976
StudyType,201424.0,0.166667
IsFdaRegulated,201514.0,0.166667
HasExpandedAccess,201514.0,0.166667


In [30]:
# Field names whose mean value exceeds 0.09.
df_flat.loc[df_flat['value'].gt(0.09)].index

Index(['Eligibility.Gender', 'LocationCountries.Country',
       'DetailedDescription.Textblock', 'BriefSummary.Textblock',
       'ConditionBrowse.MeshTerm', 'Eligibility.Criteria.Textblock',
       'InterventionBrowse.MeshTerm', 'StudyType', 'IsFdaRegulated',
       'HasExpandedAccess', 'RequiredHeader.LinkText', 'BiospecRetention',
       'OfficialTitle', 'Eligibility.SamplingMethod',
       'Eligibility.StudyPop.Textblock', 'Condition', 'Eligibility.MinimumAge',
       'Keyword', 'Eligibility.MaximumAge', 'BriefTitle'],
      dtype='object', name='variable_1')