In [1]:
import pandas as pd
import numpy as np

## For multi-LABELS between CoronaWhy (cw) annotations and Kaggle (kg) annotations:
1. see which columns are shared between cw and kg (mostly Comp and Desc)
2. see common rows and look into how much kg can augment cw (pretty well) 
3. if kg does not augment, then drop shared rows (8 only) and add in other pub's kg annotated
4. extract out kaggle labels in rows as columns via 1-hot-encoding
7. add in empty columns to have equal columns between all 3 df's (cw, kg, shared)
8. make sure each df has columns in same order
9. concat all df's
10. should have around 1600 total annotations to create model

In [2]:
cw = pd.read_csv("cwclasses.csv")
list(cw)

['paper_id',
 'paper_txt',
 'sha',
 'in silico',
 'in vitro',
 'in vivo',
 'Systematic review or meta-analysis of RCTs',
 'RCT',
 'Non-randomized controlled trial',
 'Comparative study',
 'Descriptive study',
 'Systematic review or meta-analysis of studies other than RCTs',
 'Other ']

In [3]:
kg = pd.read_csv("kaggleclasses.csv")
list(kg)

['paper_id', 'paper_txt', 'label', 'id']

In [4]:
shared = cw.merge(kg, how='inner', left_on='paper_id', right_on='paper_id')
shared

Unnamed: 0,paper_id,paper_txt_x,sha,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other,paper_txt_y,label,id
0,4bc6ead9368ec84dba7c56b78624f681293e568b,trend traumatic injury child visit emergency d...,4bc6ead9368ec84dba7c56b78624f681293e568b,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0,Characteristics and trends of traumatic injuri...,7,4bc6ead9368ec84dba7c56b78624f681293e568b
1,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2,analysis transmission control mainland base ma...,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,Analysis of Transmission and Control of Tuberc...,10,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2
2,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc,optimal decision model hospital vacant school ...,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0,Optimal Decision Model for Sustainable Hospita...,9,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc
3,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,infectious disease role tnf gene polymorphism ...,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,BMC Infectious Diseases Roles of TNF-α gene po...,8,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d
4,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,objective time become classic public health in...,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,c Ann Arbor Veterans Affairs Hospital Objectiv...,9,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd
5,7ebedf373e4b3c414333bdd7be729cfbcab489b3,high reproduction number mathematical model ma...,7ebedf373e4b3c414333bdd7be729cfbcab489b3,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,High reproduction number of Middle East respir...,10,7ebedf373e4b3c414333bdd7be729cfbcab489b3
6,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,infection rheumatoid arthritis close new type ...,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1,COVID-19 infection and rheumatoid arthritis: F...,4,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc
7,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad,evaluation preparedness student volunteer seni...,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0,Evaluation of preparedness of healthcare stude...,7,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad


There are a total of 8 records shared between cw and kg annotation sets.

In [5]:
#now need to get a better idea of how kg will augment cw by mapping kg classes to cw classes (multiclass to multilabel)
kg.explode('label')
kg['label'].unique()

array([ 5,  9, 10,  7,  2,  0,  4,  8,  1,  6,  3])

In [6]:
dic_map={
         1:'Systematic review or meta-analysis of RCTs',
         2:'RCT',
         3:'Non-randomized controlled trial',
         4:'Comparative study',
         5:'Comparative study',
         6:'Comparative study',
         7:'Descriptive study',
         8:'Comparative study',
         9:'Descriptive study',
         10:'in silico',
         0:'Other'
        }

In [7]:
kg['mapped'] = kg.apply(lambda x: dic_map[x.label], axis = 1) 
kg.head()

Unnamed: 0,paper_id,paper_txt,label,id,mapped
0,001d8d54a7e73e761f779c81661595cc5ae2ca08,Intervention time series analysis of crime rat...,5,001d8d54a7e73e761f779c81661595cc5ae2ca08,Comparative study
1,0043d044273b8eb1585d3a66061e9b4e03edc062,Evaluation of the tuberculosis programme in Ni...,9,0043d044273b8eb1585d3a66061e9b4e03edc062,Descriptive study
2,00911cf4f99a3d5ae5e5b787675646a743574496,CHEER: hierarCHical taxonomic classification f...,10,00911cf4f99a3d5ae5e5b787675646a743574496,in silico
3,00951716e01c8e0cc341770389fc38d1b5455210,"Knowledge of, attitudes toward, and preventive...",7,00951716e01c8e0cc341770389fc38d1b5455210,Descriptive study
4,012debf5a240a496518af146ddfc16c958339c2b,Preparedness and response against diseases wit...,9,012debf5a240a496518af146ddfc16c958339c2b,Descriptive study


In [8]:
shared2 = kg.merge(cw, how='inner', left_on='paper_id', right_on='paper_id')
shared2

Unnamed: 0,paper_id,paper_txt_x,label,id,mapped,paper_txt_y,sha,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other
0,4bc6ead9368ec84dba7c56b78624f681293e568b,Characteristics and trends of traumatic injuri...,7,4bc6ead9368ec84dba7c56b78624f681293e568b,Descriptive study,trend traumatic injury child visit emergency d...,4bc6ead9368ec84dba7c56b78624f681293e568b,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
1,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2,Analysis of Transmission and Control of Tuberc...,10,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2,in silico,analysis transmission control mainland base ma...,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
2,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc,Optimal Decision Model for Sustainable Hospita...,9,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc,Descriptive study,optimal decision model hospital vacant school ...,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
3,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,BMC Infectious Diseases Roles of TNF-α gene po...,8,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,Comparative study,infectious disease role tnf gene polymorphism ...,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
4,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,c Ann Arbor Veterans Affairs Hospital Objectiv...,9,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,Descriptive study,objective time become classic public health in...,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
5,7ebedf373e4b3c414333bdd7be729cfbcab489b3,High reproduction number of Middle East respir...,10,7ebedf373e4b3c414333bdd7be729cfbcab489b3,in silico,high reproduction number mathematical model ma...,7ebedf373e4b3c414333bdd7be729cfbcab489b3,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
6,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,COVID-19 infection and rheumatoid arthritis: F...,4,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,Comparative study,infection rheumatoid arthritis close new type ...,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1
7,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad,Evaluation of preparedness of healthcare stude...,7,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad,Descriptive study,evaluation preparedness student volunteer seni...,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0


In [9]:
list(shared2)

['paper_id',
 'paper_txt_x',
 'label',
 'id',
 'mapped',
 'paper_txt_y',
 'sha',
 'in silico',
 'in vitro',
 'in vivo',
 'Systematic review or meta-analysis of RCTs',
 'RCT',
 'Non-randomized controlled trial',
 'Comparative study',
 'Descriptive study',
 'Systematic review or meta-analysis of studies other than RCTs',
 'Other ']

In [10]:
shared2.drop(columns=['paper_id',
                      'paper_txt_x',
                      'label',
                      'id',
                      'paper_txt_y',
                      'sha'])

Unnamed: 0,mapped,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other
0,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
1,in silico,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
2,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
3,Comparative study,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
4,Descriptive study,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
5,in silico,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
6,Comparative study,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1
7,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0


Based on this small sample, looks like kg annotations would augment cw annotations since 4 pub's (50%) would have more labels than presented if just left with cw annotations. This would greatly enhance annotations dataset.

(Before proceeding to one-hot-encoding for non-shared kg records, need to take out shared paper id's from cw and kg since shared will be added to union/concat between cw and kg as final step.)

In [11]:
shared2 = shared2.drop(columns = [
                      'paper_txt_x',
                      'label',
                      'id',
                      'paper_txt_y',
                      'sha'])
shared2

Unnamed: 0,paper_id,mapped,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other
0,4bc6ead9368ec84dba7c56b78624f681293e568b,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
1,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2,in silico,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
2,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
3,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,Comparative study,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
4,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,Descriptive study,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
5,7ebedf373e4b3c414333bdd7be729cfbcab489b3,in silico,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
6,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,Comparative study,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1
7,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0


In [12]:
columns = list(shared2.iloc[:,2:])
columns

['in silico',
 'in vitro',
 'in vivo',
 'Systematic review or meta-analysis of RCTs',
 'RCT',
 'Non-randomized controlled trial',
 'Comparative study',
 'Descriptive study',
 'Systematic review or meta-analysis of studies other than RCTs',
 'Other ']

In [13]:
count = 0
for i in shared2['mapped']:
    for c in columns:
        if i == c and shared2[c][count] == 1:
            pass
        elif i == c and shared2[c][count] != 1:
            shared2.at[count,c] = 1
    count +=1

In [14]:
shared2

Unnamed: 0,paper_id,mapped,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other
0,4bc6ead9368ec84dba7c56b78624f681293e568b,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
1,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2,in silico,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
2,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
3,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,Comparative study,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0
4,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,Descriptive study,1.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
5,7ebedf373e4b3c414333bdd7be729cfbcab489b3,in silico,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
6,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,Comparative study,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,1
7,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad,Descriptive study,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0


In [15]:
#Remove paper_id listed here 

In [16]:
unwanted = shared2['paper_id']
unwanted

0    4bc6ead9368ec84dba7c56b78624f681293e568b
1    5a7988cd92e05a8bca504b5f1a467e2c8d3976d2
2    5b15c62ee761d6eefeb7d2ef023f804bfdc41edc
3    5e9ccecdd40825f5c30da3a900fc5cf1b063b47d
4    766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd
5    7ebedf373e4b3c414333bdd7be729cfbcab489b3
6    d7d62e768f4a8eb40ba522eca28e173e5d53cbbc
7    eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad
Name: paper_id, dtype: object

In [17]:
cw.shape

(939, 13)

In [18]:
cw_final = cw[~cw.paper_id.isin(unwanted)]

In [19]:
cw_final.shape

(931, 13)

In [20]:
kg.shape

(739, 5)

In [21]:
kg_final = kg[~kg.paper_id.isin(unwanted)]

In [22]:
kg_final.shape

(731, 5)

In [23]:
#perform one-hot-encoding on kg dataset

one_hot = pd.get_dummies(kg_final['mapped'])
#drop mapped column as it is encoded now
kg_final = kg_final.drop('mapped',axis = 1)
# Join the encoded mapped to df3
kg_final = kg_final.join(one_hot)
kg_final

Unnamed: 0,paper_id,paper_txt,label,id,Comparative study,Descriptive study,Non-randomized controlled trial,Other,RCT,Systematic review or meta-analysis of RCTs,in silico
0,001d8d54a7e73e761f779c81661595cc5ae2ca08,Intervention time series analysis of crime rat...,5,001d8d54a7e73e761f779c81661595cc5ae2ca08,1,0,0,0,0,0,0
1,0043d044273b8eb1585d3a66061e9b4e03edc062,Evaluation of the tuberculosis programme in Ni...,9,0043d044273b8eb1585d3a66061e9b4e03edc062,0,1,0,0,0,0,0
2,00911cf4f99a3d5ae5e5b787675646a743574496,CHEER: hierarCHical taxonomic classification f...,10,00911cf4f99a3d5ae5e5b787675646a743574496,0,0,0,0,0,0,1
3,00951716e01c8e0cc341770389fc38d1b5455210,"Knowledge of, attitudes toward, and preventive...",7,00951716e01c8e0cc341770389fc38d1b5455210,0,1,0,0,0,0,0
4,012debf5a240a496518af146ddfc16c958339c2b,Preparedness and response against diseases wit...,9,012debf5a240a496518af146ddfc16c958339c2b,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
734,ff08601026bfcad65343aeac47487be948da1985,Journal Pre-proof Physical exercise as therapy...,0,ff08601026bfcad65343aeac47487be948da1985,0,0,0,1,0,0,0
735,ff365ebbc0fc55476886b0abd129e227c1f8a527,Article focus Hip We report a systematic revie...,1,ff365ebbc0fc55476886b0abd129e227c1f8a527,0,0,0,0,0,1,0
736,ff4fffe02138b0b232334d997965d11fd936916b,BMC Medicine RNA viruses in community-acquired...,7,ff4fffe02138b0b232334d997965d11fd936916b,0,1,0,0,0,0,0
737,ff849fc6eb55db14fe29a58c4b982f938eeb2e32,"Arthroscopy, Arthroscopy Techniques, and Arthr...",0,ff849fc6eb55db14fe29a58c4b982f938eeb2e32,0,0,0,1,0,0,0


In [28]:
#formatting cw_final for concat (Pandas equivalent to SQL Union (duplicates could have been removed here using duplicates = True param but done at an earlier step))
cw_final = cw_final.drop(columns=['paper_txt', 'sha'])

In [29]:
cw_final = cw_final.rename(columns={'Other ': 'Other'})
cw_final

Unnamed: 0,paper_id,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other
0,002faa16056227423f9d108d01f0dae31491510c,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
1,0087db1b3fdc81bb42d1fc625fc202c881dd9b9f,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
2,00b88130d2a7c8489e209742494303b6731d7544,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0
3,0116dea9b6d4c58748d1a38b69b4fe6e06868aa4,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
4,012d67a440ec30896e8c89c67a736c2638512be1,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
934,fed900e545d163ace2bd198f44d0e87375310739,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
935,fee63d10e8db56b72c9385149a4e57afa8500981,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0
936,ff9d9713206da30022af4f9095058368eeb1f3f8,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1
937,ffd0af549110e997ac78508b499668e31e3b5b90,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0


In [24]:
#formatting kg_final for concat 

#drop unnecessary columns
kg_final = kg_final.drop(columns=['paper_txt', 'label', 'id'])

In [31]:
#add blank columns to equal 10 annotation columns
c = {'in vitro': 0, 'in vivo': 0, 'Systematic review or meta-analysis of studies other than RCTs': 0}
kg_final = kg_final.assign(**c)
#reorder columns to be the same as cw_final and eventually shared2_final
cols = list(cw_final)
kg_final = kg_final[cols]
kg_final

Unnamed: 0,paper_id,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other
0,001d8d54a7e73e761f779c81661595cc5ae2ca08,0,0,0,0,0,0,1,0,0,0
1,0043d044273b8eb1585d3a66061e9b4e03edc062,0,0,0,0,0,0,0,1,0,0
2,00911cf4f99a3d5ae5e5b787675646a743574496,1,0,0,0,0,0,0,0,0,0
3,00951716e01c8e0cc341770389fc38d1b5455210,0,0,0,0,0,0,0,1,0,0
4,012debf5a240a496518af146ddfc16c958339c2b,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
734,ff08601026bfcad65343aeac47487be948da1985,0,0,0,0,0,0,0,0,0,1
735,ff365ebbc0fc55476886b0abd129e227c1f8a527,0,0,0,1,0,0,0,0,0,0
736,ff4fffe02138b0b232334d997965d11fd936916b,0,0,0,0,0,0,0,1,0,0
737,ff849fc6eb55db14fe29a58c4b982f938eeb2e32,0,0,0,0,0,0,0,0,0,1


In [32]:
#ensuring all column names in kg_final are equivalent to cw_final
list(kg_final)

['paper_id',
 'in silico',
 'in vitro',
 'in vivo',
 'Systematic review or meta-analysis of RCTs',
 'RCT',
 'Non-randomized controlled trial',
 'Comparative study',
 'Descriptive study',
 'Systematic review or meta-analysis of studies other than RCTs',
 'Other']

In [33]:
#formatting shared2 for concat 
shared2_final = shared2.drop(columns=['mapped'])
shared2_final = shared2_final.rename(columns={'Other ': 'Other'})
shared2_final

Unnamed: 0,paper_id,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other
0,4bc6ead9368ec84dba7c56b78624f681293e568b,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
1,5a7988cd92e05a8bca504b5f1a467e2c8d3976d2,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
2,5b15c62ee761d6eefeb7d2ef023f804bfdc41edc,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
3,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0
4,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,1.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
5,7ebedf373e4b3c414333bdd7be729cfbcab489b3,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
6,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,1
7,eac85c85c3a2ac1d3fe53e1e3b3fee98f52527ad,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0


In [34]:
#concat cw_final, kg_final and shared2_final
pdList = [kg_final, cw_final, shared2_final]  # List of your dataframes
annotations_merged = pd.concat(pdList)
annotations_merged

Unnamed: 0,paper_id,in silico,in vitro,in vivo,Systematic review or meta-analysis of RCTs,RCT,Non-randomized controlled trial,Comparative study,Descriptive study,Systematic review or meta-analysis of studies other than RCTs,Other
0,001d8d54a7e73e761f779c81661595cc5ae2ca08,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0
1,0043d044273b8eb1585d3a66061e9b4e03edc062,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
2,00911cf4f99a3d5ae5e5b787675646a743574496,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
3,00951716e01c8e0cc341770389fc38d1b5455210,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
4,012debf5a240a496518af146ddfc16c958339c2b,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
3,5e9ccecdd40825f5c30da3a900fc5cf1b063b47d,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0
4,766e0afdd0d3c2a1f8d9a6c8b318ca401db0a8bd,1.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0
5,7ebedf373e4b3c414333bdd7be729cfbcab489b3,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
6,d7d62e768f4a8eb40ba522eca28e173e5d53cbbc,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,1


In [35]:
annotations_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1670 entries, 0 to 7
Data columns (total 11 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   paper_id                                                       1670 non-null   object 
 1   in silico                                                      1670 non-null   float64
 2   in vitro                                                       1670 non-null   float64
 3   in vivo                                                        1670 non-null   float64
 4   Systematic review or meta-analysis of RCTs                     1670 non-null   float64
 5   RCT                                                            1670 non-null   float64
 6   Non-randomized controlled trial                                1670 non-null   float64
 7   Comparative study                                              

1670 total annotations now!! (Before, it was 938 with cw annotations only).

In [207]:
#What you've been waiting for:
#annotations_merged.to_csv("annotations_merged.csv", index=False)