# Anatomical site extraction from QUICKUMLS algorithm output dataset: workaround and results

In [1]:
# sample scripts
import pandas as pd
import os
import spacy
from spacy import displacy
import ast
# The cohort CSV is read from <cwd>/data; the three note tables are pickles
# read relative to the current working directory.
data_dir = os.getcwd()
cohort_df = pd.read_csv(data_dir + '/data/chrtDf.csv')
# NOTE(review): read_pickle deserializes arbitrary objects - only load trusted files.
Pnotes_df, RadPath_df, SurgPath_df = (
    pd.read_pickle(pickle_name)
    for pickle_name in ('PnotesNoDuplicate.pkl',
                        'RadPathNoDuplicate.pkl',
                        'SurgPathNoDuplicate.pkl')
)
# Expose the staging column under the shorter name 'levels'.
cohort_df.rename(columns={'tnm_mixed_stage_desc': 'levels'}, inplace=True)
# Discard rows whose patient id is the 'NOPE' placeholder.
cohort_df = cohort_df.loc[cohort_df['pat_id'] != 'NOPE']
pat_count = cohort_df.groupby('pat_id').size().sort_values(ascending=False)
# single record separation: keep only patients that appear exactly once
patsWithOne = pat_count.loc[pat_count == 1]
# reset_index() keeps the original cohort row label in an 'index' column.
chrtDfOne = cohort_df.loc[cohort_df['pat_id'].isin(patsWithOne.index)].reset_index()

# Remove records with no labels or missing labels.
# DataFrame.where() would keep these rows and merely blank every column to NaN;
# a boolean filter actually drops them. Rows whose label is NaN are left in
# place here, exactly as before.
unusable_levels = ['Unknown', 'Missing', 'Not Applicable']
chrtDfOne = chrtDfOne.loc[~chrtDfOne['levels'].isin(unusable_levels)]
# drop=True avoids adding a second, redundant index column ('level_0').
chrtDfOne = chrtDfOne.reset_index(drop=True)


# Check for any missing levels

In [None]:
# Count records per stage label. groupby omits NaN keys by default, so rows
# without a label do not show up in this tally.
chrtDfOne.groupby('levels').size()

# Check for the field that contains QUICKUMLS output from the free texts in Pnotes, RadPath and SurgPath 
## "nlp_text_qu_ner"

In [None]:
# Sample database info from RadPath: prints column names, non-null counts and dtypes
RadPath_df.info()


# Drop any null value in 'nlp_text_qu_ner'

In [2]:
# Rows without QuickUMLS output are discarded from all three note tables.
SurgPath_df = SurgPath_df.dropna(subset=['nlp_text_qu_ner'])
RadPath_df = RadPath_df.dropna(subset=['nlp_text_qu_ner'])
Pnotes_df = Pnotes_df.dropna(subset=['nlp_text_qu_ner'])

# RadPath and Pnotes additionally lose rows whose output is the literal string
# '[]'; their previous row labels are kept in an 'index' column.
# SurgPath is only null-filtered and keeps its original row labels.
RadPath_df = RadPath_df.loc[RadPath_df['nlp_text_qu_ner'].ne('[]')].reset_index()
Pnotes_df = Pnotes_df.loc[Pnotes_df['nlp_text_qu_ner'].ne('[]')].reset_index()



# Test purpose code snippets

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Inspect a single SurgPath report: which token dominates its T033 n-grams?
# The same row supplies both the QuickUMLS text and the patient id, so the
# cohort lookup at the end of the cell refers to the report analysed here.
row_label = 369
text = SurgPath_df.loc[row_label, 'nlp_text_qu_ner']
pat_id = SurgPath_df.loc[row_label, 'pat_id']
d = ast.literal_eval(text)
Testframe = pd.DataFrame.from_dict(d)
corpus = Testframe[Testframe['semtypestring'] == 'T033']['ngram'].tolist()
print(corpus)
if corpus:
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    # vocabulary_ maps token -> column index; ordering the tokens by that index
    # reproduces the column order without get_feature_names(), which newer
    # scikit-learn releases no longer provide.
    feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    print(feature_names)
    print(X.toarray())
    # Token with the largest TF-IDF weight summed over all n-grams.
    print(feature_names[np.sum(X.toarray(), axis=0).argmax()])
else:
    print('Empty corpus')

# Ground truth for the patient the report belongs to.
chrtDfOne[chrtDfOne['pat_id'] == pat_id][['levels', 'site_group', 'vital_status']]

# List the anatomical sites mentioned in the main cohort (chrtDfOne)

In [3]:
# Find any NULL site group and remove it from the cohort.
# The two inspection expressions below are evaluated but not displayed,
# because the cell ends with an assignment.
chrtDfOne.groupby('site_group').size()
chrtDfOne['site_group'].isnull().sum()

# Keep only rows that carry a site group and renumber them from zero.
chrtDfOne = chrtDfOne.loc[chrtDfOne['site_group'].notna()].reset_index(drop=True)


# Collect the distinct site groups into a list for use in the comprehension below

In [4]:
# Distinct site groups, in order of first appearance in the cohort
SiteList = chrtDfOne['site_group'].drop_duplicates().tolist()
SiteList

['LUNG/BRONCHUS-NON SM CELL',
 'BREAST',
 "NON-HODGKIN'S LYMPHOMA",
 'PROSTATE',
 'CORPUS UTERI',
 'BLADDER',
 'PANCREAS',
 'OVARY',
 'LUNG/BRONCHUS-SMALL CELL',
 'STOMACH',
 "HODGKIN'S DISEASE",
 'UTERUS NOS']

# Performance by site group
## - Creating dataframes for each site using dict comprehension
## - Access each dataframe by keys from the dictionary

In [5]:
# define the columns of interest
col2use = ['pat_id', 'site_group', 'histology_desc', 'levels']

# One cohort sub-frame per site group.
_site_frames = {
    site: chrtDfOne.loc[chrtDfOne['site_group'] == site, col2use].reset_index(drop=True)
    for site in SiteList
}
# Every site is reachable under 'chrt_<full site name>'.
dfs_sitegroups = {'chrt_' + str(site): frame for site, frame in _site_frames.items()}
# The original 'chrt_<first 8 characters>' keys are kept as aliases so existing
# lookups keep working. They are ambiguous, though: sites sharing their first
# eight characters (both LUNG/BRONCHUS groups give 'chrt_LUNG/BRO') collapse
# onto one alias, which resolves to whichever of them comes last in SiteList.
dfs_sitegroups.update(
    {'chrt_' + str(site[0:8]): frame for site, frame in _site_frames.items()}
)

# Working with the free text reports 
## FROM SURGPATH: using CountVectorizer and TfidfVectorizer to find the word with the maximum occurrence among the list of words in each report


In [7]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_TFIDF = TfidfVectorizer()
vectorizer_CV = CountVectorizer()


def _dominant_token(vectorizer, corpus):
    """Return the token with the largest column sum after fitting on *corpus*.

    Returns None when the corpus yields no usable token: scikit-learn raises
    ValueError for an empty vocabulary (e.g. only stop words or
    single-character tokens), and such a report is skipped instead of
    aborting the whole run.
    """
    try:
        matrix = vectorizer.fit_transform(corpus)
    except ValueError:
        return None
    # vocabulary_ maps token -> column index; ordering the tokens by that index
    # reproduces the column order without get_feature_names(), which newer
    # scikit-learn releases no longer provide.
    tokens = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    return tokens[np.sum(matrix.toarray(), axis=0).argmax()]


anatomical_columns = ['pat_id', 'anatomical site_CV(T023)', 'anatomical site_TFIDF(T023)']
# Rows are collected in a list and turned into a frame once at the end;
# growing a DataFrame with append() inside the loop copies it on every
# iteration, and the method no longer exists in pandas 2.x.
anatomical_rows = []
for report_pat_id, text in zip(SurgPath_df['pat_id'], SurgPath_df['nlp_text_qu_ner']):
    d = ast.literal_eval(text)
    if not d:
        # An empty parse result has no 'semtypestring' column to filter on.
        continue
    Testframe = pd.DataFrame.from_dict(d)

    # Keep only the n-grams tagged with semantic type T023.
    corpus_T023 = Testframe[Testframe['semtypestring'] == 'T023']['ngram'].tolist()
    if not corpus_T023:
        continue

    max_ngram_T023_CV = _dominant_token(vectorizer_CV, corpus_T023)
    max_ngram_T023_TFIDF = _dominant_token(vectorizer_TFIDF, corpus_T023)
    if max_ngram_T023_CV is None or max_ngram_T023_TFIDF is None:
        continue

    anatomical_rows.append({'pat_id': report_pat_id,
                            'anatomical site_CV(T023)': max_ngram_T023_CV,
                            'anatomical site_TFIDF(T023)': max_ngram_T023_TFIDF})

# Passing columns= keeps the three expected columns even when no row qualified.
df_anatomical_merged = pd.DataFrame(anatomical_rows, columns=anatomical_columns)
df_anatomical_merged.shape

(3388, 3)

# Remove duplicates based on anatomical site_TFIDF(T023)

In [None]:
# Keep the first row for every (patient, top TF-IDF term) pair.
# reset_index() returns a new frame, so its result is assigned; otherwise the
# stored frame would keep the gaps left by the dropped rows.
df_anatomical_merged_noDUP = df_anatomical_merged.drop_duplicates(
    subset=['pat_id', 'anatomical site_TFIDF(T023)'], keep='first'
).reset_index(drop=True)
df_anatomical_merged_noDUP

# Merge anatomical site extraction from SURGpath with main cohort based on patient as the primary key

In [9]:
def _merge_with_site(site_name):
    """Inner-join the SurgPath extraction with the cohort rows of one site group.

    The cohort subset is selected by the full site name and restricted to
    col2use, then joined on 'pat_id'.
    """
    cohort_site = chrtDfOne.loc[chrtDfOne['site_group'] == site_name, col2use].reset_index(drop=True)
    return pd.merge(df_anatomical_merged_noDUP, cohort_site, on='pat_id')


# Each subset is selected by its full site name: the 8-character keys of
# dfs_sitegroups cannot tell the two LUNG/BRONCHUS groups apart (both truncate
# to 'chrt_LUNG/BRO'), which made the two lung frames identical.
df_surg_merged_breast = _merge_with_site('BREAST')
df_surg_merged_lungsnonSM = _merge_with_site('LUNG/BRONCHUS-NON SM CELL')
df_surg_merged_prostate = _merge_with_site('PROSTATE')
df_surg_merged_bladder = _merge_with_site('BLADDER')
df_surg_merged_ovary = _merge_with_site('OVARY')
df_surg_merged_uterusNOS = _merge_with_site('UTERUS NOS')
df_surg_merged_nonHD = _merge_with_site("NON-HODGKIN'S LYMPHOMA")
df_surg_merged_stomach = _merge_with_site('STOMACH')
df_surg_merged_pancreas = _merge_with_site('PANCREAS')
df_surg_merged_HD = _merge_with_site("HODGKIN'S DISEASE")
df_surg_merged_lungsSM = _merge_with_site('LUNG/BRONCHUS-SMALL CELL')
df_surg_merged_corpusUteri = _merge_with_site('CORPUS UTERI')


# Compare the ground truth and the outcome of site locations from the surgpath file: breast

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_breast['site_group'] = df_surg_merged_breast['site_group'].str.lower()
# Placeholder column: created here but never populated in this cell.
df_surg_merged_breast['Levenshtein distance'] = np.nan
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
# NOTE(review): substring containment also matches short tokens inside longer words.
df_surg_merged_breast['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_breast['anatomical site_TFIDF(T023)'],
                           df_surg_merged_breast['site_group'])],
    index=df_surg_merged_breast.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_breast['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: lungs non SM

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_lungsnonSM['site_group'] = df_surg_merged_lungsnonSM['site_group'].str.lower()
# Placeholder column: created here but never populated in this cell.
df_surg_merged_lungsnonSM['Levenshtein distance'] = np.nan
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_lungsnonSM['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_lungsnonSM['anatomical site_TFIDF(T023)'],
                           df_surg_merged_lungsnonSM['site_group'])],
    index=df_surg_merged_lungsnonSM.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_lungsnonSM['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: prostate

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_prostate['site_group'] = df_surg_merged_prostate['site_group'].str.lower()
# Placeholder column: created here but never populated in this cell.
df_surg_merged_prostate['Levenshtein distance'] = np.nan
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_prostate['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_prostate['anatomical site_TFIDF(T023)'],
                           df_surg_merged_prostate['site_group'])],
    index=df_surg_merged_prostate.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_prostate['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: bladder

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_bladder['site_group'] = df_surg_merged_bladder['site_group'].str.lower()
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_bladder['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_bladder['anatomical site_TFIDF(T023)'],
                           df_surg_merged_bladder['site_group'])],
    index=df_surg_merged_bladder.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_bladder['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: HODGKIN'S disease (HD)

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_HD['site_group'] = df_surg_merged_HD['site_group'].str.lower()
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_HD['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_HD['anatomical site_TFIDF(T023)'],
                           df_surg_merged_HD['site_group'])],
    index=df_surg_merged_HD.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_HD['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: CORPUS UTERI

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_corpusUteri['site_group'] = df_surg_merged_corpusUteri['site_group'].str.lower()
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_corpusUteri['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_corpusUteri['anatomical site_TFIDF(T023)'],
                           df_surg_merged_corpusUteri['site_group'])],
    index=df_surg_merged_corpusUteri.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_corpusUteri['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: PANCREAS

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_pancreas['site_group'] = df_surg_merged_pancreas['site_group'].str.lower()
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_pancreas['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_pancreas['anatomical site_TFIDF(T023)'],
                           df_surg_merged_pancreas['site_group'])],
    index=df_surg_merged_pancreas.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_pancreas['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: STOMACH

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_stomach['site_group'] = df_surg_merged_stomach['site_group'].str.lower()
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_stomach['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_stomach['anatomical site_TFIDF(T023)'],
                           df_surg_merged_stomach['site_group'])],
    index=df_surg_merged_stomach.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_stomach['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: OVARY

In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_ovary['site_group'] = df_surg_merged_ovary['site_group'].str.lower()
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_ovary['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_ovary['anatomical site_TFIDF(T023)'],
                           df_surg_merged_ovary['site_group'])],
    index=df_surg_merged_ovary.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_ovary['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: NON-HODGKIN'S LYMPHOMA


In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_nonHD['site_group'] = df_surg_merged_nonHD['site_group'].str.lower()
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_nonHD['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_nonHD['anatomical site_TFIDF(T023)'],
                           df_surg_merged_nonHD['site_group'])],
    index=df_surg_merged_nonHD.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_nonHD['Contains (True/False)'].mean()

# Compare the ground truth and the outcome of site locations from the surgpath file: LUNG/BRONCHUS-SMALL CELL


In [None]:
import nltk  # not referenced by the active code in this cell

# Lower-case the ground-truth site so it is comparable with the extracted term
# (TfidfVectorizer lower-cases tokens by default).
df_surg_merged_lungsSM['site_group'] = df_surg_merged_lungsSM['site_group'].str.lower()
# 1.0 when the extracted TF-IDF term occurs as a substring of the site name, else 0.0.
df_surg_merged_lungsSM['Contains (True/False)'] = pd.Series(
    [float(term in site)
     for term, site in zip(df_surg_merged_lungsSM['anatomical site_TFIDF(T023)'],
                           df_surg_merged_lungsSM['site_group'])],
    index=df_surg_merged_lungsSM.index,
    dtype=float,
)
# Share of rows whose term matches the site; mean() yields NaN instead of
# raising ZeroDivisionError when the merge produced no rows.
df_surg_merged_lungsSM['Contains (True/False)'].mean()