In [1]:
import pandas as pd

In [2]:
import numpy as np

# Merging the required data provided in usecase_3, withdrwals.txt and eligibilities.txt


<span style="color: blue;"> We integrated relevant data points extracted from 'usecase.csv', 'eligibilities.txt', and 'withdrawls.txt' files.
Firstly we converted the 'eligibilities.txt' and 'withdrawls.txt' files into either CSV or Excel format for compatibility.</span>

In [14]:
df= pd.read_csv('usecase_3_.csv')

In [None]:
df_2=pd.read_excel('withdrawls.xlsx')

In [None]:
df_4=pd.read_excel('elgb.xlsx')

In [None]:
df_renamed = df_2.rename(columns={'nct_id': 'NCT Number'})

In [None]:
df_renamed.drop(['ctgov_group_code','period'],axis=1,inplace=True)

In [None]:
df_3=df_renamed.drop(['drop_withdraw_comment','reason_comment','count_units'],axis=1)

In [None]:
df_3.drop(['id','result_group_id','reason'],axis=1,inplace=True)

In [None]:
df_grouped = df_renamed.groupby('NCT Number')['reason'].apply(lambda x: ', '.join(x)).reset_index()

df_merged = pd.merge(df_grouped, df_renamed.groupby('NCT Number')['count'].sum().reset_index(), on='NCT Number')

In [None]:
merged_dff = pd.merge(df, df_merged, on='NCT Number', how='left')

In [None]:
df_4 = df_4.rename(columns={'nct_id': 'NCT Number'})

In [None]:
df_4.drop(['id','sampling_method','gender','minimum_age','maximum_age','population','gender_description','gender_based'],axis=1,inplace=True)

In [None]:
final_df = pd.merge(merged_dff, df_4 , on='NCT Number', how='left')

In [None]:
df_lda=final_df.copy()

## We pruned the dataset to include only the essential features

In [None]:
df_lda.drop(['Brief Summary','Acronym','Study URL', 'Unnamed: 0','Unnamed: 0.1'],axis =1,inplace=True )

In [None]:
df_lda.drop(['Primary Completion Date'],axis =1,inplace=True )

In [None]:
df_lda.drop(['Other IDs','First Posted','reason','criteria','Start Date','Completion Date',
       'Results First Posted', 'Last Update Posted', 'Locations',
       'Study Documents','Funder Type','Other Outcome Measures', 'Sponsor',
       'Collaborators','Age','Primary Outcome Measures','Secondary Outcome Measures','Phases'],axis =1,inplace=True )

In [None]:
df_lda.info()

In [None]:
def standardize_date_format(df, date_column):
    """
    Converts date column with mixed formats (mm/dd/yy, yyyy-mm, yyyy-mm-dd) 
    to a single consistent format (yyyy-mm-dd).

    Args:
        df: pandas DataFrame containing the date column.
        date_column: Name of the date column in the DataFrame.

    Returns:
        pandas DataFrame with the date column in the 'yyyy-mm-dd' format.
    """

    df[date_column] = pd.to_datetime(df[date_column], errors='coerce') 
    df[date_column] = df[date_column].dt.strftime('%Y-%m-%d') 

    return df

In [None]:
df_lda = standardize_date_format(df_lda, 'Start Date')

In [None]:
df_lda = standardize_date_format(df_lda, 'Completion Date')

In [None]:
df_lda['Start Date'] = pd.to_datetime(df_lda['Start Date'])
df_lda['Completion Date'] = pd.to_datetime(df_lda['Completion Date'])

df_lda['Duration_Days'] = (df_lda['Completion Date'] - df_lda['Start Date']).dt.days

In [None]:
df_lda.drop(['Start Date','Completion Date'],axis=1,inplace=True)

In [None]:
df=df_lda.copy()

# Minimizing information loss

<span style="color: blue;">To minimize information loss, three of the four selected textual columns (Condition, Interventions, Study Design) were transformed into a set of categorical features. Subsequently, the 'Study Title' column was filtered using these newly created categorical features to effectively extract the objectives of the clinical trials. Study title feature was then preprocessed for word embedding</span>

# For Conditions column

In [None]:
df['Conditions'] = df['Conditions'].fillna('')

In [None]:

df['Conditions'] = df['Conditions'].str.replace('|', ',')

# Split and create a list of new column names
new_column_names = [f'condition_{i+1}' for i in range(df['Conditions'].str.split(',').str.len().max())]

# Split 'Conditions' and create a list of lists
conditions_list = df['Conditions'].str.split(',').tolist()

# Create a new DataFrame with the split values
new_df = pd.DataFrame(conditions_list, columns=new_column_names)

# Concatenate the original DataFrame with the new one
df = pd.concat([df, new_df], axis=1)

# Drop the original 'Conditions' column
df = df.drop('Conditions', axis=1)

In [None]:
df_lda=df.copy()

# Reduce the numbers of conditions column formed according to requirement

In [61]:
start_col = 35
end_col = 175

# Drop columns using integer-based indexing
df_lda = df_lda.drop(df_lda.columns[start_col-1:end_col], axis=1) 

In [66]:
df_lda

Unnamed: 0,NCT Number,Study Title,Study Results,Interventions,Sex,Enrollment,Study Type,Study Design,count,healthy_volunteers,adult,child,older_adult,condition_1,condition_2,condition_3,condition_4,condition_5,condition_6
0,NCT03162666,Patient Outcomes Using an Expandable Spacer,NO,DEVICE: ALTERA,ALL,0.0,OBSERVATIONAL,Observational Model: |Time Perspective: p,,t,t,f,t,Degenerative Disc Disease,,,,,
1,NCT04312048,the Effect of Isosorbide Mononitrate in Reduci...,NO,DRUG: Isosorbide mononitrate|DRUG: placebo,FEMALE,110.0,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,,f,t,t,t,IUD Insertion Pain,,,,,
2,NCT03144778,Durvalumab With or Without Tremelimumab in Tre...,NO,BIOLOGICAL: Durvalumab|BIOLOGICAL: Tremelimumab,ALL,39.0,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,,f,t,f,t,Stage II Oropharyngeal Squamous Cell Carcinoma...,Stage III Oropharyngeal Squamous Cell Carcinom...,Stage IVA Oropharyngeal Squamous Cell Carcinom...,,,
3,NCT01592721,Radiation and Cetuximab Plus Intratumoral EGFR...,YES,BIOLOGICAL: EGFR Antisense DNA,ALL,6.0,INTERVENTIONAL,Allocation: NA|Intervention Model: SINGLE_GROU...,1.0,f,t,f,t,Squamous Cell Carcinoma,Head and Neck Cancer,,,,
4,NCT04253613,Laser Biostimulation in Periodontal Treatment,NO,,MALE,30.0,OBSERVATIONAL,Observational Model: |Time Perspective: p,,t,t,f,t,Periodontal Inflammation,Periodontal Diseases,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64846,NCT02420405,Mutation Detection of EBUS-TBNA Specimens Usin...,NO,PROCEDURE: Routine gene testing|PROCEDURE: Nex...,ALL,78,INTERVENTIONAL,Allocation: NON_RANDOMIZED|Intervention Model:...,,f,t,f,t,Lung Cancer,,,,,
64847,NCT01986998,Study to Compare the Clinical and Radiological...,NO,DRUG: Methylprednisolone 1250 mg/24h x3 days|D...,ALL,49,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,,f,t,f,f,Multiple Sclerosis,,,,,
64848,NCT02631538,Safety and Efficacy Study of Subcutaneous Beli...,YES,DRUG: Belimumab|DRUG: Rituximab|DRUG: Placebo ...,ALL,86,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,26.0,f,t,f,t,Sjogren's Syndrome,,,,,
64849,NCT05301023,Individualized Antibiotic Therapy in Children ...,NO,OTHER: Individualized antibiotic therapy|OTHER...,ALL,408,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,,f,f,t,f,Febrile Urinary Tract Infection,,,,,


In [65]:
start_col = 20
end_col = 46

# Drop columns using integer-based indexing
df_lda = df_lda.drop(df_lda.columns[start_col-1:end_col], axis=1) 

In [67]:
df=df_lda.copy()

# For Interventions column

In [68]:
df['Interventions'] = df['Interventions'].astype(str) 

In [69]:
def extract_device_procedure(text):
    DEVICE=[]
    DRUG=[]
    PROCEDURE=[]
    BIOLOGICAL=[]
    OTHER=[]
    RADIATION=[]
    device = None
    procedure = None
    drug = None
    biological = None
    other = None
    radiation = None
    if isinstance(text, str):
        for item in text.split('|'):
            item = item.strip()
            if item.startswith('DEVICE:'):
                device = item[len('DEVICE:'):]
                DEVICE.append(device)
            elif item.startswith('PROCEDURE:'):
                procedure = item[len('PROCEDURE:'):]
                PROCEDURE.append(procedure)
            elif item.startswith('BIOLOGICAL:'):
                biological = item[len('BIOLOGICAL:'):]
                BIOLOGICAL.append(biological)
            elif item.startswith('OTHER:'):
                other = item[len('OTHER:'):]
                OTHER.append(other)
            elif item.startswith('DRUG:'):
                drug = item[len('DRUG:'):]
                DRUG.append(drug)
            elif item.startswith('RADIATION:'):
                radiation = item[len('RADIATION:'):]
                RADIATION.append(radiation)
    return {'DEVICE': DEVICE, 'PROCEDURE': PROCEDURE,'RADIATION':RADIATION,'DRUG':DRUG,'OTHER':OTHER,'BIOLOGICAL':BIOLOGICAL}

In [70]:
df[['DEVICE', 'PROCEDURE','RADIATION','DRUG','BIOLOGICAL','OTHER']] = df['Interventions'].apply(extract_device_procedure).apply(pd.Series)

In [71]:
df_lda=df.copy()

# For Study Design column

In [72]:
def extract_study_design_components(text):
    """
    Extracts components from the study design string.

    Args:
        text: The input string describing the study design.

    Returns:
        A dictionary containing extracted components.
    """
    components = {}
    if isinstance(text, str):
        parts = text.split('|')
        for part in parts:
            if part.startswith('Allocation:'):
                components['Allocation'] = part.split(':')[1].strip()
            elif part.startswith('Intervention Model:'):
                components['Intervention Model'] = part.split(':')[1].strip()
            elif part.startswith('Masking:'):
                components['Masking'] = part.split(':')[1].strip()
            elif part.startswith('Primary Purpose:'):
                components['Primary Purpose'] = part.split(':')[1].strip()
            elif part.startswith('Observational Model:'):
                components['Observational Model'] = part.split(':')[0].strip()
    return components

In [73]:
df_lda[['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose', 'Observational Model']] = \
    df_lda['Study Design'].apply(extract_study_design_components).apply(pd.Series)

In [74]:
df_lda['Masking'] = df_lda['Masking'].apply(lambda x: str(x)[0:].replace(',', ' '))

In [75]:
df=df_lda.copy()

<span style="color: blue;">'Study Title' column was filtered using these newly created categorical features to effectively extract the objectives of the clinical trials.</span>

In [76]:
df['DEVICE'] = df['DEVICE'].astype(str)
df['PROCEDURE'] = df['PROCEDURE'].astype(str)
df['DRUG'] = df['DRUG'].astype(str)
df['RADIATION'] = df['RADIATION'].astype(str)
df['OTHER'] = df['OTHER'].astype(str)
df['BIOLOGICAL'] = df['BIOLOGICAL'].astype(str)

In [77]:
import re

In [78]:
def remove_symbols(text):
    text = re.sub(r'[^\w\s\d]', '',text)
    return text

In [79]:
df['DEVICE'] = df['DEVICE'].apply(remove_symbols)
df['PROCEDURE'] = df['PROCEDURE'].apply(remove_symbols)
df['DRUG'] = df['DRUG'].apply(remove_symbols)
df['RADIATION'] = df['RADIATION'].apply(remove_symbols)
df['OTHER'] = df['OTHER'].apply(remove_symbols)
df['BIOLOGICAL'] = df['BIOLOGICAL'].apply(remove_symbols)

In [80]:
df['Study Title'] = df['Study Title'].str.lower()
df['condition_1'] = df['condition_1'].str.lower()
df['condition_2'] = df['condition_2'].str.lower()
df['condition_3'] = df['condition_3'].str.lower()
df['condition_4'] = df['condition_4'].str.lower()
df['condition_5'] = df['condition_5'].str.lower()
df['condition_6'] = df['condition_6'].str.lower()


In [81]:
df['DEVICE'] = df['DEVICE'].str.lower()
df['PROCEDURE'] = df['PROCEDURE'].str.lower()
df['DRUG'] = df['DRUG'].str.lower()
df['RADIATION'] = df['RADIATION'].str.lower()
df['OTHER'] = df['OTHER'].str.lower()
df['BIOLOGICAL'] = df['BIOLOGICAL'].str.lower()

In [82]:
import pandas as pd

def remove_shared_words(df, target_col, other_cols):

    def remove_shared_words_from_row(row):
        target_words = set(row[target_col].split())
        other_words = set()
        for col in other_cols:
            other_words.update(row[col].split())
        words_to_keep = target_words - other_words
        return ' '.join(words_to_keep)

    df[target_col] = df.apply(remove_shared_words_from_row, axis=1)
    return df


In [83]:
df["OTHER"]=df["OTHER"].astype(str)

In [89]:
df['Study Title'] = df['Study Title'].fillna('') 

In [90]:
df['DEVICE'] = df['DEVICE'].fillna('') 
df['PROCEDURE'] = df['PROCEDURE'].fillna('') 
df['RADIATION'] = df['RADIATION'].fillna('') 
df['DRUG'] = df['DRUG'].fillna('') 
df['BIOLOGICAL'] = df['BIOLOGICAL'].fillna('') 
df['OTHER'] = df['OTHER'].fillna('')

In [91]:
df['condition_1'] = df['condition_1'].fillna('')
df['condition_2'] = df['condition_2'].fillna('')
df['condition_3'] = df['condition_3'].fillna('')
df['condition_4'] = df['condition_4'].fillna('')
df['condition_5'] = df['condition_5'].fillna('')
df['condition_6'] = df['condition_6'].fillna('')

In [92]:
df = remove_shared_words(df, 'Study Title', ['DEVICE','RADIATION','PROCEDURE','DRUG','BIOLOGICAL','OTHER','condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5','condition_6'])

# WORD EMBEDDING

In [93]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text) 
    stop_words = set(stopwords.words('english')) 
    filtered_words = [word for word in words if word not in stop_words] 
    return filtered_words 

In [94]:
import pandas as pd
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import string

In [95]:
df['Study Title'] = df['Study Title'].apply(preprocess_text)

In [96]:
df['Study Title'] = df['Study Title'].apply(lambda x: str(x)[1:-1].replace(',', ' '))

In [97]:
df['Study Title'] = df['Study Title'].apply(remove_symbols)

## Data preprocessing for word embedding

In [98]:
df_lda=df.copy()

In [99]:
df_lda = df_lda.replace('nan', np.nan)

In [100]:
df_lda['Study Title'] = df_lda['Study Title'].astype(str)

In [101]:
df_lda['Allocation']=df_lda['Allocation'].fillna('')
df_lda['Intervention Model']=df_lda['Intervention Model'].fillna('')
df_lda['Masking']=df_lda['Masking'].fillna('')
df_lda['Primary Purpose']=df_lda['Primary Purpose'].fillna('')
df_lda['Observational Model']=df_lda['Observational Model'].fillna('')

In [102]:
df_lda['Allocation'] = df_lda['Allocation'].astype(str)
df_lda['Intervention Model'] = df_lda['Intervention Model'].astype(str)
df_lda['Masking'] = df_lda['Masking'].astype(str)
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].astype(str)
df_lda['Observational Model'] = df_lda['Observational Model'].astype(str)

In [103]:
df_lda['Allocation'] = df_lda['Allocation'].str.lower()
df_lda['Intervention Model'] = df_lda['Intervention Model'].str.lower()
df_lda['Masking'] = df_lda['Masking'].str.lower()
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].str.lower()
df_lda['Observational Model'] = df_lda['Observational Model'].str.lower()

In [104]:
df_lda['Allocation'] = df_lda['Allocation'].str.replace("_", " ")
df_lda['Intervention Model'] = df_lda['Intervention Model'].str.replace("_", " ")
df_lda['Masking'] = df_lda['Masking'].str.replace("_", " ")
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].str.replace("_", " ")
df_lda['Observational Model'] = df_lda['Observational Model'].str.replace("_", " ")

In [105]:
df_lda['DEVICE'] = df_lda['DEVICE'].fillna('') 
df_lda['PROCEDURE'] = df_lda['PROCEDURE'].fillna('') 
df_lda['RADIATION'] = df_lda['RADIATION'].fillna('') 
df_lda['DRUG'] = df_lda['DRUG'].fillna('') 
df_lda['BIOLOGICAL'] = df_lda['BIOLOGICAL'].fillna('') 
df_lda['OTHER'] = df_lda['OTHER'].fillna('') 

In [106]:
df_lda['condition_2'] = df_lda['condition_2'].fillna('')
df_lda['condition_3'] = df_lda['condition_3'].fillna('')
df_lda['condition_4'] = df_lda['condition_4'].fillna('')
df_lda['condition_5'] = df_lda['condition_5'].fillna('')
df_lda['condition_6'] = df_lda['condition_6'].fillna('')

In [107]:
df_lda['condition_2'] = df_lda['condition_2'].astype(str)
df_lda['condition_3'] = df_lda['condition_3'].astype(str)
df_lda['condition_4'] = df_lda['condition_4'].astype(str)
df_lda['condition_5'] = df_lda['condition_5'].astype(str)
df_lda['condition_6'] = df_lda['condition_6'].astype(str)

In [108]:
dfw=df_lda.copy()

In [109]:
df_lda=dfw.copy()

## Getting the Vectors

In [110]:
from gensim.models import KeyedVectors
import pandas as pd

In [111]:
model_path = "bio_embedding_extrinsic"

In [112]:
model = KeyedVectors.load_word2vec_format(model_path, binary=True, encoding='utf-8')

In [113]:
def get_sentence_vector(sentence):
    words = sentence.split()
    word_vectors = [model[word] for word in words if word in model.key_to_index]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [114]:
df_lda['Study Title_vectors'] = df_lda['Study Title'].apply(get_sentence_vector)

In [115]:
dfc=df_lda.copy()

In [116]:
df_lda=dfc.copy()

In [117]:
df_lda['condition_1'] = df_lda['condition_1'].apply(get_sentence_vector)
df_lda['condition_2'] = df_lda['condition_2'].apply(get_sentence_vector)
df_lda['condition_3'] = df_lda['condition_3'].apply(get_sentence_vector)
df_lda['condition_4'] = df_lda['condition_4'].apply(get_sentence_vector)
df_lda['condition_5'] = df_lda['condition_5'].apply(get_sentence_vector)
df_lda['condition_6'] = df_lda['condition_6'].apply(get_sentence_vector)

In [118]:
df_lda['DEVICE'] = df_lda['DEVICE'].apply(get_sentence_vector)
df_lda['PROCEDURE'] = df_lda['PROCEDURE'].apply(get_sentence_vector)
df_lda['RADIATION'] = df_lda['RADIATION'].apply(get_sentence_vector)
df_lda['DRUG'] = df_lda['DRUG'].apply(get_sentence_vector)
df_lda['BIOLOGICAL'] = df_lda['BIOLOGICAL'].apply(get_sentence_vector)
df_lda['OTHER'] = df_lda['OTHER'].apply(get_sentence_vector)

In [119]:
df_lda['Allocation'] = df_lda['Allocation'].apply(get_sentence_vector)
df_lda['Intervention Model'] = df_lda['Intervention Model'].apply(get_sentence_vector)
df_lda['Masking'] = df_lda['Masking'].apply(get_sentence_vector)
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].apply(get_sentence_vector)
df_lda['Observational Model'] = df_lda['Observational Model'].apply(get_sentence_vector)

In [120]:
df_lda['DEVICE'] = df_lda['DEVICE'].apply(np.mean)
df_lda['PROCEDURE'] = df_lda['PROCEDURE'].apply(np.mean)
df_lda['RADIATION'] = df_lda['RADIATION'].apply(np.mean)
df_lda['DRUG'] = df_lda['DRUG'].apply(np.mean)
df_lda['BIOLOGICAL'] = df_lda['BIOLOGICAL'].apply(np.mean)
df_lda['OTHER'] = df_lda['OTHER'].apply(np.mean)

In [121]:
df_lda['Allocation'] = df_lda['Allocation'].apply(np.mean)
df_lda['Intervention Model'] = df_lda['Intervention Model'].apply(np.mean)
df_lda['Masking'] = df_lda['Masking'].apply(np.mean)
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].apply(np.mean)
df_lda['Observational Model'] = df_lda['Observational Model'].apply(np.mean)

In [122]:
df_lda['condition_2'] = df_lda['condition_2'].apply(np.mean)
df_lda['condition_3'] = df_lda['condition_3'].apply(np.mean)
df_lda['condition_4'] = df_lda['condition_4'].apply(np.mean)
df_lda['condition_1'] = df_lda['condition_1'].apply(np.mean)
df_lda['condition_5'] = df_lda['condition_5'].apply(np.mean)
df_lda['condition_6'] = df_lda['condition_6'].apply(np.mean)

In [123]:
df_lda['Study Title_vectors'] = df_lda['Study Title_vectors'].apply(np.mean)

In [154]:
dfz=df_lda.copy()

# Handling the missing values

In [124]:
df_lda.drop(['Interventions'],axis=1,inplace=True)

In [155]:
column_mode = df_lda['Sex'].mode()[0]
df_lda['Sex'].fillna(column_mode, inplace=True)

In [126]:
column_mode = df_lda['child'].mode()[0]
df_lda['child'].fillna(column_mode, inplace=True)

In [127]:
column_mode = df_lda['older_adult'].mode()[0]
df_lda['older_adult'].fillna(column_mode, inplace=True)

In [128]:
column_mode = df_lda['adult'].mode()[0]
df_lda['adult'].fillna(column_mode, inplace=True)

In [129]:
column_mode = df_lda['healthy_volunteers'].mode()[0]
df_lda['healthy_volunteers'].fillna(column_mode, inplace=True)

In [157]:
column_mode = df_lda['Enrollment'].mode()[0]
df_lda['Enrollment'].fillna(column_mode, inplace=True)

In [132]:
df_lda['count'] = df_lda['count'].fillna(0)

In [159]:
column_mode = df_lda['Study Type'].mode()[0]
df_lda['Study Type'].fillna(column_mode, inplace=True)

In [None]:
column_mode = df_lda['Duration_Days'].mode()[0]
df_lda['Duration_Days'].fillna(column_mode, inplace=True)

In [160]:
df_kk=df_lda.copy()

# Encoding several categorical variables in the df_lda DataFrame using a combination of string replacements and dictionary mappings

In [134]:
from sklearn.preprocessing import LabelEncoder

In [135]:
le = LabelEncoder()

In [None]:
df_lda['Study Status'] = df_lda['Study Status'].replace({'TERMINATED': 'NOT COMPLETED', 'WITHDRAWN': 'NOT COMPLETED','SUSPENDED' : 'NOT COMPLETED'})

In [None]:
category_order = {'COMPLETED': 1, 'NOT COMPLETED': 0}

In [136]:
df_lda['Study Status']=df_lda['Study Status'].map(category_order)

KeyError: 'Study Status'

In [137]:
category_order = {'INTERVENTIONAL' :1, 'OBSERVATIONAL':2}

In [138]:
df_lda['Study Type'] = df_lda['Study Type'].map(category_order)

In [139]:
category_order = {'ALL' :1, 'FEMALE':2, 'MALE' :3}

In [140]:
df_lda['Sex'] = df_lda['Sex'].map(category_order)

In [141]:
df_lda['adult'] = le.fit_transform(df_lda['adult'])

In [142]:
df_lda['older_adult'] = le.fit_transform(df_lda['older_adult'])

In [143]:
df_lda['child'] = le.fit_transform(df_lda['child'])

In [144]:
df_lda['healthy_volunteers'] = le.fit_transform(df_lda['healthy_volunteers'])

In [145]:
df_c=df_lda.copy()

In [146]:
df_lda=df_c.copy()

In [152]:
df_lda['Enrollment'] = pd.to_numeric(df_lda['Enrollment'], errors='coerce')

In [167]:
df_lda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64851 entries, 0 to 64850
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Sex                  64851 non-null  float64
 1   Study Type           64851 non-null  float64
 2   healthy_volunteers   64851 non-null  int32  
 3   adult                64851 non-null  int32  
 4   child                64851 non-null  int32  
 5   older_adult          64851 non-null  int32  
 6   condition_1          64851 non-null  float64
 7   condition_2          64851 non-null  float64
 8   condition_3          64851 non-null  float64
 9   condition_4          64851 non-null  float64
 10  condition_5          64851 non-null  float64
 11  condition_6          64851 non-null  float64
 12  DEVICE               64851 non-null  float64
 13  PROCEDURE            64851 non-null  float64
 14  RADIATION            64851 non-null  float64
 15  DRUG                 64851 non-null 

In [162]:
df_lda['participants'] = df_lda['Enrollment'] - df_lda['count']

In [163]:
df_lda['participants'] = (df_lda['participants'] >= 10).astype(int)

In [164]:
df_lda.drop(['NCT Number','Study Results'],axis=1,inplace=True)

KeyError: "['NCT Number', 'Study Results'] not found in axis"

In [166]:
df_lda.drop(['Study Title','Study Design','Enrollment','count'],axis=1,inplace=True)

In [168]:
df_lda

Unnamed: 0,Sex,Study Type,healthy_volunteers,adult,child,older_adult,condition_1,condition_2,condition_3,condition_4,...,DRUG,BIOLOGICAL,OTHER,Allocation,Intervention Model,Masking,Primary Purpose,Observational Model,Study Title_vectors,participants
0,1.0,2.0,1,1,0,1,-0.023264,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,-0.033501,0.000000,0.000000,0.000000,0.000000,-0.017444,0
1,2.0,1.0,0,1,1,1,-0.025159,0.000000,0.000000,0.0,...,-0.024208,0.000000,0.000000,0.000000,-0.035152,-0.009766,-0.008161,-0.018044,-0.025835,1
2,1.0,1.0,0,1,0,1,-0.017094,-0.014865,-0.014493,0.0,...,0.000000,0.000000,-0.027875,0.000000,-0.035152,-0.009766,-0.016295,-0.020067,-0.017100,1
3,1.0,1.0,0,1,0,1,-0.035565,-0.025623,0.000000,0.0,...,0.000000,0.000000,-0.008715,0.000000,-0.026669,-0.011547,-0.016295,-0.020067,-0.024387,0
4,3.0,2.0,1,1,0,1,-0.032258,-0.017410,0.000000,0.0,...,0.000000,0.000000,0.000000,-0.033501,0.000000,0.000000,0.000000,0.000000,-0.025473,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64846,1.0,1.0,0,1,0,1,-0.023055,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,-0.031133,-0.011547,-0.016295,0.012681,-0.007872,1
64847,1.0,1.0,0,1,0,0,-0.015890,0.000000,0.000000,0.0,...,-0.019946,0.000000,0.000000,0.000000,-0.035152,-0.009766,-0.008161,-0.020067,-0.025712,1
64848,1.0,1.0,0,1,0,1,-0.004175,0.000000,0.000000,0.0,...,-0.009395,0.000000,0.000000,0.000000,-0.035152,-0.009766,-0.006947,-0.020067,-0.035918,1
64849,1.0,1.0,0,0,1,0,-0.014980,0.000000,0.000000,0.0,...,0.000000,-0.012909,0.000000,0.000000,-0.035152,-0.009766,-0.016295,-0.020067,-0.023179,1


## Splitting the dataset for model building(training & testing) and validation set as unseen data

In [169]:
completed_df = df_lda[df_lda['Study Status'] == 1]

KeyError: 'Study Status'

In [435]:
not_completed_df = df_lda[df_lda['Study Status'] == 0]

In [436]:
notc=not_completed_df.iloc[0:25000,0:27]

In [437]:
c=completed_df.iloc[0:175000,0:27]

In [438]:
df_trial = pd.concat([notc, c], ignore_index=True)

In [439]:
df_trial = df_trial.sample(frac=1, random_state=42)

In [440]:
notc=not_completed_df.iloc[25000:36335,0:27]

In [441]:
c=completed_df.iloc[200000:221244,0:27]

In [442]:
df_test = pd.concat([notc, c], ignore_index=True)

In [443]:
df_test = df_test.sample(frac=1, random_state=42)

## Machine Learning Model Development on 200k Imbalanced Dataset (SMOTE)

In [170]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [445]:
X= df_trial.drop('Study Status',axis = 1)
y = df_trial['Study Status']

In [446]:
print("Original dataset shape %s" % Counter(y))

smote = SMOTE(sampling_strategy='minority') 
X_smote, y_smote = smote.fit_resample(X, y)

print("Resampled dataset shape %s" % Counter(y_smote))

Original dataset shape Counter({1: 175000, 0: 25000})
Resampled dataset shape Counter({1: 175000, 0: 175000})


In [447]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_smote,y_smote,test_size=0.3,random_state=1)

In [450]:
from sklearn import metrics

<span style="color: blue; ">Gradient Boosting Classifiers (GBC), is chosen due to its ability to handle complex relationships within data. GBC excels in accuracy, flexibility, and robustness. Its inherent mechanisms like early stopping and regularization prevent overfitting, ensuring reliable predictions on unseen data. Moreover, GBC's adaptability to different loss functions allows for customization to specific classification needs."</span>

In [448]:
from sklearn.ensemble import GradientBoostingClassifier
GB_model=GradientBoostingClassifier(n_estimators=100,random_state=1)
GB_model.fit(X_train,y_train)

In [451]:
y_train_predict = GB_model.predict(X_train)
model_score = GB_model.score(X_train,y_train)
print(model_score)
print()
print(metrics.confusion_matrix (y_train,y_train_predict) )
print(metrics.classification_report(y_train,y_train_predict))

0.9050938775510204

[[105748  16818]
 [  6434 116000]]
              precision    recall  f1-score   support

           0       0.94      0.86      0.90    122566
           1       0.87      0.95      0.91    122434

    accuracy                           0.91    245000
   macro avg       0.91      0.91      0.90    245000
weighted avg       0.91      0.91      0.90    245000



In [452]:
y_test_predict =GB_model.predict(X_test)
model_score = GB_model.score(X_test,y_test)
print(model_score)
print()
print(metrics.confusion_matrix (y_test,y_test_predict) )
print(metrics.classification_report(y_test,y_test_predict))

0.9039904761904762

[[45171  7263]
 [ 2818 49748]]
              precision    recall  f1-score   support

           0       0.94      0.86      0.90     52434
           1       0.87      0.95      0.91     52566

    accuracy                           0.90    105000
   macro avg       0.91      0.90      0.90    105000
weighted avg       0.91      0.90      0.90    105000



# Hyperparameter Optimization

In [455]:
from sklearn.model_selection import GridSearchCV

In [456]:
param_grid = {
    'n_estimators': [50, 100,150, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7,9],
    'min_samples_split': [2, 5, 10,15],
    'min_samples_leaf': [1, 2, 4,6]
}

In [457]:
grid_search = GridSearchCV(estimator=GB_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
y_train_predict = best_model.predict(X_train)
model_score = best_model.score(X_train,y_train)
print(model_score)
print()
print(metrics.confusion_matrix (y_train,y_train_predict) )
print(metrics.classification_report(y_train,y_train_predict))

In [None]:
y_test_predict =best_model.predict(X_test)
model_score = best_model.score(X_test,y_test)
print(model_score)
print()
print(metrics.confusion_matrix (y_test,y_test_predict) )
print(metrics.classification_report(y_test,y_test_predict))

# Evaluating Model Performance on a Hold-out Set of 55k Unseen Data Points

In [None]:
df_t=df_test.drop(['Study Status'],axis=1)

In [None]:
dy = df_test['Study Status']

In [None]:
new_predictions = best_model.predict(df_t) 

In [None]:
model_score = best_model.score(df_t,dy)
print(model_score)
print()
print(metrics.confusion_matrix (dy,new_predictions) )
print(metrics.classification_report(dy,new_predictions))

In [None]:
import joblib

In [None]:
from joblib import dump

In [None]:
joblib.dump(best_model, 'gradient_boosting_model.joblib') 

In [None]:
best_model = joblib.load('gradient_boosting_model.joblib')

In [1]:
import pandas as pd

In [3]:
df_11=pd.read_csv('usecase_3_TEST_gt_removed.csv')

  df_11=pd.read_csv('usecase_3_TEST_gt_removed.csv')


In [4]:
start_col =31
end_col = 3700

# Drop columns using integer-based indexing
df_11 = df_11.drop(df_11.columns[start_col-1:end_col], axis=1)

In [5]:
df_11.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64851 entries, 0 to 64850
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Unnamed: 0                  64681 non-null  object
 1   NCT Number                  64578 non-null  object
 2   Study Title                 64540 non-null  object
 3   Study URL                   64539 non-null  object
 4   Acronym                     15949 non-null  object
 5   Brief Summary               64520 non-null  object
 6   Study Results               64516 non-null  object
 7   Conditions                  64501 non-null  object
 8   Interventions               58562 non-null  object
 9   Primary Outcome Measures    61881 non-null  object
 10  Secondary Outcome Measures  46432 non-null  object
 11  Other Outcome Measures      4635 non-null   object
 12  Sponsor                     64441 non-null  object
 13  Collaborators               20917 non-null  ob

In [6]:
df=df_11.copy()

In [172]:
from joblib import load

In [173]:
loaded_model = load('gradient_boosting_model_2.joblib')

In [174]:
loaded_model

In [176]:
X_test=df_lda.copy()

In [177]:
predictions = loaded_model.predict(X_test) 

# Add the predictions to the test data
df_lda['Predictions'] = predictions


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Duration_Days
- Study Design
