In [2]:
import pandas as pd

In [151]:
import numpy as np

# Merging the required data provided in usecase_3, withdrwals.txt and eligibilities.txt


<span style="color: blue;"> We integrated relevant data points extracted from 'usecase.csv', 'eligibilities.txt', and 'withdrawls.txt' files.
Firstly we converted the 'eligibilities.txt' and 'withdrawls.txt' files into either CSV or Excel format for compatibility.</span>

In [287]:
df= pd.read_csv('usecase_3_.csv')

In [4]:
df_2=pd.read_excel('withdrawls.xlsx')

In [5]:
df_4=pd.read_excel('elgb.xlsx')

In [6]:
df_renamed = df_2.rename(columns={'nct_id': 'NCT Number'})

In [7]:
df_renamed.drop(['ctgov_group_code','period'],axis=1,inplace=True)

In [8]:
df_3=df_renamed.drop(['drop_withdraw_comment','reason_comment','count_units'],axis=1)

In [9]:
df_3.drop(['id','result_group_id','reason'],axis=1,inplace=True)

In [10]:
df_grouped = df_renamed.groupby('NCT Number')['reason'].apply(lambda x: ', '.join(x)).reset_index()

df_merged = pd.merge(df_grouped, df_renamed.groupby('NCT Number')['count'].sum().reset_index(), on='NCT Number')

In [288]:
merged_dff = pd.merge(df, df_merged, on='NCT Number', how='left')

In [268]:
df_4 = df_4.rename(columns={'nct_id': 'NCT Number'})

In [None]:
df_4.drop(['id','sampling_method','gender','minimum_age','maximum_age','population','gender_description','gender_based'],axis=1,inplace=True)

In [289]:
final_df = pd.merge(merged_dff, df_4 , on='NCT Number', how='left')

In [290]:
df_lda=final_df.copy()

## We pruned the dataset to include only the essential features

In [291]:
df_lda.drop(['Brief Summary','Acronym','Study URL', 'Unnamed: 0', 'Unnamed: 0.1'],axis =1,inplace=True )

In [292]:
df_lda.drop(['Primary Completion Date'],axis =1,inplace=True )

In [293]:
df_lda.drop(['Other IDs','First Posted','reason','criteria',
       'Results First Posted', 'Last Update Posted', 'Locations',
       'Study Documents','Funder Type','Other Outcome Measures', 'Sponsor',
       'Collaborators','Age','Primary Outcome Measures','Secondary Outcome Measures','Phases'],axis =1,inplace=True )

In [294]:
df_lda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257577 entries, 0 to 257576
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   NCT Number          257577 non-null  object 
 1   Study Title         257577 non-null  object 
 2   Study Status        257577 non-null  object 
 3   Study Results       257577 non-null  object 
 4   Conditions          257577 non-null  object 
 5   Interventions       234064 non-null  object 
 6   Sex                 257317 non-null  object 
 7   Enrollment          254205 non-null  float64
 8   Study Type          257577 non-null  object 
 9   Study Design        257577 non-null  object 
 10  Start Date          255245 non-null  object 
 11  Completion Date     249686 non-null  object 
 12  count               31108 non-null   float64
 13  healthy_volunteers  252640 non-null  object 
 14  adult               257154 non-null  object 
 15  child               257154 non-nul

In [295]:
def convert_date_format(date_str):
    try:
        if len(str(date_str)) == 7:
            return f'{str(date_str)}-01' 
        else:
            return str(date_str)
    except (TypeError, ValueError):
        return None  

In [296]:
df_lda['Completion Date'] =df_lda['Completion Date'].apply(convert_date_format)

In [297]:
df_lda['Start Date'] =df_lda['Start Date'].apply(convert_date_format)

In [298]:
df_lda['Start Date'] = pd.to_datetime(df_lda['Start Date'])
df_lda['Completion Date'] = pd.to_datetime(df_lda['Completion Date'])

df_lda['Duration_Days'] = (df_lda['Completion Date'] - df_lda['Start Date']).dt.days

In [299]:
df_lda.drop(['Start Date','Completion Date'],axis=1,inplace=True)

In [300]:
df=df_lda.copy()

# Minimizing information loss

<span style="color: blue;">To minimize information loss, three of the four selected textual columns (Condition, Interventions, Study Design) were transformed into a set of categorical features. Subsequently, the 'Study Title' column was filtered using these newly created categorical features to effectively extract the objectives of the clinical trials. Study title feature was then preprocessed for word embedding</span>

# For Conditions column

In [302]:

df['Conditions'] = df['Conditions'].str.replace('|', ',')

# Split and create a list of new column names
new_column_names = [f'condition_{i+1}' for i in range(df['Conditions'].str.split(',').str.len().max())]

# Split 'Conditions' and create a list of lists
conditions_list = df['Conditions'].str.split(',').tolist()

# Create a new DataFrame with the split values
new_df = pd.DataFrame(conditions_list, columns=new_column_names)

# Concatenate the original DataFrame with the new one
df = pd.concat([df, new_df], axis=1)

# Drop the original 'Conditions' column
df = df.drop('Conditions', axis=1)

In [303]:
df_lda=df.copy()

In [304]:
start_col = 35
end_col = 175

# Drop columns using integer-based indexing
df_lda = df_lda.drop(df_lda.columns[start_col-1:end_col], axis=1) 

In [305]:
start_col = 22
end_col = 57

# Drop columns using integer-based indexing
df_lda = df_lda.drop(df_lda.columns[start_col-1:end_col], axis=1) 

In [307]:
df=df_lda.copy()

# For Interventions column

In [308]:
df['Interventions'] = df['Interventions'].astype(str) 

In [309]:
def extract_device_procedure(text):
    DEVICE=[]
    DRUG=[]
    PROCEDURE=[]
    BIOLOGICAL=[]
    OTHER=[]
    RADIATION=[]
    device = None
    procedure = None
    drug = None
    biological = None
    other = None
    radiation = None
    if isinstance(text, str):
        for item in text.split('|'):
            item = item.strip()
            if item.startswith('DEVICE:'):
                device = item[len('DEVICE:'):]
                DEVICE.append(device)
            elif item.startswith('PROCEDURE:'):
                procedure = item[len('PROCEDURE:'):]
                PROCEDURE.append(procedure)
            elif item.startswith('BIOLOGICAL:'):
                biological = item[len('BIOLOGICAL:'):]
                BIOLOGICAL.append(biological)
            elif item.startswith('OTHER:'):
                other = item[len('OTHER:'):]
                OTHER.append(other)
            elif item.startswith('DRUG:'):
                drug = item[len('DRUG:'):]
                DRUG.append(drug)
            elif item.startswith('RADIATION:'):
                radiation = item[len('RADIATION:'):]
                RADIATION.append(radiation)
    return {'DEVICE': DEVICE, 'PROCEDURE': PROCEDURE,'RADIATION':RADIATION,'DRUG':DRUG,'OTHER':OTHER,'BIOLOGICAL':BIOLOGICAL}

In [310]:
df[['DEVICE', 'PROCEDURE','RADIATION','DRUG','BIOLOGICAL','OTHER']] = df['Interventions'].apply(extract_device_procedure).apply(pd.Series)

In [312]:
df_lda=df.copy()

# For Study Design column

In [313]:
def extract_study_design_components(text):
    """
    Extracts components from the study design string.

    Args:
        text: The input string describing the study design.

    Returns:
        A dictionary containing extracted components.
    """
    components = {}
    if isinstance(text, str):
        parts = text.split('|')
        for part in parts:
            if part.startswith('Allocation:'):
                components['Allocation'] = part.split(':')[1].strip()
            elif part.startswith('Intervention Model:'):
                components['Intervention Model'] = part.split(':')[1].strip()
            elif part.startswith('Masking:'):
                components['Masking'] = part.split(':')[1].strip()
            elif part.startswith('Primary Purpose:'):
                components['Primary Purpose'] = part.split(':')[1].strip()
            elif part.startswith('Observational Model:'):
                components['Observational Model'] = part.split(':')[0].strip()
    return components

In [314]:
df_lda[['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose', 'Observational Model']] = \
    df_lda['Study Design'].apply(extract_study_design_components).apply(pd.Series)

In [315]:
df_lda['Masking'] = df_lda['Masking'].apply(lambda x: str(x)[0:].replace(',', ' '))

In [317]:
df=df_lda.copy()

<span style="color: blue;">'Study Title' column was filtered using these newly created categorical features to effectively extract the objectives of the clinical trials.</span>

In [318]:
df['DEVICE'] = df['DEVICE'].astype(str)
df['PROCEDURE'] = df['PROCEDURE'].astype(str)
df['DRUG'] = df['DRUG'].astype(str)
df['RADIATION'] = df['RADIATION'].astype(str)
df['OTHER'] = df['OTHER'].astype(str)
df['BIOLOGICAL'] = df['BIOLOGICAL'].astype(str)

In [319]:
import re

In [320]:
def remove_symbols(text):
    text = re.sub(r'[^\w\s\d]', '',text)
    return text

In [321]:
df['DEVICE'] = df['DEVICE'].apply(remove_symbols)
df['PROCEDURE'] = df['PROCEDURE'].apply(remove_symbols)
df['DRUG'] = df['DRUG'].apply(remove_symbols)
df['RADIATION'] = df['RADIATION'].apply(remove_symbols)
df['OTHER'] = df['OTHER'].apply(remove_symbols)
df['BIOLOGICAL'] = df['BIOLOGICAL'].apply(remove_symbols)

In [322]:
df['Study Title'] = df['Study Title'].str.lower()
df['condition_1'] = df['condition_1'].str.lower()
df['condition_2'] = df['condition_2'].str.lower()
df['condition_3'] = df['condition_3'].str.lower()
df['condition_4'] = df['condition_4'].str.lower()
df['condition_5'] = df['condition_5'].str.lower()
df['condition_6'] = df['condition_6'].str.lower()


In [323]:
df['DEVICE'] = df['DEVICE'].str.lower()
df['PROCEDURE'] = df['PROCEDURE'].str.lower()
df['DRUG'] = df['DRUG'].str.lower()
df['RADIATION'] = df['RADIATION'].str.lower()
df['OTHER'] = df['OTHER'].str.lower()
df['BIOLOGICAL'] = df['BIOLOGICAL'].str.lower()

In [324]:
import pandas as pd

def remove_shared_words(df, target_col, other_cols):

    def remove_shared_words_from_row(row):
        target_words = set(row[target_col].split())
        other_words = set()
        for col in other_cols:
            other_words.update(row[col].split())
        words_to_keep = target_words - other_words
        return ' '.join(words_to_keep)

    df[target_col] = df.apply(remove_shared_words_from_row, axis=1)
    return df


In [325]:
df["OTHER"]=df["OTHER"].astype(str)

In [326]:
df['condition_2'] = df['condition_2'].fillna('')
df['condition_3'] = df['condition_3'].fillna('')
df['condition_4'] = df['condition_4'].fillna('')
df['condition_5'] = df['condition_5'].fillna('')
df['condition_6'] = df['condition_6'].fillna('')

In [327]:
df = remove_shared_words(df, 'Study Title', ['DEVICE','RADIATION','PROCEDURE','DRUG','BIOLOGICAL','OTHER','condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5','condition_6'])

# WORD EMBEDDING

In [328]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text) 
    stop_words = set(stopwords.words('english')) 
    filtered_words = [word for word in words if word not in stop_words] 
    return filtered_words 

In [329]:
import pandas as pd
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import string

In [330]:
df['Study Title'] = df['Study Title'].fillna('')

In [331]:
df['Study Title'] = df['Study Title'].apply(preprocess_text)

In [332]:
df['Study Title'] = df['Study Title'].apply(lambda x: str(x)[1:-1].replace(',', ' '))

In [333]:
df['Study Title'] = df['Study Title'].apply(remove_symbols)

## Data preprocessing for word embedding

In [334]:
df_lda=df.copy()

In [335]:
df_lda = df_lda.replace('nan', np.nan)

In [336]:
df_lda['Study Title'] = df_lda['Study Title'].astype(str)

In [337]:
df_lda['Allocation']=df_lda['Allocation'].fillna('')
df_lda['Intervention Model']=df_lda['Intervention Model'].fillna('')
df_lda['Masking']=df_lda['Masking'].fillna('')
df_lda['Primary Purpose']=df_lda['Primary Purpose'].fillna('')
df_lda['Observational Model']=df_lda['Observational Model'].fillna('')

In [338]:
df_lda['Allocation'] = df_lda['Allocation'].astype(str)
df_lda['Intervention Model'] = df_lda['Intervention Model'].astype(str)
df_lda['Masking'] = df_lda['Masking'].astype(str)
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].astype(str)
df_lda['Observational Model'] = df_lda['Observational Model'].astype(str)

In [339]:
df_lda['Allocation'] = df_lda['Allocation'].str.lower()
df_lda['Intervention Model'] = df_lda['Intervention Model'].str.lower()
df_lda['Masking'] = df_lda['Masking'].str.lower()
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].str.lower()
df_lda['Observational Model'] = df_lda['Observational Model'].str.lower()

In [340]:
df_lda['Allocation'] = df_lda['Allocation'].str.replace("_", " ")
df_lda['Intervention Model'] = df_lda['Intervention Model'].str.replace("_", " ")
df_lda['Masking'] = df_lda['Masking'].str.replace("_", " ")
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].str.replace("_", " ")
df_lda['Observational Model'] = df_lda['Observational Model'].str.replace("_", " ")

In [341]:
df_lda['DEVICE'] = df_lda['DEVICE'].fillna('') 
df_lda['PROCEDURE'] = df_lda['PROCEDURE'].fillna('') 
df_lda['RADIATION'] = df_lda['RADIATION'].fillna('') 
df_lda['DRUG'] = df_lda['DRUG'].fillna('') 
df_lda['BIOLOGICAL'] = df_lda['BIOLOGICAL'].fillna('') 
df_lda['OTHER'] = df_lda['OTHER'].fillna('') 

In [342]:
df_lda['condition_2'] = df_lda['condition_2'].fillna('')
df_lda['condition_3'] = df_lda['condition_3'].fillna('')
df_lda['condition_4'] = df_lda['condition_4'].fillna('')
df_lda['condition_5'] = df_lda['condition_5'].fillna('')
df_lda['condition_6'] = df_lda['condition_6'].fillna('')

In [343]:
df_lda['condition_2'] = df_lda['condition_2'].astype(str)
df_lda['condition_3'] = df_lda['condition_3'].astype(str)
df_lda['condition_4'] = df_lda['condition_4'].astype(str)
df_lda['condition_5'] = df_lda['condition_5'].astype(str)
df_lda['condition_6'] = df_lda['condition_6'].astype(str)

In [344]:
dfw=df_lda.copy()

In [394]:
df_lda=dfw.copy()

## Getting the Vectors

In [345]:
from gensim.models import KeyedVectors
import pandas as pd

In [346]:
model_path = "bio_embedding_extrinsic"

In [347]:
model = KeyedVectors.load_word2vec_format(model_path, binary=True, encoding='utf-8')

In [348]:
def get_sentence_vector(sentence):
    words = sentence.split()
    word_vectors = [model[word] for word in words if word in model.key_to_index]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [349]:
df_lda['Study Title_vectors'] = df_lda['Study Title'].apply(get_sentence_vector)

In [350]:
dfc=df_lda.copy()

In [351]:
df_lda=dfc.copy()

In [352]:
df_lda['condition_1'] = df_lda['condition_1'].apply(get_sentence_vector)
df_lda['condition_2'] = df_lda['condition_2'].apply(get_sentence_vector)
df_lda['condition_3'] = df_lda['condition_3'].apply(get_sentence_vector)
df_lda['condition_4'] = df_lda['condition_4'].apply(get_sentence_vector)
df_lda['condition_5'] = df_lda['condition_5'].apply(get_sentence_vector)
df_lda['condition_6'] = df_lda['condition_6'].apply(get_sentence_vector)

In [353]:
df_lda['DEVICE'] = df_lda['DEVICE'].apply(get_sentence_vector)
df_lda['PROCEDURE'] = df_lda['PROCEDURE'].apply(get_sentence_vector)
df_lda['RADIATION'] = df_lda['RADIATION'].apply(get_sentence_vector)
df_lda['DRUG'] = df_lda['DRUG'].apply(get_sentence_vector)
df_lda['BIOLOGICAL'] = df_lda['BIOLOGICAL'].apply(get_sentence_vector)
df_lda['OTHER'] = df_lda['OTHER'].apply(get_sentence_vector)

In [354]:
df_lda['Allocation'] = df_lda['Allocation'].apply(get_sentence_vector)
df_lda['Intervention Model'] = df_lda['Intervention Model'].apply(get_sentence_vector)
df_lda['Masking'] = df_lda['Masking'].apply(get_sentence_vector)
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].apply(get_sentence_vector)
df_lda['Observational Model'] = df_lda['Observational Model'].apply(get_sentence_vector)

In [355]:
df_lda['DEVICE'] = df_lda['DEVICE'].apply(np.mean)
df_lda['PROCEDURE'] = df_lda['PROCEDURE'].apply(np.mean)
df_lda['RADIATION'] = df_lda['RADIATION'].apply(np.mean)
df_lda['DRUG'] = df_lda['DRUG'].apply(np.mean)
df_lda['BIOLOGICAL'] = df_lda['BIOLOGICAL'].apply(np.mean)
df_lda['OTHER'] = df_lda['OTHER'].apply(np.mean)

In [356]:
df_lda['Allocation'] = df_lda['Allocation'].apply(np.mean)
df_lda['Intervention Model'] = df_lda['Intervention Model'].apply(np.mean)
df_lda['Masking'] = df_lda['Masking'].apply(np.mean)
df_lda['Primary Purpose'] = df_lda['Primary Purpose'].apply(np.mean)
df_lda['Observational Model'] = df_lda['Observational Model'].apply(np.mean)

In [357]:
df_lda['condition_2'] = df_lda['condition_2'].apply(np.mean)
df_lda['condition_3'] = df_lda['condition_3'].apply(np.mean)
df_lda['condition_4'] = df_lda['condition_4'].apply(np.mean)
df_lda['condition_1'] = df_lda['condition_1'].apply(np.mean)
df_lda['condition_5'] = df_lda['condition_5'].apply(np.mean)
df_lda['condition_6'] = df_lda['condition_6'].apply(np.mean)

In [358]:
df_lda['Study Title_vectors'] = df_lda['Study Title_vectors'].apply(np.mean)

# Handling the missing values

In [395]:
df_lda.drop(['Interventions'],axis=1,inplace=True)

In [396]:
column_mode = df_lda['Sex'].mode()[0]
df_lda['Sex'].fillna(column_mode, inplace=True)

In [397]:
column_mode = df_lda['child'].mode()[0]
df_lda['child'].fillna(column_mode, inplace=True)

In [398]:
column_mode = df_lda['older_adult'].mode()[0]
df_lda['older_adult'].fillna(column_mode, inplace=True)

In [399]:
column_mode = df_lda['adult'].mode()[0]
df_lda['adult'].fillna(column_mode, inplace=True)

In [400]:
column_mode = df_lda['healthy_volunteers'].mode()[0]
df_lda['healthy_volunteers'].fillna(column_mode, inplace=True)

In [401]:
column_mode = df_lda['Duration_Days'].mode()[0]
df_lda['Duration_Days'].fillna(column_mode, inplace=True)

In [402]:
column_mode = df_lda['Enrollment'].mode()[0]
df_lda['Enrollment'].fillna(column_mode, inplace=True)

In [403]:
df_lda['count'] = df_lda['count'].fillna(0)

In [393]:
#df_kk=df_lda.copy()

# Encoding several categorical variables in the df_lda DataFrame using a combination of string replacements and dictionary mappings

In [405]:
from sklearn.preprocessing import LabelEncoder

In [406]:
le = LabelEncoder()

In [407]:
df_lda['Study Status'] = df_lda['Study Status'].replace({'TERMINATED': 'NOT COMPLETED', 'WITHDRAWN': 'NOT COMPLETED','SUSPENDED' : 'NOT COMPLETED'})

In [408]:
category_order = {'COMPLETED': 1, 'NOT COMPLETED': 0}

In [409]:
df_lda['Study Status']=df_lda['Study Status'].map(category_order)

In [410]:
category_order = {'INTERVENTIONAL' :1, 'OBSERVATIONAL':2}

In [411]:
df_lda['Study Type'] = df_lda['Study Type'].map(category_order)

In [412]:
category_order = {'ALL' :1, 'FEMALE':2, 'MALE' :3}

In [413]:
df_lda['Sex'] = df_lda['Sex'].map(category_order)

In [414]:
df_lda['adult'] = le.fit_transform(df_lda['adult'])

In [415]:
df_lda['older_adult'] = le.fit_transform(df_lda['older_adult'])

In [416]:
df_lda['child'] = le.fit_transform(df_lda['child'])

In [417]:
df_lda['healthy_volunteers'] = le.fit_transform(df_lda['healthy_volunteers'])

In [383]:
df_c=df_lda.copy()

In [425]:
df_lda=df_c.copy()

In [426]:
df_lda['participants'] = df_lda['Enrollment'] - df_lda['count']

In [427]:
df_lda['participants'] = (df_lda['participants'] >= 10).astype(int)

In [428]:
df_lda.drop(['NCT Number','Study Results'],axis=1,inplace=True)

In [429]:
df_lda.drop(['Study Title','Study Design','Enrollment','count'],axis=1,inplace=True)

## Splitting the dataset for model building(training & testing) and validation set as unseen data

In [434]:
completed_df = df_lda[df_lda['Study Status'] == 1]

In [435]:
not_completed_df = df_lda[df_lda['Study Status'] == 0]

In [436]:
notc=not_completed_df.iloc[0:25000,0:27]

In [437]:
c=completed_df.iloc[0:175000,0:27]

In [438]:
df_trial = pd.concat([notc, c], ignore_index=True)

In [439]:
df_trial = df_trial.sample(frac=1, random_state=42)

In [440]:
notc=not_completed_df.iloc[25000:36335,0:27]

In [441]:
c=completed_df.iloc[200000:221244,0:27]

In [442]:
df_test = pd.concat([notc, c], ignore_index=True)

In [443]:
df_test = df_test.sample(frac=1, random_state=42)

## Machine Learning Model Development on 200k Imbalanced Dataset (SMOTE)

In [444]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [445]:
X= df_trial.drop('Study Status',axis = 1)
y = df_trial['Study Status']

In [446]:
print("Original dataset shape %s" % Counter(y))

smote = SMOTE(sampling_strategy='minority') 
X_smote, y_smote = smote.fit_resample(X, y)

print("Resampled dataset shape %s" % Counter(y_smote))

Original dataset shape Counter({1: 175000, 0: 25000})
Resampled dataset shape Counter({1: 175000, 0: 175000})


In [447]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_smote,y_smote,test_size=0.3,random_state=1)

In [450]:
from sklearn import metrics

<span style="color: blue; ">Gradient Boosting Classifiers (GBC), is chosen due to its ability to handle complex relationships within data. GBC excels in accuracy, flexibility, and robustness. Its inherent mechanisms like early stopping and regularization prevent overfitting, ensuring reliable predictions on unseen data. Moreover, GBC's adaptability to different loss functions allows for customization to specific classification needs."</span>

In [448]:
from sklearn.ensemble import GradientBoostingClassifier
GB_model=GradientBoostingClassifier(n_estimators=100,random_state=1)
GB_model.fit(X_train,y_train)

In [451]:
y_train_predict = GB_model.predict(X_train)
model_score = GB_model.score(X_train,y_train)
print(model_score)
print()
print(metrics.confusion_matrix (y_train,y_train_predict) )
print(metrics.classification_report(y_train,y_train_predict))

0.9050938775510204

[[105748  16818]
 [  6434 116000]]
              precision    recall  f1-score   support

           0       0.94      0.86      0.90    122566
           1       0.87      0.95      0.91    122434

    accuracy                           0.91    245000
   macro avg       0.91      0.91      0.90    245000
weighted avg       0.91      0.91      0.90    245000



In [452]:
y_test_predict =GB_model.predict(X_test)
model_score = GB_model.score(X_test,y_test)
print(model_score)
print()
print(metrics.confusion_matrix (y_test,y_test_predict) )
print(metrics.classification_report(y_test,y_test_predict))

0.9039904761904762

[[45171  7263]
 [ 2818 49748]]
              precision    recall  f1-score   support

           0       0.94      0.86      0.90     52434
           1       0.87      0.95      0.91     52566

    accuracy                           0.90    105000
   macro avg       0.91      0.90      0.90    105000
weighted avg       0.91      0.90      0.90    105000



# Hyperparameter Optimization

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'n_estimators': [50, 100,150, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7,9],
    'min_samples_split': [2, 5, 10,15],
    'min_samples_leaf': [1, 2, 4,6]
}

In [None]:
grid_search = GridSearchCV(estimator=GB_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
y_train_predict = best_model.predict(X_train)
model_score = best_model.score(X_train,y_train)
print(model_score)
print()
print(metrics.confusion_matrix (y_train,y_train_predict) )
print(metrics.classification_report(y_train,y_train_predict))

In [None]:
y_test_predict =best_model.predict(X_test)
model_score = best_model.score(X_test,y_test)
print(model_score)
print()
print(metrics.confusion_matrix (y_test,y_test_predict) )
print(metrics.classification_report(y_test,y_test_predict))

# Evaluating Model Performance on a Hold-out Set of 55k Unseen Data Points

In [None]:
df_t=df_test.drop(['Study Status'],axis=1)

In [None]:
dy = df_test['Study Status']

In [None]:
new_predictions = best_model.predict(df_t) 

In [None]:
model_score = best_model.score(df_t,dy)
print(model_score)
print()
print(metrics.confusion_matrix (dy,new_predictions) )
print(metrics.classification_report(dy,new_predictions))

In [None]:
import joblib

In [None]:

joblib.dump(best_model, 'gradient_boosting_model.pkl') 

In [None]:
best_model = joblib.load('gradient_boosting_model.pkl')

In [None]:
from joblib import dump

In [None]:
dump(best_model, 'gbc_model.joblib')