In [1]:
import joblib
import numpy as np
import pandas as pd

In [2]:
df = pd.read_excel("Clinical-samples-test-data.xlsx")

In [3]:
file1 = pd.read_excel("russel_lab_AA_db.xlsx")

In [4]:
# Merge the dataframes on 'Main_AA' and 'Sub_AA' columns using a left join
merged_df = pd.merge(df, file1, on=['Main_AA', 'Sub_AA'], how='left', suffixes=('_x', '_y'))

def handle_unknown_aa(row):
    # Ensure 'Unknown' is detected even if there are leading/trailing spaces
    if 'Unknown' in str(row['Main_AA']).strip() or 'Unknown' in str(row['Sub_AA']).strip():
        row['Substitution_Type'] = 'NA'
        row['Score'] = 'NA'
    return row

# Apply the function to set Substitution_Type and Score to 'NA' if Sub_AA contains "Unknown"
merged_df = merged_df.apply(handle_unknown_aa, axis=1)

# Add the 'Substitution_Pref' column based on the prefix in 'Substitution_Type'
def get_substitution_pref(sub_type):
    if pd.isnull(sub_type) or sub_type == 'NA':
        return 'NA'
    if sub_type.startswith('APT'):
        return 'All protein types'  # Only keep APT substitutions
    return 'NA'  # Default to 'NA' for any non-APT type

merged_df['Substitution_Pref'] = merged_df['Substitution_Type'].apply(get_substitution_pref)

# Add the 'Substitution_Nature' column based on the suffix in 'Substitution_Type'
def get_substitution_nature(sub_type):
    if pd.isnull(sub_type) or sub_type == 'NA':
        return 'NA'
    if sub_type.endswith('DF'):
        return 'Disfavoured'
    elif sub_type.endswith('F'):
        return 'Favoured'
    elif sub_type.endswith('N'):
        return 'Neutral'
    else:
        return 'NA'

merged_df['Substitution_Nature'] = merged_df['Substitution_Type'].apply(get_substitution_nature)

In [5]:
# Strip extra spaces from the values in the 'clinvar: Clinvar' column
merged_df['clinvar:_Clinvar'] = merged_df['clinvar:_Clinvar'].astype(str).str.strip()

# Rearrangement of clinvar column
# values to be replaced and the new value
replace_dict = {
    'clinvar: UNK': 'VUS',
    'clinvar: other': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_other': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_association': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_risk_factor': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_other,_risk_factor': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_association,_other,_risk_factor': 'VUS',
    'clinvar: Uncertain_significance|drug_response': 'VUS',
    'clinvar: Uncertain_significance': 'VUS',
    'clinvar: Pathogenic/Likely_pathogenic': 'Pathogenic',
    'clinvar: Pathogenic,_other,_risk_factor': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic,_risk_factor': 'Pathogenic',
    'clinvar: Pathogenic,_risk_factor': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic,_other': 'Pathogenic',
    'clinvar: drug_response': 'Pathogenic',
    'clinvar: Pathogenic': 'Pathogenic',
    'clinvar: Likely_pathogenic': 'Pathogenic',
    'clinvar: Likely_pathogenic,_risk_factor': 'Pathogenic',
    'clinvar: Pathogenic|drug_response|other': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic|drug_response': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic|other': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic|association|other': 'Pathogenic',
    'clinvar: Likely_pathogenic|risk_factor': 'Pathogenic',
    'clinvar: Benign': 'Benign',
    'clinvar: Likely_benign': 'Benign',
    'clinvar: not_provided': 'VUS',
    'clinvar: Benign/Likely_benign': 'Benign',
}

# Replace the values in the 'clinvar' column
merged_df['clinvar:_Clinvar'] = merged_df['clinvar:_Clinvar'].replace(replace_dict)

In [6]:
# Strip extra spaces from the values in the 'clinvar: Clinvar' column
merged_df['InterVar_automated'] = merged_df['InterVar_automated'].astype(str).str.strip()

# Rearrangement of clinvar column
# values to be replaced and the new value
replace_dict = {
'PATHOGENIC' : 'Pathogenic',
'LIKELY_PATHOGENIC': 'Pathogenic',
'LIKELY_BENIGN': 'Benign',
'Likely_benign': 'Benign',
'BENIGN': 'Benign',
'UNCERTAIN_SIGNIFICANCE': 'VUS',
'Uncertain_significance': 'VUS',
'.': 'VUS'

}

# Replace the values in the 'clinvar' column
merged_df['InterVar_automated'] = merged_df['InterVar_automated'].replace(replace_dict)

In [7]:
# Step 2: Replace spaces in column names with underscores
merged_df.columns = merged_df.columns.str.replace(' ', '_')

In [8]:
# Remove rows where REF_x or ALT_x has a value of "."
df = merged_df[(merged_df['REF_x'] != '.') & (merged_df['ALT_x'] != '.')]

In [9]:
df_sift = pd.get_dummies(merged_df.SIFT_pred)

In [10]:
# Rename columns
df_sift.rename(columns={
    'D': 'SIFT_Deleterious',
    'T': 'SIFT_Tolerated',
    '.': 'SIFT_Unknown'
}, inplace=True)

In [11]:
merged = pd.concat([merged_df,df_sift],axis = 'columns')

In [12]:
df_polyphen = pd.get_dummies(merged_df.Polyphen2_HVAR_pred)

In [13]:
# Rename columns
df_polyphen.rename(columns={
    'D': 'Polyphen_Damaging',
    'B': 'Polyphen_Benign',
    '.': 'Polyphen_Unknown',
    'P': 'Polyphen_Possibly_damaging'
}, inplace=True)

In [14]:
merged = pd.concat([merged,df_polyphen],axis = 'columns')

In [15]:
df_MT = pd.get_dummies(merged.MutationTaster_pred)

In [16]:
# Rename columns
df_MT.rename(columns={
    'A': 'MutationTaster_Known_deleterious',
    'N': 'MutationTaster_Probably_harmless',
    'P': 'MutationTaster_Knowntobeharmless',
    '.': 'MutationTaster_Unknown',
    'D': 'MutationTaster_Probably_deleterious'
}, inplace=True)

In [17]:
merged = pd.concat([merged,df_MT],axis = 'columns')

In [18]:
final =  merged.drop(['SIFT_pred','Polyphen2_HVAR_pred','MutationTaster_pred'], axis = 'columns')

In [19]:
clin = pd.get_dummies(final['clinvar:_Clinvar'])

In [20]:
# Rename columns
clin.rename(columns={
    'Benign': 'Clinvar_Benign',
    'Pathogenic': 'Clinvar_Pathogenic',
    'VUS': 'Clinvar_VUS'
}, inplace=True)

In [21]:
merged1 = pd.concat([final,clin],axis = 'columns')

In [22]:
intervar = pd.get_dummies(merged1['InterVar_automated'])

In [23]:
# Rename columns
intervar.rename(columns={
    'Benign': 'Intervar_Benign',
    'Pathogenic': 'Intervar_Pathogenic',
    'VUS': 'Intervar_VUS'
}, inplace=True)

In [24]:
merged1 = pd.concat([merged1,intervar],axis = 'columns')

In [25]:
final1 =  merged1.drop(['clinvar:_Clinvar', 'InterVar_automated'], axis = 'columns')

In [26]:
rl_sub_pref = pd.get_dummies(final1['Substitution_Pref'])

In [41]:
rl_sub_pref

Unnamed: 0,Substitution_Pref_All protein types,Substitution_Pref_NA
0,False,True
1,False,True
2,False,True
3,False,True
4,False,True
...,...,...
302,True,False
303,False,True
304,True,False
305,True,False


In [27]:
# Rename columns
rl_sub_pref.rename(columns={
    'All protein types': 'Substitution_Pref_All protein types',
    #'Extracellular proteins': 'RL_Extracellular_proteins',
    #'Intracellular proteins': 'RL_Intracellular_proteins',
    #'Membrane proteins': 'RL_Membrane_proteins',
    'NA': 'Substitution_Pref_NA'
}, inplace=True)

In [28]:
merged_RL1 = pd.concat([final1,rl_sub_pref],axis = 'columns')

In [29]:
merged_RL1 =  merged_RL1.drop(['Substitution_Type', 'Score', 'Substitution_Pref'], axis = 'columns')

In [30]:
rl_sub_nature = pd.get_dummies(merged_RL1['Substitution_Nature'])

In [31]:
# Rename columns
rl_sub_nature.rename(columns={
    'Disfavoured': 'Substitution_Nature_Disfavoured',
    'Favoured': 'Substitution_Nature_Favoured',
    'NA': 'Substitution_Nature_NA',
    'Neutral': 'Substitution_Nature_Neutral'
}, inplace=True)

In [32]:
merged_RL2 = pd.concat([merged_RL1,rl_sub_nature],axis = 'columns')

In [33]:
merged_RL2 =  merged_RL2.drop(['Substitution_Nature'], axis = 'columns')

In [35]:
# Normalize the 'ExonicFunc.ensGene' column to lowercase
merged_RL2['ExonicFunc.ensGene'] = merged_RL2['ExonicFunc.ensGene'].str.lower()

In [36]:
# Example: One-hot encoding for 'Ref.Gene' if it is a categorical column
merged_RL2 = pd.get_dummies(merged_RL2, columns=['Func.ensGene', 'ExonicFunc.ensGene','Ref.Gene'])

In [37]:
merged_RL2 = merged_RL2.drop(['CHROM_x','POS_x', 'End_x', 'REF_x','ALT_x','AA_Change'], axis = 'columns')

In [38]:
# Example: One-hot encoding for 'Ref.Gene' if it is a categorical column
merged_RL2 = pd.get_dummies(merged_RL2, columns=['Main_AA', 'Sub_AA'])

In [39]:
# Convert boolean columns to integer (0, 1)
bool_cols = merged_RL2.select_dtypes(include='bool').columns
merged_RL2[bool_cols] = merged_RL2[bool_cols].astype(int)

In [45]:
# Print all column names with their data types
for col in merged_RL2.columns:
    print(f"{col}: {merged_RL2[col].dtype}")

SIFT_Unknown: int32
SIFT_Deleterious: int32
SIFT_Tolerated: int32
Polyphen_Unknown: int32
Polyphen_Benign: int32
Polyphen_Damaging: int32
Polyphen_Possibly_damaging: int32
MutationTaster_Unknown: int32
MutationTaster_Known_deleterious: int32
MutationTaster_Probably_deleterious: int32
MutationTaster_Probably_harmless: int32
Clinvar_Benign: int32
Clinvar_Pathogenic: int32
Clinvar_VUS: int32
Intervar_Benign: int32
Intervar_Pathogenic: int32
Intervar_VUS: int32
Substitution_Pref_All protein types: int32
Substitution_Pref_NA: int32
Substitution_Nature_Disfavoured: int32
Substitution_Nature_Favoured: int32
Substitution_Nature_NA: int32
Substitution_Nature_Neutral: int32
Func.ensGene_exonic: int32
Func.ensGene_splicing: int32
ExonicFunc.ensGene_.: int32
ExonicFunc.ensGene_frameshift deletion: int32
ExonicFunc.ensGene_frameshift insertion: int32
ExonicFunc.ensGene_nonframeshift deletion: int32
ExonicFunc.ensGene_nonframeshift insertion: int32
ExonicFunc.ensGene_nonsynonymous snv: int32
ExonicF

In [46]:
trained_columns

['SIFT_Unknown',
 'SIFT_Deleterious',
 'SIFT_Tolerated',
 'Polyphen_Unknown',
 'Polyphen_Benign',
 'Polyphen_Damaging',
 'Polyphen_Possibly_damaging',
 'MutationTaster_Unknown',
 'MutationTaster_Known_deleterious',
 'MutationTaster_Probably_deleterious',
 'MutationTaster_Probably_harmless',
 'MutationTaster_Knowntobeharmless',
 'Clinvar_Benign',
 'Clinvar_Pathogenic',
 'Clinvar_VUS',
 'Intervar_Benign',
 'Intervar_Pathogenic',
 'Intervar_VUS',
 'RL_All_Protein_types',
 'RL_Disfavoured',
 'RL_Favoured',
 'NA_RL',
 'RL_Neutral',
 'Func.ensGene_UTR3',
 'Func.ensGene_UTR5',
 'Func.ensGene_exonic',
 'Func.ensGene_exonic;splicing',
 'Func.ensGene_intronic',
 'Func.ensGene_ncRNA_intronic',
 'Func.ensGene_splicing',
 'ExonicFunc.ensGene_.',
 'ExonicFunc.ensGene_frameshift deletion',
 'ExonicFunc.ensGene_frameshift insertion',
 'ExonicFunc.ensGene_nonframeshift deletion',
 'ExonicFunc.ensGene_nonframeshift insertion',
 'ExonicFunc.ensGene_nonframeshift substitution',
 'ExonicFunc.ensGene_nonsyn

In [43]:
# Load the trained model
model_LR = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/Target-2/LR_model.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/Target-1/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_LR.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with highest probability)
predictions = model_LR.predict(test_data)

Missing columns: ['MutationTaster_Knowntobeharmless', 'Func.ensGene_UTR3', 'Func.ensGene_UTR5', 'Func.ensGene_exonic;splicing', 'Func.ensGene_intronic', 'Func.ensGene_ncRNA_intronic', 'ExonicFunc.ensGene_nonframeshift substitution', 'ExonicFunc.ensGene_stoploss', 'ExonicFunc.ensGene_synonymous snv']
Extra columns: ['RL_NA', 'Ref.Gene_ALK', 'Ref.Gene_APC', 'Ref.Gene_AR', 'Ref.Gene_ATM', 'Ref.Gene_BARD1', 'Ref.Gene_BMPR1A', 'Ref.Gene_BRAF', 'Ref.Gene_BRCA1', 'Ref.Gene_BRCA2', 'Ref.Gene_BRIP1', 'Ref.Gene_CDK12', 'Ref.Gene_CDKN2A', 'Ref.Gene_CHEK1', 'Ref.Gene_CHEK2', 'Ref.Gene_CTNNB1', 'Ref.Gene_EGFR', 'Ref.Gene_ERBB2', 'Ref.Gene_FANCL', 'Ref.Gene_FGFR1', 'Ref.Gene_FGFR2', 'Ref.Gene_FGFR3', 'Ref.Gene_GAPDH', 'Ref.Gene_IDH2', 'Ref.Gene_JAK2', 'Ref.Gene_KIT', 'Ref.Gene_KRAS', 'Ref.Gene_MAP2K2', 'Ref.Gene_MDM2', 'Ref.Gene_MET', 'Ref.Gene_MLH1', 'Ref.Gene_MLH3', 'Ref.Gene_MSH2', 'Ref.Gene_MSH6', 'Ref.Gene_MUTYH', 'Ref.Gene_NTRK1', 'Ref.Gene_PALB2', 'Ref.Gene_PDGFRA', 'Ref.Gene_PIK3CA', 'Ref.Ge

  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0


In [40]:
# Load the trained model
model_LR = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/Target-2/LR_model.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/Target-1/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_LR.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with highest probability)
predictions = model_LR.predict(test_data)

Missing columns: ['MutationTaster_Knowntobeharmless', 'RL_All_Protein_types', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Func.ensGene_UTR3', 'Func.ensGene_UTR5', 'Func.ensGene_exonic;splicing', 'Func.ensGene_intronic', 'Func.ensGene_ncRNA_intronic', 'ExonicFunc.ensGene_nonframeshift substitution', 'ExonicFunc.ensGene_stoploss', 'ExonicFunc.ensGene_synonymous snv']
Extra columns: ['Substitution_Pref_All protein types', 'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured', 'Substitution_Nature_Favoured', 'Substitution_Nature_NA', 'Substitution_Nature_Neutral', 'Ref.Gene_ALK', 'Ref.Gene_APC', 'Ref.Gene_AR', 'Ref.Gene_ATM', 'Ref.Gene_BARD1', 'Ref.Gene_BMPR1A', 'Ref.Gene_BRAF', 'Ref.Gene_BRCA1', 'Ref.Gene_BRCA2', 'Ref.Gene_BRIP1', 'Ref.Gene_CDK12', 'Ref.Gene_CDKN2A', 'Ref.Gene_CHEK1', 'Ref.Gene_CHEK2', 'Ref.Gene_CTNNB1', 'Ref.Gene_EGFR', 'Ref.Gene_ERBB2', 'Ref.Gene_FANCL', 'Ref.Gene_FGFR1', 'Ref.Gene_FGFR2', 'Ref.Gene_FGFR3', 'Ref.Gene_GAPDH', 'Ref.Gene_IDH2', 'Ref.Gene_J

  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0
  merged_RL2[col] = 0


In [None]:
# Step 3: If predictions are one-hot encoded, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the predicted class (highest probability)

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # pred is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_LR.xlsx", index=False) 

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

In [43]:
# Load the trained model
model_LR_smote = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/LR_model_smote.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_LR_smote.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with highest probability)
predictions = model_LR_smote.predict(test_data)

# Step 3: If predictions are one-hot encoded, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the predicted class (highest probability)

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # pred is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_LR_smote.xlsx", index=False) 

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic               0.999741
1          Benign               0.908803
2          Benign               0.956938
3             VUS               0.647669
4          Benign               0.998262


In [44]:
# Load the trained model
model_KNN = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/KNN_model.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_KNN.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with highest probability)
predictions = model_KNN.predict(test_data)

# Check the shape of predictions to see if it's a one-hot encoded array
print("Predictions shape:", predictions.shape)  # Debugging line
print("Predictions sample:", predictions[:5])  # Debugging line

# Step 3: If predictions are one-hot encoded, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the predicted class (highest probability)

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # `pred` is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_KNN.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)
Predictions shape: (307, 3)
Predictions sample: [[0 1 0]
 [1 0 0]
 [1 0 0]
 [0 0 1]
 [1 0 0]]


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic                    1.0
1          Benign                    1.0
2          Benign                    1.0
3             VUS                    1.0
4          Benign                    1.0


In [46]:
# Load the trained model
model_KNN_smote = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/KNN_model_smote.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_KNN_smote.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with highest probability)
predictions = model_KNN_smote.predict(test_data)

# Check the shape of predictions to see if it's a one-hot encoded array
print("Predictions shape:", predictions.shape)  # Debugging line
print("Predictions sample:", predictions[:5])  # Debugging line

# Step 3: If predictions are one-hot encoded, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the predicted class (highest probability)

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # `pred` is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_KNN_smote_prob.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)
Predictions shape: (307, 3)
Predictions sample: [[0 1 0]
 [1 0 0]
 [1 0 0]
 [0 0 1]
 [1 0 0]]


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic                    1.0
1          Benign                    1.0
2          Benign                    1.0
3             VUS                    1.0
4          Benign                    1.0


In [47]:
model_DT = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/DT_model.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_DT.predict_proba(test_data)

# Step 2: Make predictions for the classes (class labels directly)
predictions = model_DT.predict(test_data)

# Check the shape of predictions to see if it's a one-hot encoded array
print("Predictions shape:", predictions.shape)  # Debugging line
print("Predictions sample:", predictions[:5])  # Debugging line

# Step 3: If predictions are one-hot encoded, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the predicted class (highest probability)

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # `pred` is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_DT.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)
Predictions shape: (307, 3)
Predictions sample: [[0 1 0]
 [1 0 0]
 [1 0 0]
 [1 0 1]
 [1 0 0]]


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic                    1.0
1          Benign                    1.0
2          Benign                    1.0
3          Benign                    1.0
4          Benign                    1.0


In [48]:
model_DT_smote = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/DT_model_with_smote.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_DT_smote.predict_proba(test_data)

# Step 2: Make predictions for the classes (class labels directly)
predictions = model_DT_smote.predict(test_data)

# Check the shape of predictions to see if it's a one-hot encoded array
print("Predictions shape:", predictions.shape)  # Debugging line
print("Predictions sample:", predictions[:5])  # Debugging line

# Step 3: If predictions are one-hot encoded, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the predicted class (highest probability)

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # `pred` is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_DT_smote_prob.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)
Predictions shape: (307, 3)
Predictions sample: [[0 1 0]
 [1 0 0]
 [1 0 0]
 [1 0 1]
 [1 0 0]]


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic                    1.0
1          Benign                    1.0
2          Benign                    1.0
3          Benign                    1.0
4          Benign                    1.0


In [49]:
model_RF = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/RF_model.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_RF.predict_proba(test_data)

# Step 2: Make predictions for the classes (class labels directly)
predictions = model_RF.predict(test_data)

# Step 3: If predictions are in a one-hot encoded format, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the highest probability for each sample

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # `pred` is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_RF.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic               0.880000
1          Benign               1.000000
2          Benign               0.756000
3             VUS               0.636667
4          Benign               1.000000


In [50]:
model_RF_smote = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/RF_model_with_smote.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = model_RF_smote.predict_proba(test_data)

# Step 2: Make predictions for the classes (class labels directly)
predictions = model_RF_smote.predict(test_data)

# Step 3: If predictions are in a one-hot encoded format, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the highest probability for each sample

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # `pred` is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_RF_smote_prob.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic                   0.96
1          Benign                   0.70
2          Benign                   0.44
3             VUS                   0.95
4          Benign                   0.78


In [51]:
XGB_model = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/XGB_model.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = XGB_model.predict_proba(test_data)

# Step 2: Make predictions for the classes (class labels directly)
predictions = XGB_model.predict(test_data)

# Step 3: If predictions are in a one-hot encoded format, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the highest probability for each sample

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # `pred` is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_XGB.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic               0.981742
1          Benign               0.993754
2          Benign               0.995632
3             VUS               0.829984
4          Benign               0.999189


In [52]:
XGB_smote = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/XGB_with_smote.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = XGB_smote.predict_proba(test_data)

# Step 2: Make predictions for the classes (class labels directly)
predictions = XGB_smote.predict(test_data)

# Step 3: If predictions are in a one-hot encoded format, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the highest probability for each sample

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # `pred` is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_XGB_smote_prob.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic               0.994192
1          Benign               0.857237
2          Benign               0.974760
3             VUS               0.963470
4          Benign               0.973427


In [53]:
LGB_model = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/LGB_model.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = LGB_model.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with highest probability)
predictions = LGB_model.predict(test_data)

# Step 3: If predictions are in a 2D array (one-hot encoded), use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the highest probability for each sample

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_LGB.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic               0.984495
1          Benign               0.995016
2          Benign               0.996869
3             VUS               0.874930
4          Benign               0.996667


In [54]:
LGB_smote = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/LGB_with_smote.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained model
pred_probabilities = LGB_smote.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with highest probability)
predictions = LGB_smote.predict(test_data)

# Step 3: If predictions are in a 2D array (one-hot encoded), use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the highest probability for each sample

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_LGB_smote_prob.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic               0.995881
1          Benign               0.939353
2          Benign               0.978145
3             VUS               0.901472
4          Benign               0.991896


In [55]:
CB_model = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/CB_model.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained CatBoost model
pred_probabilities = CB_model.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with the highest probability)
predictions = CB_model.predict(test_data)

# Step 3: If predictions are in a 2D array (one-hot encoded), use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the highest probability for each sample

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_CB.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic               0.970503
1          Benign               0.993654
2          Benign               0.977910
3          Benign               0.202583
4          Benign               0.998388


In [56]:
CB_smote = joblib.load('C:/Users/HP/Downloads/TID-Work/saved_models/CB_with_smote.pkl')

import json
# Load the trained columns
with open('C:/Users/HP/Downloads/TID-Work/saved_models/trained_columns.json', 'r') as f:
    trained_columns = json.load(f)

# Check missing and extra columns
missing_columns = [col for col in trained_columns if col not in merged_RL2.columns]
extra_columns = [col for col in merged_RL2.columns if col not in trained_columns]

print("Missing columns:", missing_columns)
print("Extra columns:", extra_columns)

# Add missing columns with default values (e.g., 0)
for col in missing_columns:
    merged_RL2[col] = 0

# Drop extra columns
test_data = merged_RL2.drop(columns=extra_columns)

# Reorder columns to match trained model's feature order
test_data = test_data[trained_columns]

# Verify columns match before prediction
print("Columns in test data after alignment:", test_data.columns)

# Step 1: Get prediction probabilities using the trained CatBoost model
pred_probabilities = CB_smote.predict_proba(test_data)

# Step 2: Make predictions for the classes (get the class index with the highest probability)
predictions = CB_smote.predict(test_data)

# Step 3: If predictions are in a 2D array (one-hot encoded), use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the highest probability for each sample

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

# Step 7: Output the results to Excel
test_data.to_excel("Predicted_Classes_for_Test_Data_CAT_smote_prob.xlsx", index=False)

# Optionally, display the predictions with probabilities
print(test_data[['Predicted_Class', 'Predicted_Probability']].head())

Missing columns: []
Extra columns: ['RL_All_Protein_types', 'RL_NA', 'RL_Disfavoured', 'RL_Favoured', 'NA_RL', 'RL_Neutral', 'Sub_AA_ALa']
Columns in test data after alignment: Index(['SIFT_Unknown', 'SIFT_Deleterious', 'SIFT_Tolerated',
       'Polyphen_Unknown', 'Polyphen_Benign', 'Polyphen_Damaging',
       'Polyphen_Possibly_damaging', 'MutationTaster_Unknown',
       'MutationTaster_Known_deleterious',
       'MutationTaster_Probably_deleterious',
       ...
       'Sub_AA_Unknown', 'Sub_AA_Val', 'Sub_AA_delins_Unknown',
       'Sub_AA_fs_Unknown', 'Substitution_Pref_All protein types',
       'Substitution_Pref_NA', 'Substitution_Nature_Disfavoured',
       'Substitution_Nature_Favoured', 'Substitution_Nature_NA',
       'Substitution_Nature_Neutral'],
      dtype='object', length=165)


  test_data['Predicted_Class'] = predicted_classes
  test_data['Predicted_Probability'] = predicted_probabilities


  Predicted_Class  Predicted_Probability
0      Pathogenic               0.999884
1          Benign               0.723974
2          Benign               0.807591
3             VUS               0.969499
4          Benign               0.986065
