In [1]:
import joblib
import re
import glob
import numpy as np
import pandas as pd
import json

In [3]:
# Load the first Excel file ending with 'FENG.xlsx'
feng_files = glob.glob("*FENG.xlsx")
if not feng_files:
    raise FileNotFoundError("No file ending with 'FENG.xlsx' found in the current directory.")
df = pd.read_excel(feng_files[0])
print(f"Loaded file: {feng_files[0]}")

# Clean column names
df.columns = df.columns.str.strip().str.replace(" ", "_", regex=False)

# Specify columns to keep
columns_to_keep = [
    "CHROM_x", "POS_x", "End_x", "REF_x", "ALT_x", "Ref.Gene", "Func.ensGene", "ExonicFunc.ensGene",
    "clinvar:_Clinvar", "InterVar_automated", "SIFT_pred", "Polyphen2_HVAR_pred", "MutationTaster_pred",
    "ensemble_value", "AAChange.ensGene"
]

# Filter the dataframe
df1 = df[columns_to_keep]

# Extract the last part of AAChange.ensGene based on ensemble_value
def extract_last_instance(row):
    if pd.isna(row["AAChange.ensGene"]) or row["AAChange.ensGene"] == ".":
        return "Unknown"
    matches = [part for part in row["AAChange.ensGene"].split(",") if row["ensemble_value"] in part]
    return matches[0].split(":")[-1] if matches else "Unknown"

df1["AA_Change"] = df1.apply(extract_last_instance, axis=1)

# Drop unnecessary columns
df1.drop(columns=["ensemble_value", "AAChange.ensGene"], inplace=True)

# Extract Main_AA and Sub_AA dynamically
def extract_aa_change(aa_change):
    if aa_change == "Unknown" or pd.isna(aa_change) or aa_change == ".":
        return "Unknown", "Unknown"
    match = re.match(r"p\.([A-Za-z]+)(\d+)([A-Za-z]+|fs|X|del|ins|delins)?", aa_change)
    if match:
        main_aa = match.group(1)
        sub_aa = match.group(3) if match.group(3) else "Unknown"
        return main_aa, sub_aa
    return "Unknown", "Unknown"

df1[["Main_AA", "Sub_AA"]] = df1["AA_Change"].apply(lambda x: pd.Series(extract_aa_change(x)))

# Replace specific terms in Main_AA and Sub_AA
def replace_terms(aa):
    replacements = {"Ter": "Ter_Unknown", "del": "Del_Unknown", "X": "Unknown", "fs": "fs_Unknown"}
    return replacements.get(aa, aa)

df1["Main_AA"] = df1["Main_AA"].apply(replace_terms)
df1["Sub_AA"] = df1["Sub_AA"].apply(replace_terms)

# Merge the dataframes on 'Main_AA' and 'Sub_AA' columns using a left join
file1 = pd.read_excel("russel_lab_AA_db.xlsx")
merged_df = pd.merge(df1, file1, on=['Main_AA', 'Sub_AA'], how='left', suffixes=('_x', '_y'))

# Apply custom transformations for 'Substitution_Type', 'Score', and 'Substitution_Pref'
def handle_unknown_aa(row):
    if 'Unknown' in str(row['Main_AA']).strip() or 'Unknown' in str(row['Sub_AA']).strip():
        row['Substitution_Type'] = 'NA'
        row['Score'] = 'NA'
    return row

# Add the 'Substitution_Pref' column based on the prefix in 'Substitution_Type'
def get_substitution_pref(sub_type):
    if pd.isnull(sub_type) or sub_type == 'NA':
        return 'NA'
    if sub_type.startswith('APT'):
        return 'All protein types'  # Only keep APT substitutions
    return 'NA'  # Default to 'NA' for any non-APT type

# Add the 'Substitution_Nature' column based on the suffix in 'Substitution_Type'
def get_substitution_nature(sub_type):
    if pd.isnull(sub_type) or sub_type == 'NA':
        return 'NA'
    if sub_type.endswith('DF'):
        return 'Disfavoured'
    elif sub_type.endswith('F'):
        return 'Favoured'
    elif sub_type.endswith('N'):
        return 'Neutral'
    else:
        return 'NA'

merged_df = merged_df.apply(handle_unknown_aa, axis=1)
merged_df['Substitution_Pref'] = merged_df['Substitution_Type'].apply(get_substitution_pref)
merged_df['Substitution_Nature'] = merged_df['Substitution_Type'].apply(get_substitution_nature)

# Helper function to strip spaces
def strip_and_replace(df, column, replace_dict=None):
    df[column] = df[column].astype(str).str.strip()
    if replace_dict:
        df[column] = df[column].replace(replace_dict)

# Strip extra spaces and replace values
replace_dict_clinvar = {
    'clinvar: UNK': 'VUS',
    'clinvar: other': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_other': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_association': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_risk_factor': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_other,_risk_factor': 'VUS',
    'clinvar: Conflicting_interpretations_of_pathogenicity,_association,_other,_risk_factor': 'VUS',
    'clinvar: Uncertain_significance|drug_response': 'VUS',
    'clinvar: Uncertain_significance': 'VUS',
    'clinvar: Pathogenic/Likely_pathogenic': 'Pathogenic',
    'clinvar: Pathogenic,_other,_risk_factor': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic,_risk_factor': 'Pathogenic',
    'clinvar: Pathogenic,_risk_factor': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic,_other': 'Pathogenic',
    'clinvar: drug_response': 'Pathogenic',
    'clinvar: Pathogenic': 'Pathogenic',
    'clinvar: Likely_pathogenic': 'Pathogenic',
    'clinvar: Likely_pathogenic,_risk_factor': 'Pathogenic',
    'clinvar: Pathogenic|drug_response|other': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic|drug_response': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic|other': 'Pathogenic',
    'clinvar: Pathogenic/Likely_pathogenic|association|other': 'Pathogenic',
    'clinvar: Likely_pathogenic|risk_factor': 'Pathogenic',
    'clinvar: Benign': 'Benign',
    'clinvar: Likely_benign': 'Benign',
    'clinvar: not_provided': 'VUS',
    'clinvar: Benign/Likely_benign': 'Benign',
}

replace_dict_intervar = {
    'PATHOGENIC': 'Pathogenic',
    'LIKELY_PATHOGENIC': 'Pathogenic',
    'LIKELY_BENIGN': 'Benign',
    'Likely_benign': 'Benign',
    'BENIGN': 'Benign',
    'UNCERTAIN_SIGNIFICANCE': 'VUS',
    'Uncertain_significance': 'VUS',
    '.': 'VUS'
}

# Apply stripping and replacement
strip_and_replace(merged_df, 'clinvar:_Clinvar', replace_dict_clinvar)
strip_and_replace(merged_df, 'InterVar_automated', replace_dict_intervar)

# Replace spaces in column names with underscores
merged_df.columns = merged_df.columns.str.replace(' ', '_')

# Apply one-hot encoding and column renaming
def encode_and_rename(df, column, rename_dict):
    encoded = pd.get_dummies(df[column])
    encoded.rename(columns=rename_dict, inplace=True)
    return pd.concat([df, encoded], axis='columns').drop([column], axis='columns')

# Example for SIFT, Polyphen, MutationTaster, etc.
sift_rename = {'D': 'SIFT_Deleterious', 'T': 'SIFT_Tolerated', '.': 'SIFT_Unknown'}
merged_df = encode_and_rename(merged_df, 'SIFT_pred', sift_rename)

polyphen_rename = {'D': 'Polyphen_Damaging', 'B': 'Polyphen_Benign', '.': 'Polyphen_Unknown', 'P': 'Polyphen_Possibly_damaging'}
merged_df = encode_and_rename(merged_df, 'Polyphen2_HVAR_pred', polyphen_rename)

mt_rename = {'A': 'MutationTaster_Known_deleterious', 'N': 'MutationTaster_Probably_harmless', 'P': 'MutationTaster_Knowntobeharmless', '.': 'MutationTaster_Unknown', 'D': 'MutationTaster_Probably_deleterious'}
merged_df = encode_and_rename(merged_df, 'MutationTaster_pred', mt_rename)

clinvar_rename = {'Benign': 'Clinvar_Benign', 'Pathogenic': 'Clinvar_Pathogenic', 'VUS': 'Clinvar_VUS'}
merged_df = encode_and_rename(merged_df, 'clinvar:_Clinvar', clinvar_rename)

intervar_rename = {'Benign': 'Intervar_Benign', 'Pathogenic': 'Intervar_Pathogenic', 'VUS': 'Intervar_VUS'}
merged_df = encode_and_rename(merged_df, 'InterVar_automated', intervar_rename)

# One-hot encode and clean 'Substitution_Pref' and 'Substitution_Nature'
sub_pref_rename = {'All protein types': 'Substitution_Pref_All_protein_types', 'NA': 'Substitution_Pref_NA'}
merged_df = encode_and_rename(merged_df, 'Substitution_Pref', sub_pref_rename)

sub_nature_rename = {'Disfavoured': 'Substitution_Nature_Disfavoured', 'Favoured': 'Substitution_Nature_Favoured', 'NA': 'Substitution_Nature_NA', 'Neutral': 'Substitution_Nature_Neutral'}
merged_df = encode_and_rename(merged_df, 'Substitution_Nature', sub_nature_rename)

# Normalize the 'ExonicFunc.ensGene' column to lowercase
merged_df['ExonicFunc.ensGene'] = merged_df['ExonicFunc.ensGene'].str.lower()

# One-hot encoding for categorical columns
merged_df = pd.get_dummies(merged_df, columns=['Func.ensGene', 'ExonicFunc.ensGene', 'Ref.Gene', 'Main_AA', 'Sub_AA'])

# Convert boolean columns to integer (0, 1)
bool_cols = merged_df.select_dtypes(include='bool').columns
merged_df[bool_cols] = merged_df[bool_cols].astype(int)

# Drop unnecessary columns
merged_df = merged_df.drop(['CHROM_x', 'POS_x', 'End_x', 'REF_x', 'ALT_x', 'AA_Change', 'Substitution_Type', 'Score'], axis='columns')

# Load the Logistic regression trained model and columns
model_path = 'C:/Users/HP/Downloads/Variant-prioritization/saved_models/T1/Target-2/LR_model_smote.pkl'
columns_path = 'C:/Users/HP/Downloads/Variant-prioritization/saved_models/T1/Target-2/trained_columns.json'

try:
    model_LR_smote = joblib.load(model_path)
    with open(columns_path, 'r') as f:
        trained_columns = json.load(f)
except Exception as e:
    print(f"Error loading model or columns: {e}")

# Ensure columns align with the trained model
missing_columns = [col for col in trained_columns if col not in merged_df.columns]
extra_columns = [col for col in merged_df.columns if col not in trained_columns]

# Add missing columns with default values (0) and drop extra columns
for col in missing_columns:
    merged_df[col] = 0
test_data = merged_df.drop(columns=extra_columns)
test_data = test_data[trained_columns]

# Predict using the model
pred_probabilities = model_LR_smote.predict_proba(test_data)
predictions = model_LR_smote.predict(test_data)

# Step 3: If predictions are one-hot encoded, use np.argmax() to get the class index
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)  # Get the index of the predicted class (highest probability)

# Step 4: Get the probabilities for the predicted class
predicted_probabilities = [pred_probabilities[i, int(pred)] for i, pred in enumerate(predictions)]

# Step 5: Map predictions (numeric) to class labels
class_labels = {0: 'Benign', 1: 'Pathogenic', 2: 'VUS'}
predicted_classes = [class_labels[int(pred)] for pred in predictions]  # pred is the predicted class index

# Step 6: Add predictions and probabilities to the test data
test_data['Predicted_Class'] = predicted_classes
test_data['Predicted_Probability'] = predicted_probabilities

df['Predicted_Class'] = test_data['Predicted_Class']
df['Predicted_Probability'] = test_data['Predicted_Probability']

# Define the custom order for the classes
class_order = ['Pathogenic', 'VUS', 'Benign']

# Create a new column 'Class_Order' to assign a numeric value based on the custom order
df['Class_Order'] = df['Predicted_Class'].apply(lambda x: class_order.index(x))

# Sort the dataframe by 'Class_Order' and then by 'Predicted_Probability' in descending order
df_sorted = df.sort_values(by=['Class_Order', 'Predicted_Probability'], ascending=[True, False])

# Drop the 'Class_Order' column if you don't need it in the final output
df_sorted = df_sorted.drop(columns=['Class_Order'])

# Get the input file name without extension
input_file_name = feng_files[0]

# Modify the file name to append '_Predicted' before the extension
output_file_name = input_file_name.replace(".xlsx", "_Predicted.xlsx")

# Save the sorted dataframe with the modified file name
df_sorted.to_excel(output_file_name, index=False)
print(f"Sorted data saved to: {output_file_name}")

# Optionally, display the sorted dataframe
#print(df_sorted[['Predicted_Class', 'Predicted_Probability']].head())

Loaded file: XJWAA-B-cf-FEV2F2both-S1_TID_FENG.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["AA_Change"] = df1.apply(extract_last_instance, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(columns=["ensemble_value", "AAChange.ensGene"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[["Main_AA", "Sub_AA"]] = df1["AA_Change"].apply(lambda x: pd.Series(extract_aa_change(x)))
A value is trying to be set on a cop

Sorted data saved to: XJWAA-B-cf-FEV2F2both-S1_TID_FENG_Predicted.xlsx
