In [3]:
import os
import pandas as pd

# Specify the folder containing your Excel files
data_folder = "Data"

# Define the column names
col_names = ['Case ID', 'Suspect Product Active Ingredients', 'Reason for Use', 'Reactions', 'Serious', 'Outcomes', 'Sex', 'Patient Age', 'Patient Weight']

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame(columns=col_names)

# Iterate over all Excel files in the folder
for file in os.listdir(data_folder):
    if file.endswith(".xlsx"):  # Check if the file is an Excel file
        file_path = os.path.join(data_folder, file)
        # Read the Excel file and add it to the combined DataFrame
        data = pd.read_excel(file_path, usecols=col_names)  # Load only the specified columns
        combined_data = pd.concat([combined_data, data], ignore_index=True)

# Display the combined dataset
print(combined_data)

       Case ID                 Suspect Product Active Ingredients  \
0     24791405                         Semaglutide;Insulin Aspart   
1     24791400                                     Insulin Aspart   
2     24787265  Dextrose Monohydrate\Sodium Chloride;Fluticaso...   
3     24786424        Insulin Degludec\Liraglutide;Insulin Aspart   
4     24786413                       Insulin Human;Insulin Aspart   
...        ...                                                ...   
7964   4610400                                  Insulin Beef/Pork   
7965   4506904  Insulin Beef/Pork;Insulin Pork\Insulin Purifie...   
7966   4457831                                  Insulin Beef/Pork   
7967   4456701                                  Insulin Beef/Pork   
7968   4441093                                  Insulin Beef/Pork   

                                         Reason for Use  \
0                                     Diabetes Mellitus   
1                                     Diabetes Mellit

In [4]:
combined_data.head()

Unnamed: 0,Case ID,Suspect Product Active Ingredients,Reason for Use,Reactions,Serious,Outcomes,Sex,Patient Age,Patient Weight
0,24791405,Semaglutide;Insulin Aspart,Diabetes Mellitus,Constipation;Abdominal Pain Upper;Syncope;Asth...,Serious,Other Outcomes,Male,72 YR,Not Specified
1,24791400,Insulin Aspart,Diabetes Mellitus,Oedema Peripheral;Face Oedema,Serious,Hospitalized,Female,76 YR,Not Specified
2,24787265,Dextrose Monohydrate\Sodium Chloride;Fluticaso...,Anaemia;Analgesic Therapy;Antacid Therapy;Anti...,Multiple Organ Dysfunction Syndrome;Abdominal ...,Serious,Life Threatening;Disabled;Other Outcomes;Died;...,Male,80 YR,69 KG
3,24786424,Insulin Degludec\Liraglutide;Insulin Aspart,Product Used For Unknown Indication,Product Storage Error;Hyperglycaemia;Malaise;D...,Serious,Other Outcomes,Female,1053 MTH,Not Specified
4,24786413,Insulin Human;Insulin Aspart,Diabetes Mellitus;Neoplasm Malignant,Tumour Excision;Spinal Subdural Haemorrhage;Bl...,Serious,Other Outcomes;Hospitalized,Male,831 MTH,Not Specified


In [5]:
selected_columns = ['Case ID', 'Suspect Product Active Ingredients', 'Reactions',  'Serious', 'Sex', 'Patient Age', 'Patient Weight']

#df = data[selected_columns]
df = combined_data[selected_columns]

df.head()
print(df)

       Case ID                 Suspect Product Active Ingredients  \
0     24791405                         Semaglutide;Insulin Aspart   
1     24791400                                     Insulin Aspart   
2     24787265  Dextrose Monohydrate\Sodium Chloride;Fluticaso...   
3     24786424        Insulin Degludec\Liraglutide;Insulin Aspart   
4     24786413                       Insulin Human;Insulin Aspart   
...        ...                                                ...   
7964   4610400                                  Insulin Beef/Pork   
7965   4506904  Insulin Beef/Pork;Insulin Pork\Insulin Purifie...   
7966   4457831                                  Insulin Beef/Pork   
7967   4456701                                  Insulin Beef/Pork   
7968   4441093                                  Insulin Beef/Pork   

                                              Reactions  Serious     Sex  \
0     Constipation;Abdominal Pain Upper;Syncope;Asth...  Serious    Male   
1                  

In [6]:
df['Suspect Product Active Ingredients'] = df['Suspect Product Active Ingredients'].str.split(';')
df_split_temp = df.explode('Suspect Product Active Ingredients', ignore_index=True)

df_split_temp['Reactions'] = df_split_temp['Reactions'].str.split(';')
df_split = df_split_temp.explode('Reactions', ignore_index = True)

df_split.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Suspect Product Active Ingredients'] = df['Suspect Product Active Ingredients'].str.split(';')


Unnamed: 0,Case ID,Suspect Product Active Ingredients,Reactions,Serious,Sex,Patient Age,Patient Weight
0,24791405,Semaglutide,Constipation,Serious,Male,72 YR,Not Specified
1,24791405,Semaglutide,Abdominal Pain Upper,Serious,Male,72 YR,Not Specified
2,24791405,Semaglutide,Syncope,Serious,Male,72 YR,Not Specified
3,24791405,Semaglutide,Asthenia,Serious,Male,72 YR,Not Specified
4,24791405,Semaglutide,Visual Impairment,Serious,Male,72 YR,Not Specified


In [7]:
count = df_split['Serious'].value_counts()

print(count)

print(df_split.dtypes)

Serious
Serious        710007
Non-Serious      8744
Name: count, dtype: int64
Case ID                               object
Suspect Product Active Ingredients    object
Reactions                             object
Serious                               object
Sex                                   object
Patient Age                           object
Patient Weight                        object
dtype: object


In [8]:
df_multi = pd.get_dummies(df_split, columns = ['Suspect Product Active Ingredients', 'Reactions'], prefix=['Product', 'Reaction'], prefix_sep='_')
reactions = [col for col in df_multi.columns if col.startswith("Reaction_")]
df_reaction = df_multi[reactions]
df_multi = df_multi.groupby('Case ID').max().reset_index()

columns_to_exclude = ['Case ID', 'Suspect Product Active Ingredients', 'Reactions', 'Serious', 'Sex', 'Patient Age', 'Patient Weight']

columns_to_convert = [col for col in df_multi.columns if col.startswith("Product_") or col.startswith("Reaction_")] #not in columns_to_exclude]

df_multi[columns_to_convert] = df_multi[columns_to_convert].astype(int)

df_final = df_multi
print(df_multi.columns)
print(df_final)

df_final.head()

Index(['Case ID', 'Serious', 'Sex', 'Patient Age', 'Patient Weight',
       'Product_.Alpha.-Glucose-1-Phosphate Disodium Tetrahydrate',
       'Product_.Alpha.-Tocopherol', 'Product_.Alpha.-Tocopherol Acetate',
       'Product_.Alpha.-Tocopherol Acetate, D-',
       'Product_.Alpha.-Tocopherol Acetate, Dl-',
       ...
       'Reaction_Wound Infection', 'Reaction_Wrist Fracture',
       'Reaction_Wrong Device Used', 'Reaction_Wrong Dose',
       'Reaction_Wrong Patient Received Product',
       'Reaction_Wrong Product Administered', 'Reaction_Wrong Schedule',
       'Reaction_Wrong Technique In Device Usage Process',
       'Reaction_Wrong Technique In Product Usage Process',
       'Reaction_Yellow Skin'],
      dtype='object', length=3219)
       Case ID  Serious     Sex Patient Age Patient Weight  \
0      3026787  Serious    Male       77 YR        65.6 KG   
1      3027125  Serious  Female       78 YR  Not Specified   
2      3036071  Serious  Female       69 YR  Not Specified   

Unnamed: 0,Case ID,Serious,Sex,Patient Age,Patient Weight,Product_.Alpha.-Glucose-1-Phosphate Disodium Tetrahydrate,Product_.Alpha.-Tocopherol,Product_.Alpha.-Tocopherol Acetate,"Product_.Alpha.-Tocopherol Acetate, D-","Product_.Alpha.-Tocopherol Acetate, Dl-",...,Reaction_Wound Infection,Reaction_Wrist Fracture,Reaction_Wrong Device Used,Reaction_Wrong Dose,Reaction_Wrong Patient Received Product,Reaction_Wrong Product Administered,Reaction_Wrong Schedule,Reaction_Wrong Technique In Device Usage Process,Reaction_Wrong Technique In Product Usage Process,Reaction_Yellow Skin
0,3026787,Serious,Male,77 YR,65.6 KG,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3027125,Serious,Female,78 YR,Not Specified,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3036071,Serious,Female,69 YR,Not Specified,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3117138,Serious,Male,79 YR,Not Specified,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3126962,Serious,Male,76 YR,Not Specified,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print(df_reaction.columns)

Index(['Reaction_Abdominal Abscess', 'Reaction_Abdominal Discomfort',
       'Reaction_Abdominal Distension', 'Reaction_Abdominal Hernia',
       'Reaction_Abdominal Hernia Obstructive', 'Reaction_Abdominal Infection',
       'Reaction_Abdominal Injury', 'Reaction_Abdominal Mass',
       'Reaction_Abdominal Neoplasm', 'Reaction_Abdominal Operation',
       ...
       'Reaction_Wound Infection', 'Reaction_Wrist Fracture',
       'Reaction_Wrong Device Used', 'Reaction_Wrong Dose',
       'Reaction_Wrong Patient Received Product',
       'Reaction_Wrong Product Administered', 'Reaction_Wrong Schedule',
       'Reaction_Wrong Technique In Device Usage Process',
       'Reaction_Wrong Technique In Product Usage Process',
       'Reaction_Yellow Skin'],
      dtype='object', length=2166)


In [10]:
df_encoded = df_multi.copy()
print(df_encoded.columns)
#print(df_split['Patient Age'])
df_encoded['Patient Age'] = df_encoded['Patient Age'].astype(str)
df_encoded['Patient Age'] = df_encoded['Patient Age'].str.replace(r'\D+', '', regex=True)
df_encoded['Patient Age'] = pd.to_numeric(df_encoded['Patient Age'], errors='coerce')  # Converts to numeric, sets invalid values to NaN

df_encoded['Patient Weight'] = df_encoded['Patient Weight'].replace('Not Specified', "0 KG")
df_encoded['Patient Weight'] = df_encoded['Patient Weight'].astype(str)
df_encoded['Patient Weight'] = df_encoded['Patient Weight'].str.replace(r'[^\d.]', '', regex=True)
df_encoded['Patient Weight'] = pd.to_numeric(df_encoded['Patient Weight'], errors='coerce')

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

#df_encoded['Suspect Product Active Ingredients'] = label_encoder.fit_transform(df_encoded['Suspect Product Active Ingredients'])
df_encoded['Sex'] = label_encoder.fit_transform(df_encoded['Sex'])
df_encoded['Serious'] = label_encoder.fit_transform(df_encoded['Serious'])
print(df_encoded.isnull().sum())
df_encoded.dropna(inplace=True)
print(df_encoded.isnull().sum())
df_encoded.head()

Index(['Case ID', 'Serious', 'Sex', 'Patient Age', 'Patient Weight',
       'Product_.Alpha.-Glucose-1-Phosphate Disodium Tetrahydrate',
       'Product_.Alpha.-Tocopherol', 'Product_.Alpha.-Tocopherol Acetate',
       'Product_.Alpha.-Tocopherol Acetate, D-',
       'Product_.Alpha.-Tocopherol Acetate, Dl-',
       ...
       'Reaction_Wound Infection', 'Reaction_Wrist Fracture',
       'Reaction_Wrong Device Used', 'Reaction_Wrong Dose',
       'Reaction_Wrong Patient Received Product',
       'Reaction_Wrong Product Administered', 'Reaction_Wrong Schedule',
       'Reaction_Wrong Technique In Device Usage Process',
       'Reaction_Wrong Technique In Product Usage Process',
       'Reaction_Yellow Skin'],
      dtype='object', length=3219)
Case ID                                               0
Serious                                               0
Sex                                                   0
Patient Age                                          17
Patient Weight         

Unnamed: 0,Case ID,Serious,Sex,Patient Age,Patient Weight,Product_.Alpha.-Glucose-1-Phosphate Disodium Tetrahydrate,Product_.Alpha.-Tocopherol,Product_.Alpha.-Tocopherol Acetate,"Product_.Alpha.-Tocopherol Acetate, D-","Product_.Alpha.-Tocopherol Acetate, Dl-",...,Reaction_Wound Infection,Reaction_Wrist Fracture,Reaction_Wrong Device Used,Reaction_Wrong Dose,Reaction_Wrong Patient Received Product,Reaction_Wrong Product Administered,Reaction_Wrong Schedule,Reaction_Wrong Technique In Device Usage Process,Reaction_Wrong Technique In Product Usage Process,Reaction_Yellow Skin
0,3026787,1,1,77.0,65.6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3027125,1,0,78.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3036071,1,0,69.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3117138,1,1,79.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3126962,1,1,76.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
count = df_encoded['Serious'].value_counts()

print(count)

Serious
1    5987
0    1854
Name: count, dtype: int64


In [12]:
# Sum columns that start with "Reaction_"
reaction_columns = df_encoded.filter(like="Reaction_", axis=1)
column_sums = reaction_columns.sum(axis=0)

print(len(reaction_columns.columns))
# Set the threshold
threshold = 50 #300

# Filter column names with sums greater than the threshold
columns_above_threshold = column_sums[column_sums > threshold].index.tolist()

print(columns_above_threshold)
print(len(columns_above_threshold))

filtered_df = df_encoded[df_encoded[columns_above_threshold].sum(axis=1) > 0]

# Print the number of rows before and after filtering
print(f"Number of rows before filtering: {df_encoded.shape[0]}")
print(f"Number of rows after removing all-zero rows: {filtered_df.shape[0]}")

2166
['Reaction_Abdominal Distension', 'Reaction_Abdominal Pain', 'Reaction_Abdominal Pain Upper', 'Reaction_Acute Kidney Injury', 'Reaction_Anaemia', 'Reaction_Anxiety', 'Reaction_Appendicitis', 'Reaction_Appendicolith', 'Reaction_Arthralgia', 'Reaction_Ascites', 'Reaction_Asthenia', 'Reaction_Asthma', 'Reaction_Atrial Fibrillation', 'Reaction_Bacterial Infection', 'Reaction_Blindness', 'Reaction_Blood Cholesterol Increased', 'Reaction_Blood Glucose Abnormal', 'Reaction_Blood Glucose Decreased', 'Reaction_Blood Glucose Fluctuation', 'Reaction_Blood Glucose Increased', 'Reaction_Blood Phosphorus Increased', 'Reaction_Blood Pressure Increased', 'Reaction_Blood Uric Acid Increased', 'Reaction_Cardiac Failure', 'Reaction_Cardiac Failure Congestive', 'Reaction_Cardiogenic Shock', 'Reaction_Cataract', 'Reaction_Cerebrovascular Accident', 'Reaction_Chest Pain', 'Reaction_Chronic Obstructive Pulmonary Disease', 'Reaction_Condition Aggravated', 'Reaction_Confusional State', 'Reaction_Constipat

Assign Features/Predictions for Models

In [13]:
cols_to_exclude = ['Case ID', 'Suspect Product Active Ingredients', 'Serious' ,'Reactions']

feature_cols = [col for col in df_encoded.columns if col not in cols_to_exclude and not col.startswith("Reaction_")]

print(feature_cols)
print(len(feature_cols))

X_serious = df_encoded[feature_cols]
Y_serious = df_encoded.Serious

# Reaction Predictor

feat_cols = ["Serious"]
feat_cols.extend(feature_cols)

print(feat_cols)
print(len(feat_cols))

predict_cols = []#["Serious"]
reaction_cols = columns_above_threshold #[col for col in df_encoded.columns if col.startswith("Reaction_")]
predict_cols.extend(reaction_cols)

print(predict_cols)
print(len(predict_cols))

df_filtered = df_encoded[df_encoded[predict_cols].sum(axis=1) > 0]

X_reaction = df_filtered[feat_cols]
Y_reaction = df_filtered[predict_cols]

['Sex', 'Patient Age', 'Patient Weight', 'Product_.Alpha.-Glucose-1-Phosphate Disodium Tetrahydrate', 'Product_.Alpha.-Tocopherol', 'Product_.Alpha.-Tocopherol Acetate', 'Product_.Alpha.-Tocopherol Acetate, D-', 'Product_.Alpha.-Tocopherol Acetate, Dl-', 'Product_.Alpha.-Tocopherol Succinate, D-', 'Product_.Alpha.-Tocopherol Succinate, Dl-', 'Product_.Alpha.-Tocopherol, D-', 'Product_.Alpha.-Tocopherol, Dl-', 'Product_.Alpha.-Tocopherol, Dl-\\Fish Oil', 'Product_.Alpha.-Tocopherol\\Allantoin', 'Product_.Beta.-Carotene', 'Product_Abiraterone Acetate', 'Product_Acarbose', 'Product_Acebutolol', 'Product_Acebutolol Hydrochloride', 'Product_Acenocoumarol', 'Product_Acetaminophen', 'Product_Acetaminophen\\Belladonna Leaf\\Caffeine\\Opium', 'Product_Acetaminophen\\Chlorpheniramine\\Dextromethorphan\\Diphenhydramine\\Doxylamine\\Pseudoephedrine', 'Product_Acetaminophen\\Chlorpheniramine\\Dextromethorphan\\Pseudoephedrine Hydrochloride', 'Product_Acetaminophen\\Codeine Phosphate', 'Product_Acet

In [None]:
X_serious.head()


Unnamed: 0,Reaction_Abdominal Distension,Reaction_Abdominal Pain,Reaction_Abdominal Pain Upper,Reaction_Acute Kidney Injury,Reaction_Anaemia,Reaction_Anxiety,Reaction_Appendicitis,Reaction_Appendicolith,Reaction_Arthralgia,Reaction_Ascites,...,Reaction_Vasculitis,Reaction_Ventricular Fibrillation,Reaction_Vision Blurred,Reaction_Visual Impairment,Reaction_Vomiting,Reaction_Weight Decreased,Reaction_Weight Increased,Reaction_Wheezing,Reaction_Wrong Product Administered,Reaction_Wrong Technique In Product Usage Process
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
Y_serious.head()

0    1
1    1
2    1
3    1
4    1
Name: Serious, dtype: int64

In [16]:
X_reaction.head()

Unnamed: 0,Serious,Sex,Patient Age,Patient Weight,Product_.Alpha.-Glucose-1-Phosphate Disodium Tetrahydrate,Product_.Alpha.-Tocopherol,Product_.Alpha.-Tocopherol Acetate,"Product_.Alpha.-Tocopherol Acetate, D-","Product_.Alpha.-Tocopherol Acetate, Dl-","Product_.Alpha.-Tocopherol Succinate, D-",...,Product_Warfarin Potassium,Product_Warfarin Sodium,Product_Xantofyl Palmitate,Product_Xylometazoline,Product_Xylometazoline Hydrochloride,Product_Zeaxanthin,Product_Zinc,Product_Zoledronic Acid,Product_Zolpidem Tartrate,Product_Zopiclone
0,1,1,77.0,65.6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,78.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,69.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,79.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,76.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
Y_reaction.head()

Unnamed: 0,Reaction_Abdominal Distension,Reaction_Abdominal Pain,Reaction_Abdominal Pain Upper,Reaction_Acute Kidney Injury,Reaction_Anaemia,Reaction_Anxiety,Reaction_Appendicitis,Reaction_Appendicolith,Reaction_Arthralgia,Reaction_Ascites,...,Reaction_Vasculitis,Reaction_Ventricular Fibrillation,Reaction_Vision Blurred,Reaction_Visual Impairment,Reaction_Vomiting,Reaction_Weight Decreased,Reaction_Weight Increased,Reaction_Wheezing,Reaction_Wrong Product Administered,Reaction_Wrong Technique In Product Usage Process
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Split Serious Data

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Address class imbalance using SMOTE
smote = SMOTE()
X_resampled_serious, y_resampled_serious = smote.fit_resample(X_serious, Y_serious)

# Standardize the data
scaler = StandardScaler()
X_resampled_serious = scaler.fit_transform(X_resampled_serious)

from sklearn.model_selection import train_test_split

X_train_serious, X_test_serious, y_train_serious, y_test_serious = train_test_split(X_resampled_serious, y_resampled_serious, test_size=0.4, random_state=26)

X_train_serious, X_val_serious, y_train_serious, y_val_serious = train_test_split(X_train_serious, y_train_serious, test_size=0.2, random_state=26)


In [None]:
import warnings
warnings.filterwarnings("ignore")

!pip install torch -q

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

# Convert data to torch tensors
class Data(Dataset):
    def __init__(self, X, y):
        #self.X = torch.from_numpy(X.to_numpy().astype(np.float32))
        #self.y = torch.from_numpy(y.to_numpy().astype(np.float32))
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = torch.from_numpy(y.to_numpy().astype(np.float32))
        self.len = self.X.shape[0]
       
    def __getitem__(self, index):
        return self.X[index], self.y[index]
   
    def __len__(self):
        return self.len
   
batch_size = 32

# Instantiate training and test data
train_data = Data(X_train_serious, y_train_serious)
#train_data = Data(X_resampled, y_resampled)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

test_data = Data(X_test_serious, y_test_serious)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

valid_data = Data(X_val_serious, y_val_serious)
valid_dataloader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=False)

# Check it's working
for batch, (X, y) in enumerate(train_dataloader):
    print(f"Batch: {batch+1}")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    break

In [None]:
import torch
from torch import nn
from torch import optim

input_dim = len(feature_cols) #393
hidden_dim = 1024 #512 #400
output_dim = 1

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NeuralNetwork, self).__init__()
        
        # Define multiple layers
        self.layer_1 = nn.Linear(input_dim, hidden_dim // 16)
        nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="relu")
        
        self.layer_2 = nn.Linear(hidden_dim // 16, hidden_dim // 32)  # Hidden layer (reduce size)
        nn.init.kaiming_uniform_(self.layer_2.weight, nonlinearity="relu")
        
        self.layer_3 = nn.Linear(hidden_dim // 32, hidden_dim // 64)  # Another hidden layer
        nn.init.kaiming_uniform_(self.layer_3.weight, nonlinearity="relu")

        self.dropout = nn.Dropout(p=0.2)
        
        self.output_layer = nn.Linear(hidden_dim // 64, output_dim)  # Final output layer
        
    def forward(self, x):
        # Pass through layers with ReLU activations
        x = torch.nn.functional.relu(self.layer_1(x))
        x = self.dropout(x)

        x = torch.nn.functional.relu(self.layer_2(x))
        x = self.dropout(x)

        x = torch.nn.functional.relu(self.layer_3(x))
        x = self.dropout(x)

        x = torch.nn.functional.sigmoid(self.output_layer(x))  # Sigmoid for binary output
        return x
       
serious_model = NeuralNetwork(input_dim, hidden_dim, output_dim)
print(serious_model)

In [None]:
learning_rate = 0.01

loss_fn = nn.BCELoss()

optimizer = torch.optim.SGD(serious_model.parameters(), lr=learning_rate)

In [None]:
num_epochs = 15
train_loss_values = []
valid_loss_values = []

for epoch in range(num_epochs):
    # Training phase
    serious_model.train()
    train_loss = 0.0
    for X, y in train_dataloader:
        # zero the parameter gradients
        optimizer.zero_grad()
       
        # forward pass
        pred = serious_model(X)
        loss = loss_fn(pred, y.unsqueeze(-1))
        train_loss += loss.item()

        # backward pass and optimization
        loss.backward()
        optimizer.step()

    # average training loss for the epoch
    train_loss /= len(train_dataloader)
    train_loss_values.append(train_loss)

    # Validation phase
    serious_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_val, y_val in valid_dataloader:
            pred = serious_model(X_val)
            loss = loss_fn(pred, y_val.unsqueeze(-1))
            val_loss += loss.item()
    
    # Average validation loss for the epoch
    val_loss /= len(valid_dataloader)
    valid_loss_values.append(val_loss)

    # Print loss for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

Analyze Model

In [None]:
import matplotlib.pyplot as plt

plt.plot(range(1, num_epochs + 1), train_loss_values, label='Train Loss')
plt.plot(range(1, num_epochs + 1), valid_loss_values, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

In [None]:
import itertools  # Import this at the top of your script

# Initialize required variables
y_pred = []
y_test = []
correct = 0
total = 0

"""
We're not training so we don't need to calculate the gradients for our outputs
"""
with torch.no_grad():
    for X, y in test_dataloader:
        outputs = serious_model(X)  # Get model outputs
        predicted = np.where(outputs.numpy() < 0.45, 0, 1)  # Convert to NumPy and apply threshold
        predicted = list(itertools.chain(*predicted))  # Flatten predictions
        y_pred.append(predicted)  # Append predictions
        y_test.append(y.numpy())  # Append true labels as NumPy
        total += y.size(0)  # Increment total count
        correct += (predicted == y.numpy()).sum().item()  # Count correct predictions

print(f'Accuracy of the network on the test instances: {100 * correct // total}%')

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import seaborn as sns

y_pred = list(itertools.chain(*y_pred))
y_test = list(itertools.chain(*y_test))

print(classification_report(y_test, y_pred))

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred)
cf_matrix

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Calculate ROC curve metrics
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

# Calculate AUC (Area Under the Curve)
roc_auc = roc_auc_score(y_test, y_pred)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color="blue", label=f"ROC Curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")  # Diagonal line (random performance)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.grid()
plt.show()

Split Reaction Data

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def ml_smote(X, Y, k=5, target_samples=None):
    """
    MLSMOTE implementation for multi-label datasets.
    
    Parameters:
        X (array): Feature matrix (NumPy array or DataFrame).
        Y (array): Multi-label matrix (NumPy array or DataFrame).
        k (int): Number of nearest neighbors.
        n_samples (int): Number of synthetic samples to generate.
    
    Returns:
        X_augmented, Y_augmented: Augmented feature and label matrices.
    """
    # Calculate imbalance ratio per label (IRPL)
    label_frequencies = Y.sum(axis=0)
    irpl = max(label_frequencies) / label_frequencies
    
    # Calculate mean imbalance ratio (MIR)
    mir = np.mean(irpl)
    
    # Identify tail labels (labels with IRPL > MIR)
    tail_labels = np.where(irpl > mir)[0]
    print(tail_labels)
    
    # Identify minority samples
    #minority_indices = np.where(Y.sum(axis=1) < Y.shape[1] / 2)[0]  # Example threshold
    minority_indices = np.where((Y[:, tail_labels].sum(axis=1)) > 0)[0]
    X_minority = X[minority_indices]
    Y_minority = Y[minority_indices]
    
    # Fit nearest neighbors
    knn = NearestNeighbors(n_neighbors=k).fit(X_minority)
    synthetic_X, synthetic_Y = [], []

    # Calculate target samples for each tail label
    if target_samples is None:
        target_samples = (label_frequencies.max() - label_frequencies).astype(int)

    #for _ in range(n_samples):
    for _ in range(target_samples.sum()):
        # Randomly select a minority sample
        idx = np.random.choice(len(X_minority))
        x = X_minority[idx]
        labels = Y_minority[idx]

        # Find k-nearest neighbors
        neighbors = knn.kneighbors([x], return_distance=False)[0]
        #neighbor_idx = np.random.choice(neighbors[1:])  # Avoid selecting the sample itself
        selected_neighbors = neighbors[1:]  # Exclude itself

        # Interpolate between the sample and a neighbor
        #neighbor_x = X_minority[neighbor_idx]
        #synthetic_x = x + np.random.rand() * (neighbor_x - x)
        #synthetic_y = np.logical_or(labels, Y_minority[neighbor_idx]).astype(int)

        # Generate synthetic features by averaging neighbors
        weights = np.random.dirichlet(np.ones(len(selected_neighbors)))
        neighbor_x = np.dot(weights, X_minority[selected_neighbors])
        synthetic_x = x + np.random.rand() * (neighbor_x - x)

        # Generate synthetic labels probabilistically
        synthetic_y = (np.random.rand(len(labels)) < 0.5).astype(int) * labels
        synthetic_y = np.logical_or(synthetic_y, Y_minority[neighbors[1]]).astype(int)

        synthetic_X.append(synthetic_x)
        synthetic_Y.append(synthetic_y)
    
    # Combine original and synthetic data
    X_augmented = np.vstack([X, np.array(synthetic_X)])
    Y_augmented = np.vstack([Y, np.array(synthetic_Y)])
    return X_augmented, Y_augmented

# Example usage with your data
X_resampled_reaction, Y_resampled_reaction = ml_smote(X_reaction.values, Y_reaction.values, k=5) #, n_samples=1000)

print("Original data shape:", X_reaction.shape, Y_reaction.shape)
print("Resampled data shape:", X_resampled_reaction.shape, Y_resampled_reaction.shape)

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from collections import Counter
import numpy as np

# Address class imbalance using SMOTE
#smote = SMOTE()
#X_resampled, y_resampled = smote.fit_resample(X, Y)

#X_resampled = X.copy()
#y_resampled = Y.copy()

print(X_resampled_reaction.shape)
print(Y_resampled_reaction.shape)

# Standardize the data
scaler = StandardScaler()
X_resampled_reaction = scaler.fit_transform(X_resampled_reaction)

print(X_resampled_reaction.shape)
print(Y_resampled_reaction.shape)

from sklearn.model_selection import train_test_split

X_train_reaction, X_test_reaction, y_train_reaction, y_test_reaction = train_test_split(X_resampled_reaction, Y_resampled_reaction, test_size=0.4, random_state=26)

X_train_reaction, X_val_reaction, y_train_reaction, y_val_reaction = train_test_split(X_train_reaction, y_train_reaction, test_size=0.2, random_state=26)

In [None]:
import numpy as np

# Count occurrences of 1s for each label (column-wise sum)
label_counts = np.sum(Y, axis=0)
print("Label Counts for Each Label:", label_counts)

# Compute weights inversely proportional to label frequencies
weights = 1.0 / label_counts  # Higher weights for minority labels
print("Sampling Weights for Each Label:", weights)

In [None]:
column_sums = Y_resampled_reaction.sum(axis=0)
print("Sum of each column:", column_sums)

In [None]:
import matplotlib.pyplot as plt
plt.bar(range(len(column_sums)), column_sums)
plt.xlabel("Labels")
plt.ylabel("Occurrences")
plt.title("Label Distribution After Resampling")
plt.show()

In [None]:
import warnings
warnings.filterwarnings("ignore")

!pip install torch -q

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# Convert data to torch tensors
class Data(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = torch.from_numpy(y.astype(np.float32))  # to_numpy() #Ensure y is a torch tensor
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return self.len


# Compute sample weights for the training data
def compute_sample_weights(y, label_weights):
    # Convert label weights (Pandas Series) to NumPy array
    label_weights_array = label_weights.values
    # Calculate sample weights as the sum of label weights for each row
    sample_weights = np.dot(y, label_weights_array) # .to_numpy()
    return torch.tensor(sample_weights, dtype=torch.float)


# Set batch size
batch_size = 512 # 256 # 128 # 64

# Instantiate training, validation, and test datasets
train_data = Data(X_train_reaction, y_train_reaction)
valid_data = Data(X_val_reaction, y_val_reaction)
test_data = Data(X_test_reaction, y_test_reaction)

# Compute sample weights for the training data
sample_weights = compute_sample_weights(y_train_reaction, weights)

# Create a WeightedRandomSampler for the training data
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),  # Ensure all samples are considered
    replacement=True  # Allows oversampling of minority samples
)

# Create DataLoaders for training, validation, and test data
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=sampler)  # Use sampler for training
valid_dataloader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=False)   # No sampler for validation
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)     # No sampler for testing

# Check that it's working
for batch, (X, y) in enumerate(train_dataloader):
    print(f"Batch: {batch + 1}")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    break

total_label_sums = torch.zeros(y_train_reaction.shape[1])  # Adjust shape based on your dataset
for batch_X, batch_Y in train_dataloader:
    total_label_sums += batch_Y.sum(axis=0)
print("Total label distribution across batches:", total_label_sums)

In [None]:

import torch
from torch import nn
from torch import optim

input_dim = len(feature_cols) #393
hidden_dim = 4096 #400
output_dim = len(predict_cols)

class MultiOutputNeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MultiOutputNeuralNetwork, self).__init__()
        
        # Define multiple layers
        self.layer_1 = nn.Linear(input_dim, hidden_dim // 2 + output_dim)
        nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="relu")
        
        self.layer_2 = nn.Linear(hidden_dim // 2 + output_dim, hidden_dim // 4 + output_dim)  # Hidden layer (reduce size)
        nn.init.kaiming_uniform_(self.layer_2.weight, nonlinearity="relu")
        
        self.layer_3 = nn.Linear(hidden_dim // 4 + output_dim, hidden_dim // 8 + output_dim)  # Another hidden layer
        nn.init.kaiming_uniform_(self.layer_3.weight, nonlinearity="relu")

        self.layer_4 = nn.Linear(hidden_dim // 8 + output_dim, hidden_dim // 16 + output_dim)  # Fourth hidden layer
        nn.init.kaiming_uniform_(self.layer_4.weight, nonlinearity="relu")
        
        self.layer_5 = nn.Linear(hidden_dim // 16 + output_dim, hidden_dim // 32 + output_dim)  # Fifth hidden layer
        nn.init.kaiming_uniform_(self.layer_5.weight, nonlinearity="relu")

        self.layer_6 = nn.Linear(hidden_dim // 32 + output_dim, hidden_dim // 64 + output_dim)  # Fourth hidden layer
        nn.init.kaiming_uniform_(self.layer_4.weight, nonlinearity="relu")
        
        self.layer_7 = nn.Linear(hidden_dim // 64 + output_dim, hidden_dim // 128 + output_dim)  # Fifth hidden layer
        nn.init.kaiming_uniform_(self.layer_5.weight, nonlinearity="relu")
        
        self.output_layer = nn.Linear(hidden_dim // 128 + output_dim, output_dim)  # Final output layer

        self.dropout = nn.Dropout(p=0.3)  # Define dropout
        
    def forward(self, x):
        # Pass through layers with ReLU activations
        x = torch.nn.functional.relu(self.layer_1(x))
        x = self.dropout(x)
        x = torch.nn.functional.relu(self.layer_2(x))
        x = self.dropout(x)
        x = torch.nn.functional.relu(self.layer_3(x))
        x = self.dropout(x)
        x = torch.nn.functional.relu(self.layer_4(x))
        x = self.dropout(x)
        x = torch.nn.functional.relu(self.layer_5(x))
        x = self.dropout(x)
        x = torch.nn.functional.relu(self.layer_6(x))
        x = self.dropout(x)
        x = torch.nn.functional.relu(self.layer_7(x))
        x = self.dropout(x)

        #x = torch.nn.functional.sigmoid(self.output_layer(x))  # Sigmoid for binary output
        #x = torch.sigmoid(self.output_layer(x))
        x = self.output_layer(x)
        return x
       
reaction_model = MultiOutputNeuralNetwork(input_dim, hidden_dim, output_dim)
print(reaction_model)

In [None]:
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss


learning_rate = 0.001

label_counts = np.sum(y_train_reaction, axis=0)
total_samples = y_train_reaction.shape[0]

class_weights = total_samples / (len(label_counts) * label_counts)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

loss_fn = BCEWithLogitsLoss(pos_weight=class_weights)
#loss_fn = nn.BCELoss()

optimizer = torch.optim.Adam(reaction_model.parameters(), lr=learning_rate)

#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [None]:

num_epochs = 50
train_loss_values = []
valid_loss_values = []

train_accuracy_values = []
valid_accuracy_values = []

for epoch in range(num_epochs):
    # Training phase
    reaction_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for X, y in train_dataloader:
        # zero the parameter gradients
        optimizer.zero_grad()
        #print(X.shape)
       
        # forward pass
        pred = reaction_model(X)
        loss = loss_fn(pred, y) #.unsqueeze(-1))
        train_loss += loss.item()

        # Accuracy calculation
        train_correct += (torch.sigmoid(pred) >= 0.5).float().eq(y).sum().item()
        #train_correct += (pred >= 0.5).float().eq(y).sum().item()
        train_total += y.numel()

        # backward pass and optimization
        loss.backward()
        optimizer.step()

    # average training loss for the epoch
    train_loss /= len(train_dataloader)
    train_loss_values.append(train_loss)

    # Training accuracy
    train_accuracy = train_correct / train_total
    train_accuracy_values.append(train_accuracy)

    # Validation phase
    reaction_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for X_val, y_val in valid_dataloader:
            pred = reaction_model(X_val)
            loss = loss_fn(pred, y_val) #.unsqueeze(-1))
            val_loss += loss.item()

            # Accuracy calculation
            val_correct += (torch.sigmoid(pred) >= 0.5).float().eq(y_val).sum().item()
            #val_correct += (pred >= 0.5).float().eq(y_val).sum().item()
            val_total += y_val.numel()
    
    # Average validation loss for the epoch
    val_loss /= len(valid_dataloader)
    valid_loss_values.append(val_loss)

    # Validation accuracy
    val_accuracy = val_correct / val_total
    valid_accuracy_values.append(val_accuracy)

    # Print loss for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")

print("Training Complete")

Analyze Reaction Model

In [None]:
# Plotting Loss and Accuracy
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(12, 6))

# Loss plot
plt.subplot(1, 2, 1)
plt.plot(epochs, train_loss_values, label='Train Loss')
plt.plot(epochs, valid_loss_values, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

# Accuracy plot
plt.subplot(1, 2, 2)
plt.plot(epochs, train_accuracy_values, label='Train Accuracy')
plt.plot(epochs, valid_accuracy_values, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
label_counts = y_train_reaction.sum(axis=0)
print("Label counts in training data:", label_counts)

In [None]:
from sklearn.metrics import precision_recall_curve, classification_report, hamming_loss
from scipy.special import expit  # Sigmoid function
import numpy as np

# Step 1: Dynamically calculate thresholds for all labels
def calculate_thresholds(y_true, y_probs):
    """
    Calculate optimal thresholds for each label based on precision-recall curves.
    
    Parameters:
        y_true (array): True binary labels (shape: [n_samples, n_labels]).
        y_probs (array): Predicted probabilities (shape: [n_samples, n_labels]).
    
    Returns:
        thresholds_array (array): Optimal thresholds for each label.
    """
    thresholds = []
    for i in range(y_true.shape[1]):  # Iterate over all labels dynamically
        precision, recall, thresholds_curve = precision_recall_curve(y_true[:, i], y_probs[:, i])
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)  # Avoid division by zero
        optimal_threshold = thresholds_curve[f1_scores.argmax()]  # Find the best threshold
        thresholds.append(optimal_threshold)
    return np.array(thresholds)

# Step 2: Generate predictions using custom thresholds
def apply_thresholds(y_probs, thresholds_array):
    """
    Apply custom thresholds to convert probabilities into binary predictions.
    
    Parameters:
        y_probs (array): Predicted probabilities (shape: [n_samples, n_labels]).
        thresholds_array (array): Thresholds for each label.
    
    Returns:
        y_pred (array): Binary predictions (shape: [n_samples, n_labels]).
    """
    return (y_probs >= thresholds_array).astype(int)

# Use these functions dynamically
with torch.no_grad():
    y_probs = []  # Collect probabilities
    y_true = []  # Collect true labels

    for X, y in test_dataloader:
        outputs = reaction_model(X)  # Get raw logits
        #y_probs.append(outputs.numpy())  # Store probabilities
        y_probs.append(expit(outputs.numpy()))  # Convert logits to probabilities
        y_true.append(y.numpy())  # Store true labels

    # Combine batches into full arrays
    y_probs = np.concatenate(y_probs, axis=0)
    y_true = np.concatenate(y_true, axis=0)

    # Dynamically calculate thresholds
    thresholds_array = calculate_thresholds(y_true, y_probs)
    print(thresholds_array)

    # Apply thresholds to generate binary predictions
    y_pred = apply_thresholds(y_probs, thresholds_array)

    # Evaluate the model
    print(classification_report(y_true, y_pred, target_names=[f"Label {i}" for i in range(y_true.shape[1])]))
    print(f"Hamming Loss: {hamming_loss(y_true, y_pred):.4f}")

In [None]:
import itertools  # Import this at the top of your script

# Initialize required variables
y_pred = []
y_test = []
correct = 0
total = 0

"""
We're not training so we don't need to calculate the gradients for our outputs
"""
with torch.no_grad():
    for X, y in test_dataloader:
        outputs = reaction_model(X)  # Get model outputs
        predicted = np.where(outputs.numpy() < 0.5, 0, 1)  # Convert to NumPy and apply threshold
        #predicted = list(itertools.chain(*predicted))  # Flatten predictions
        y_pred.append(predicted)  # Append predictions
        y_test.append(y.numpy())  # Append true labels as NumPy
        total += y.size(0)  # Increment total count
        correct += (predicted == y.numpy()).sum().item()  # Count correct predictions

#print(f'Accuracy of the network on the test instances: {100 * correct // total}%')
y_pred = np.concatenate(y_pred, axis=0)  # Combine batches into a single array
y_test = np.concatenate(y_test, axis=0)  # Combine batches into a single array
total = y_test.size  # Total number of labels
correct = (y_pred == y_test).sum()  # Total number of correct predictions
accuracy = 100 * correct / total
print(f'Accuracy of the network on the test instances: {accuracy:.2f}%')

In [None]:

import numpy as np
import random
import torch

# Exclude specific columns
columns_to_exclude = ['Case ID', 'Suspect Product Active Ingredients', 'Sex', 'Patient Age', 'Patient Weight', 'Serious']

# Create drug-related columns
drug_cols = [col for col in df_encoded.columns if col not in columns_to_exclude]

# Generate random drug data
random_drugs = random.sample(drug_cols, 3)#["Insulin Pork\Insulin Purified Pork", "Insulin Beef"] #random.sample(drug_cols, 3)
print("Patient Drugs: ", random_drugs)
drug_array = np.zeros(len(drug_cols))
column_indices = [df_encoded.columns.get_loc(col) for col in random_drugs]
for idx in column_indices:
    drug_array[idx - 5] = 1

# Create a specific patient profile
specific_profile = np.array([[1, 83, 65]])  # Adjust profile values as needed
specific_profile = np.concatenate([specific_profile, drug_array.reshape(1, -1)], axis=1)

# Standardize the profile using the same scaler used during training
specific_profile_scaled = scaler.transform(specific_profile)  # 'scaler' is the StandardScaler from training

# Convert the profile to a PyTorch tensor
specific_profile_tensor = torch.tensor(specific_profile_scaled, dtype=torch.float32)

# Predict the outcome using your PyTorch model
serious_model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    prediction = serious_model(specific_profile_tensor)
    predicted_outcome = (prediction >= 0.5).float().item()  # Threshold of 0.5
    predicted_probability = prediction.item()

print(f"Serious Model Prediction: Outcome = {predicted_outcome}, Probability = {predicted_probability:.4f}")

specific_profile_reaction =  torch.cat([specific_profile_tensor, torch.tensor([[predicted_outcome]], dtype=torch.float32)], dim=1)

reaction_model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    serious_prediction = reaction_model(specific_profile_reaction)  # Raw logits
    #predicted_outcomes = (torch.sigmoid(prediction) >= 0.5).float()  # Apply sigmoid and threshold
    serious_predicted_probabilities = torch.sigmoid(serious_prediction).numpy()  # Convert logits to probabilities
    #predicted_probabilities = prediction.numpy()  # Convert logits to probabilities
    serious_predicted_outcomes = (serious_predicted_probabilities >= thresholds_array).astype(float)  # Apply sigmoid and threshold


# Initialize an empty list to store column names with "Yes" outcomes and their probabilities
yes_labels_with_probabilities = []

# Iterate over the predicted outcomes and corresponding probabilities
for outcome, probability, column_name in zip(
    serious_predicted_outcomes.flatten(), 
    #prediction.numpy().flatten(), 
    torch.sigmoid(prediction).numpy().flatten(),
    Y_reaction.columns  
):
    if outcome == 1:  # Check if the outcome is "Yes"
        # Append column name and probability to the list
        yes_labels_with_probabilities.append((column_name, probability))

# Print the list of column names with "Yes" outcomes and their probabilities
print("Columns with 'Yes' outcomes and their probabilities:")
for column_name, probability in yes_labels_with_probabilities:
    print(f"{column_name}: Probability = {probability:.4f}")