In [1]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
import joblib
import tkinter as tk
from tkinter import ttk

# ----------------------------------------------------------------------------
# 1. Define File Paths
# ----------------------------------------------------------------------------
json_file_paths = [
    r"C:\Users\colli\OneDrive\Desktop\Adverse effects 2\drug-event-0001-of-0005.json",
    r"C:\Users\colli\OneDrive\Desktop\Adverse effects 2\drug-event-0002-of-0005.json",
    r"C:\Users\colli\OneDrive\Desktop\Adverse effects 2\drug-event-0003-of-0005.json",
    r"C:\Users\colli\OneDrive\Desktop\Adverse effects 2\drug-event-0004-of-0005.json",
    r"C:\Users\colli\OneDrive\Desktop\Adverse effects 2\drug-event-0005-of-0005.json"
]

# ----------------------------------------------------------------------------
# 2. Helper function for dosage
# ----------------------------------------------------------------------------
def parse_numeric_dosage(dosage_str):
    """
    Attempt to parse a numeric dosage from the dosage string.
    e.g., "10 mg" -> 10.0, "5 mg 2X daily" -> 5.0, etc.
    """
    if not dosage_str:
        return None
    import re
    match = re.search(r"(\d+(\.\d+)?)", dosage_str)
    if match:
        try:
            return float(match.group(1))
        except ValueError:
            return None
    return None

# ----------------------------------------------------------------------------
# 3. Parse JSON + Build Rows (Memory-Efficient)
# ----------------------------------------------------------------------------
def process_json_file(path):
    """
    Process a single JSON file and yield rows one at a time.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
        results = data.get("results", [])
        
        for report in results:
            patient_info = report.get("patient", {})
            
            # Age
            age_str = patient_info.get("patientonsetage", None)
            if age_str:
                try:
                    age_val = float(age_str)
                except ValueError:
                    age_val = None
            else:
                age_val = None
            
            # Sex
            sex_str = patient_info.get("patientsex", None)
            if sex_str == "1":
                sex_val = "M"
            elif sex_str == "2":
                sex_val = "F"
            else:
                sex_val = "U"  # Unknown or missing
            
            if (age_val is None) or (sex_val == "U"):
                continue
            
            # Reactions
            reaction_list = []
            for rxn in patient_info.get("reaction", []):
                meddra_pt = rxn.get("reactionmeddrapt", "").upper().strip()
                if meddra_pt:
                    reaction_list.append(meddra_pt)
            if not reaction_list:
                continue
            
            # Drugs, dosage, route
            drug_list = []
            route_list = set()
            dosage_values = []
            for d in patient_info.get("drug", []):
                product_name = d.get("medicinalproduct", "").upper().strip()
                if product_name:
                    drug_list.append(product_name)
                
                route = d.get("drugadministrationroute", "").upper().strip()
                if route:
                    route_list.add(route)
                
                dosage_str = d.get("drugdosagetext", "")
                numeric_dose = parse_numeric_dosage(dosage_str)
                if numeric_dose is not None:
                    dosage_values.append(numeric_dose)
            
            if not drug_list:
                continue
            
            avg_dosage = np.mean(dosage_values) if dosage_values else 0.0
            
            row_data = {
                "age": age_val,
                "sex": sex_val,
                "drugs": list(set(drug_list)),
                "routes": list(route_list),
                "avg_dosage": avg_dosage,
                "reactions": list(set(reaction_list))
            }
            yield row_data

# Process all files and store rows in a list
all_rows = []
for path in json_file_paths:
    if not os.path.exists(path):
        print(f"Warning: file not found: {path}")
        continue
    for row in process_json_file(path):
        all_rows.append(row)

# Convert to DataFrame
df = pd.DataFrame(all_rows)
print(f"Total usable rows: {len(df)}")
df.dropna(subset=["age", "sex", "drugs", "reactions"], inplace=True)
df = df[(df["age"] > 0) & (df["age"] <= 120)]
print("DataFrame shape after cleaning:", df.shape)
print(df.info())
print(df.describe())
print(df.head())

Total usable rows: 35641
DataFrame shape after cleaning: (35433, 6)
<class 'pandas.core.frame.DataFrame'>
Index: 35433 entries, 0 to 35640
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         35433 non-null  float64
 1   sex         35433 non-null  object 
 2   drugs       35433 non-null  object 
 3   routes      35433 non-null  object 
 4   avg_dosage  35433 non-null  float64
 5   reactions   35433 non-null  object 
dtypes: float64(2), object(4)
memory usage: 1.9+ MB
None
                age    avg_dosage
count  35433.000000  3.543300e+04
mean      53.804645  5.627632e+02
std       19.698021  4.505441e+04
min        1.000000  0.000000e+00
25%       42.000000  0.000000e+00
50%       56.000000  5.000000e+00
75%       68.000000  4.000000e+01
max      109.000000  7.250000e+06
    age sex                                              drugs      routes  \
0  59.0   F  [GLUCOVANCE, NEURONTIN, CALCIUM CHLORIDE, COZA.

In [2]:
# Encode sex
sex_encoder = LabelEncoder()
df["sex_encoded"] = sex_encoder.fit_transform(df["sex"])

# Encode top 30 drugs
all_drugs = [drug for sublist in df["drugs"] for drug in sublist]
drug_counts = pd.Series(all_drugs).value_counts()
top_30_drugs = drug_counts.head(30).index.tolist()

def encode_drug_list(drug_list):
    return [1 if d in drug_list else 0 for d in top_30_drugs]

drug_feature_vectors = df["drugs"].apply(encode_drug_list)
X_drugs = np.vstack(drug_feature_vectors.values)

# Encode top 5 routes
all_routes = [route for sublist in df["routes"] for route in sublist]
route_counts = pd.Series(all_routes).value_counts()
top_5_routes = route_counts.head(5).index.tolist()

def encode_routes(route_list):
    return [1 if rt in route_list else 0 for rt in top_5_routes]

route_feature_vectors = df["routes"].apply(encode_routes)
X_routes = np.vstack(route_feature_vectors.values)

# Final feature matrix: [age, sex_encoded, avg_dosage] + drug presence + route presence
X_demographics = df[["age", "sex_encoded", "avg_dosage"]].values
X = np.hstack([X_demographics, X_drugs, X_routes])

# Encode multi-label reactions
mlb = MultiLabelBinarizer(sparse_output=True)  # Use sparse output to save memory
Y = mlb.fit_transform(df["reactions"])

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
import time

start_time = time.time()


# Define the models and their hyperparameters for tuning
models = {
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "estimator__max_depth": [20],  # Use estimator__ prefix
            "estimator__min_samples_split": [2]   # Use estimator__ prefix
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42, n_jobs=-1),
        "params": {
            "estimator__n_estimators": [100],   # Use estimator__ prefix
            "estimator__max_depth": [20],      # Use estimator__ prefix
            "estimator__min_samples_split": [2]   # Use estimator__ prefix
        }
    },
    "SVM": {
        "model": SVC(kernel='linear', probability=True, random_state=42),
        "params": {
            "estimator__C": [1]  # Use estimator__ prefix
        }
    }
}

# Convert sparse matrices to dense format
y_train_dense = y_train.toarray() if hasattr(y_train, 'toarray') else y_train
y_test_dense = y_test.toarray() if hasattr(y_test, 'toarray') else y_test

# Perform GridSearchCV for each model
best_models = {}
for model_name, model_info in models.items():
    print(f"Training and tuning {model_name}...")
    grid_search = GridSearchCV(
        estimator=MultiOutputClassifier(model_info["model"]),
        param_grid=model_info["params"],
        scoring='f1_micro',
        cv=3,
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train_dense)  # Use the dense format here
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best F1 score for {model_name}: {grid_search.best_score_}")
    print("-" * 50)

# Evaluate the best models on the test set
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test_dense, y_pred, average='micro')  # Use the dense format here
    accuracy = accuracy_score(y_test_dense, y_pred)
    print(f"{model_name} - Test F1 Score: {f1:.4f}, Test Accuracy: {accuracy:.4f}")

# Select the best model based on F1 score
best_model_name = max(best_models, key=lambda x: f1_score(y_test_dense, best_models[x].predict(X_test), average='micro'))
best_model = best_models[best_model_name]
print(f"The best model is: {best_model_name}")

# Save the best model
joblib.dump(best_model, "best_adr_model.pkl")
print("Best model saved as 'best_adr_model.pkl'")


end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Training and tuning Decision Tree...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters for Decision Tree: {'estimator__max_depth': 20, 'estimator__min_samples_split': 2}
Best F1 score for Decision Tree: 0.04613834045067453
--------------------------------------------------
Training and tuning Random Forest...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
