<a href="https://colab.research.google.com/github/Adebesin-Aramide/Ulcer_Management_System/blob/main/Anomally_flares.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔹Imports

In [20]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# 🔹Load & Preprocess Data

In [21]:

# Load data: keep one untouched snapshot of the CSV (realistic_df) and do all
# further processing on an independent working copy (df).
realistic_df = pd.read_csv("/content/Realistic_Uncertain_Ulcer_Dataset.csv")
df = realistic_df.copy()

# 🔹Feature Engineering

In [22]:
#Clean multi-label fields
def safe_split(x):
    """Turn a ';'-separated field into a list of clean, lower-cased tokens.

    Parameters
    ----------
    x : str, list, tuple, None or NaN
        Raw cell value. Strings are split on ';'. A list/tuple is treated as
        already split and is only re-normalised, which makes the function
        idempotent (re-running the cell that applies it no longer fails).
        Missing values (None / NaN) yield an empty list.

    Returns
    -------
    list of str
        Tokens stripped of surrounding whitespace and lower-cased; empty
        tokens are dropped.
    """
    if isinstance(x, (list, tuple)):
        # Already split: pd.isna() on a sequence returns an array, whose truth
        # value is ambiguous, so this case must be handled before the NaN check.
        parts = x
    elif pd.isna(x):
        return []
    else:
        parts = str(x).split(';')

    cleaned = (str(part).strip().lower() for part in parts)
    return [token for token in cleaned if token]

# Convert the ';'-separated text columns into lists of normalised tokens.
df['Meals'] = df['Meals'].map(safe_split)
df['Symptoms'] = df['Symptoms'].map(safe_split)

In [23]:
#Define flare-up labeling logic
def define_flare(row, flip_prob=0.05, rng=None):
    """Label one log entry as a flare-up (1) or not (0), with optional label noise.

    An entry is a "core" flare when ``PainRating`` is at least 5 AND at least
    one aggravating factor is present: two or more key symptoms, a
    ``StressLevel`` of 7 or more, NSAID use, a skipped meal, or trigger foods.

    The rule-based label is then inverted with probability ``flip_prob``. The
    flip applies to every entry (not only borderline ones) and defaults to 5%.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``Symptoms`` (iterable of lower-cased symptom strings),
        ``PainRating``, ``StressLevel``, ``TookNSAID``, ``SkippedMeal`` and
        ``AteTriggers``.
    flip_prob : float, default 0.05
        Probability of inverting the rule-based label. Use 0 for a purely
        deterministic label.
    rng : object with a ``random()`` method, optional
        Source of uniform draws in [0, 1), e.g. ``numpy.random.default_rng(seed)``.
        When omitted, NumPy's global random state is used, as before.

    Returns
    -------
    int
        1 for a flare-up, 0 otherwise.
    """
    key_symptoms = ('nausea', 'heartburn', 'dark or tarry stools',
                    'loss of appetite', 'blood in vomit/stool')

    def _is_yes(value):
        # Answers are compared case-insensitively and ignoring outer whitespace.
        return str(value).strip().lower() == 'yes'

    symptom_count = sum(1 for s in row['Symptoms'] if s in key_symptoms)
    pain = row['PainRating']
    stress = row['StressLevel']

    # Core flare condition: significant pain plus at least one aggravating factor.
    flare_core = bool(
        pain >= 5
        and (
            symptom_count >= 2
            or stress >= 7
            or _is_yes(row['TookNSAID'])
            or _is_yes(row['SkippedMeal'])
            or _is_yes(row['AteTriggers'])
        )
    )

    # Label noise: one uniform draw per entry decides whether the label is inverted.
    draw = np.random.rand() if rng is None else rng.random()
    if draw < flip_prob:
        return int(not flare_core)
    return int(flare_core)

# Apply the flare-up labelling rule to every log entry.
# define_flare draws from NumPy's global random state, so seed it first;
# otherwise the injected label noise (and every metric computed downstream)
# changes on each run and the notebook is not reproducible.
np.random.seed(42)
df['IsFlare'] = df.apply(define_flare, axis=1)

# Show distribution of new labels
df['IsFlare'].value_counts()


Unnamed: 0_level_0,count
IsFlare,Unnamed: 1_level_1
1,80
0,70


In [24]:
df_model = df.copy()

# Encode the Yes/No questionnaire columns as 1/0.
# Answers are normalised with strip() + title() before the lookup, and anything
# that is not a key of binary_map becomes NaN. Note that title() turns
# "not sure" into "Not Sure", so the 'Not sure' key is never hit directly;
# the value still ends up as NaN through the default.
binary_map = {'Yes': 1, 'No': 0, 'Not sure': np.nan}
for col in (
    'TakeUlcerMed',
    'AteTriggers',
    'SkippedMeal',
    'AteLate',
    'TookNSAID',
    'CancerDiag',
    'FamilyHistory',
    'HpyloriUlcer',
):
    df_model[col] = (
        df_model[col]
        .map(str)
        .str.strip()
        .str.title()
        .map(lambda answer: binary_map.get(answer, np.nan))
    )

In [25]:
# Label encode Gender.
# The fitted encoder is kept in `gender_encoder` so the string -> integer
# mapping (gender_encoder.classes_) can be inspected and reused; previously it
# was created inline and discarded.
# The raw column is read from `df` so re-running this cell does not re-encode
# already-encoded integers. Missing values become the literal category 'nan'
# because of astype(str).
gender_encoder = LabelEncoder()
df_model['Gender'] = gender_encoder.fit_transform(
    df['Gender'].astype(str).str.strip().str.lower()
)

In [26]:

# Map Duration onto an ordinal scale (0 = shortest, 2 = longest).
# Labels must match the keys exactly (note the en dash in '30 mins–2 hrs');
# any other value maps to NaN.
duration_map = {'<30 mins': 0, '30 mins–2 hrs': 1, '>2 hrs': 2}
# Map from the raw column in `df`, not from df_model itself: mapping
# df_model['Duration'] in place meant a second run of this cell looked up the
# already-encoded integers and turned the whole column into NaN.
df_model['Duration'] = df['Duration'].map(duration_map)

In [27]:
# Multi-label binarization: expand the Meals and Symptoms token lists into one
# 0/1 indicator column per distinct token, prefixed with "Meal_" / "Symptom_".
# The two fitted binarizers are kept because they are saved later in the notebook.
mlb_meals, mlb_symptoms = MultiLabelBinarizer(), MultiLabelBinarizer()

meals_encoded = pd.DataFrame(
    data=mlb_meals.fit_transform(df_model['Meals']),
    index=df_model.index,
    columns=[f"Meal_{m}" for m in mlb_meals.classes_],
)

symptoms_encoded = pd.DataFrame(
    data=mlb_symptoms.fit_transform(df_model['Symptoms']),
    index=df_model.index,
    columns=[f"Symptom_{s}" for s in mlb_symptoms.classes_],
)

In [28]:
# Modeling data: drop the timestamp/free-text columns and the raw list columns
# (replaced by their indicator encodings), then append the Meal_* and
# Symptom_* indicator columns side by side.
df_model_final = pd.concat(
    [
        df_model.drop(
            ['Date', 'LogTimestamp', 'MedTime', 'TriggerCauses', 'Symptoms', 'Meals'],
            axis="columns",
        ),
        meals_encoded,
        symptoms_encoded,
    ],
    axis="columns",
)

In [29]:
# Fill missing values: median for numeric columns, most frequent value otherwise.
# NOTE(review): the fill values are computed on the full dataset, before the
# train/validation/test split, so held-out rows influence the imputation.
for col in df_model_final.columns:
    column = df_model_final[col]
    if not column.isna().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(column):
        # Covers every numeric dtype, not only float64/int64.
        fill_value = column.median()
    else:
        modes = column.mode()
        # mode() is empty for an all-NaN column; indexing it used to raise.
        fill_value = modes.iloc[0] if not modes.empty else np.nan

    if pd.isna(fill_value):
        continue  # column is entirely missing; there is no value to fill with

    df_model_final[col] = column.fillna(fill_value)


# 🔹Train and Evaluate Model

In [30]:
# Target and predictors. 'CancerDiag' is excluded from the features.
# NOTE(review): IsFlare is produced by define_flare from PainRating,
# StressLevel, the symptom list, TookNSAID, SkippedMeal and AteTriggers, and
# those columns (or their Symptom_* encodings) remain in X. The scores below
# therefore mostly measure how well the model recovers that labelling rule.
y = df_model_final['IsFlare']
X = df_model_final.drop(['IsFlare', 'CancerDiag'], axis=1)

# Stratified 60/20/20 split: hold out 20% for test, then take 25% of the
# remaining 80% (= 20% of the total) as the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.25, random_state=42
)

In [31]:
# Train the Random Forest: 100 trees, depth capped at 7 as regularization,
# class weights balanced, fixed seed for repeatable fits.
model_rf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    max_depth=7,
    n_estimators=100,
)
model_rf.fit(X_train, y_train)

In [32]:
# Score the validation and test sets: hard class predictions plus the
# predicted probability of the second class (column 1 of predict_proba).
val_pred, test_pred = (model_rf.predict(features) for features in (X_val, X_test))
val_proba, test_proba = (
    model_rf.predict_proba(features)[:, 1] for features in (X_val, X_test)
)


In [33]:
# Collect evaluation metrics: the same four metrics for each held-out split,
# keyed by split name ("Validation" first, then "Test").
evaluation_final = {
    split_name: {
        "F1 Score": f1_score(y_true, y_hat),
        "ROC AUC Score": roc_auc_score(y_true, y_score),
        "Classification Report": classification_report(y_true, y_hat),
        "Confusion Matrix": confusion_matrix(y_true, y_hat).tolist(),
    }
    for split_name, y_true, y_hat, y_score in (
        ("Validation", y_val, val_pred, val_proba),
        ("Test", y_test, test_pred, test_proba),
    )
}

evaluation_final


{'Validation': {'F1 Score': 0.9696969696969697,
  'ROC AUC Score': np.float64(0.9776785714285715),
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       1.00      0.93      0.96        14\n           1       0.94      1.00      0.97        16\n\n    accuracy                           0.97        30\n   macro avg       0.97      0.96      0.97        30\nweighted avg       0.97      0.97      0.97        30\n',
  'Confusion Matrix': [[13, 1], [0, 16]]},
 'Test': {'F1 Score': 0.967741935483871,
  'ROC AUC Score': np.float64(0.96875),
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.93      1.00      0.97        14\n           1       1.00      0.94      0.97        16\n\n    accuracy                           0.97        30\n   macro avg       0.97      0.97      0.97        30\nweighted avg       0.97      0.97      0.97        30\n',
  'Confusion Matrix': [[14, 0], [1, 15]]}}

In [34]:
import joblib

# Persist the fitted Random Forest to Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this notebook.
joblib.dump(value=model_rf, filename="/content/drive/MyDrive/flareup_predictor_model.pkl")


['/content/drive/MyDrive/flareup_predictor_model.pkl']

In [35]:
# Persist the fitted multi-label binarizers so the same Meal_* / Symptom_*
# columns can be rebuilt when the saved model is used.
# NOTE(review): the Gender label encoding and the feature column order are not
# saved anywhere in the visible notebook; confirm how inference reproduces them.
joblib.dump(value=mlb_meals, filename="/content/drive/MyDrive/mlb_meals.pkl")
joblib.dump(value=mlb_symptoms, filename="/content/drive/MyDrive/mlb_symptoms.pkl")


['/content/drive/MyDrive/mlb_symptoms.pkl']