In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
# Step 1: Load the pivoted dataset
# Reads "pivoted.csv" (a path relative to the current working directory) into `df`,
# the frame that the following cells filter, encode and split.
df = pd.read_csv("pivoted.csv")

In [3]:
# Preview the first 10 rows; as the cell's last expression the result is displayed.
df.head(10)

Unnamed: 0,StudyID,Year,Gender,Age,Conditions that may increase Risk: Exposed roots; Deep pits/fissures; fixed or removable appliances present; Defective restorations margins,Most severe radiographically evident lesions,Number of cavities/restorations/extractions due to caries in the last 3 years,Number of teeth with cavitated or non cavitated (incipient) active lesions,Patient's Caries Risk Status,"Presence of an exposed pulp, fistula or abscess",Unstimulated Saliva Flow,"Visible dental plaque ""evidence of sticky plaque stagnation in at risk areas"""
0,P1000283,2018,Female,49.0,1 or 2,Dentin,3 or more,1 or 2,High,Yes,,Yes
1,P1000283,2023,Female,54.0,3 or more,Dentin,3 or more,3 or more,High,Yes,More than 0.2 ml/min,Yes
2,P1001748,2018,Male,82.0,1 or 2,,1 or 2,1 or 2,High,No,,Yes
3,P1001748,2019,Male,83.0,1 or 2,Dentin,1 or 2,1 or 2,High,No,,Yes
4,P1001823,2018,Female,80.0,,,,1 or 2,Moderate,No,,Yes
5,P1002323,2018,Male,43.0,,,,,Low,No,,No
6,P1002798,2023,Male,68.0,1 or 2,Enamel only,3 or more,1 or 2,Moderate,No,,Yes
7,P1002973,2018,Male,67.0,3 or more,Dentin,1 or 2,3 or more,High,,,Yes
8,P1002973,2019,Male,67.0,,,,,,Yes,,
9,P1003018,2017,Male,82.0,,,,,Moderate,,,Yes


In [4]:
# Step 2: Keep only the rows that actually have a target label
# (a row without "Patient's Caries Risk Status" cannot be used for supervised training).
has_target = df["Patient's Caries Risk Status"].notna()
df = df[has_target]

In [5]:
# Step 3: Collapse every flavour of "no answer" into the single token 'missing'
# First real NaN values are filled, then cells that hold one of the listed placeholder
# strings are rewritten to the same token.
# NOTE(review): fillna is applied to the whole frame, so a NaN in a numeric column
# (e.g. Age) would also become the string 'missing' — confirm that is intended before
# the one-hot encoding step.
df = df.fillna("missing").replace(['none', 'NaN', 'nan', ''], 'missing')

In [6]:
# Step 4: Turn the textual risk label into integer class ids
# The fitted encoder is kept in `label_encoder` so that the class names can be
# recovered later (it is used for the report and stored with the model).
risk_labels = df["Patient's Caries Risk Status"]
label_encoder = LabelEncoder().fit(risk_labels)
df["EncodedRisk"] = label_encoder.transform(risk_labels)

In [7]:
# Step 5: Prepare features (X) and target (y)
# The patient identifier and both forms of the target (text and encoded) are removed
# so that they cannot be used as predictors.
X = df.drop(columns=["StudyID", "Patient's Caries Risk Status", "EncodedRisk"])

# Guard against a saved row-index column: when a CSV was written with its index,
# pandas reads it back as "Unnamed: 0" (the preview of `df` shows such a header).
# A row counter carries no clinical meaning and must not become a model feature.
# This is a no-op when no such column exists.
X = X.loc[:, ~X.columns.str.startswith("Unnamed")]

# Integer class ids produced by the label encoder in the previous step.
y = df["EncodedRisk"]

In [8]:
# Step 6: One-hot encode categorical features
# pd.get_dummies is called with its defaults, so which columns are expanded is
# decided by their dtype.
# NOTE(review): if the 'missing' fill in Step 3 turned a numeric column (e.g. Age)
# into an object column, it would be expanded into one indicator per distinct value
# here — check X.dtypes to confirm.
X_encoded = pd.get_dummies(X)

In [9]:
# Step 7: Hold out 20% of the rows for testing
# The split is stratified on y so both parts keep the class proportions, and seeded
# for reproducibility.
# NOTE(review): rows are split individually, while the preview shows the same StudyID
# appearing for several visits — the same patient may end up in both train and test.
# Consider a group-aware split if that matters for the evaluation.
holdout = train_test_split(X_encoded, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = holdout

In [10]:
# Step 8: Oversample the training split with SMOTE so the classes are balanced
# Only the training data is resampled; the test split is left untouched.
# NOTE(review): SMOTE builds synthetic rows from the feature values it is given; with
# one-hot encoded inputs that can produce non-binary indicator values — confirm this
# is acceptable or evaluate a categorical-aware variant.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [11]:
# Step 9: Train XGBoost classifier
# Gradient-boosted tree classifier fitted on the SMOTE-resampled training data.
# `use_label_encoder` is intentionally not passed: the installed XGBoost reports it
# as an unused parameter, and the target is already integer-encoded (Step 4).
model = XGBClassifier(
    max_depth=8,
    n_estimators=500,
    learning_rate=0.03,
    subsample=0.8,          # fraction of rows sampled for each tree
    colsample_bytree=0.8,   # fraction of columns sampled for each tree
    eval_metric='mlogloss',
    random_state=42
)
model.fit(X_train_resampled, y_train_resampled)

Parameters: { "use_label_encoder" } are not used.



In [12]:
# Step 10: Score the model on the untouched test split
# Predictions are compared with the true labels; the encoder's class names are used
# so the report shows the original risk labels instead of integer ids.
y_pred = model.predict(X_test)
print("\n📊 Classification Report:")
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(report)


📊 Classification Report:
              precision    recall  f1-score   support

        High       0.82      0.73      0.77      3821
         Low       0.58      0.65      0.61      1450
    Moderate       0.57      0.60      0.58      2906

    accuracy                           0.67      8177
   macro avg       0.65      0.66      0.66      8177
weighted avg       0.68      0.67      0.68      8177



In [13]:
# Step 12: Persist everything needed to score new data
# One pickle holds a 3-tuple: the fitted model, the one-hot column index used for
# training, and the label encoder that maps class ids back to risk labels.
artifacts = (model, X_encoded.columns, label_encoder)
joblib.dump(artifacts, "model_xgb_missing.pkl")
print("\n✅ Model saved as model_xgb_missing.pkl")


✅ Model saved as model_xgb_missing.pkl


In [None]:
# For every StudyID, count how many distinct Year and Age values it has
visit_stats = df.groupby("StudyID")[["Year", "Age"]].nunique()

# Keep the patients that have more than one distinct value in either column
multi_visits = visit_stats[visit_stats.gt(1).any(axis=1)]

# Print the first ten matching patient IDs as examples
print("✅ Patients with multiple visits (different Year or Age):")
print(multi_visits.index[:10].tolist())

In [None]:
# Display every row (with all columns) recorded for patient P1003018
df.loc[df["StudyID"].eq("P1003018")]


In [None]:
import pandas as pd

# Reload the CSV and keep only the rows that have a target label.
# Note: this rebinds `df` to the freshly loaded frame.
df = pd.read_csv("pivoted.csv").dropna(subset=["Patient's Caries Risk Status"])

# Number of rows per risk level
class_counts = df["Patient's Caries Risk Status"].value_counts()

# Show the distribution
print("📊 Caries Risk Level Counts:")
print(class_counts)