# OsteoID.ai — Clavicle Classifier (Final)
**Kevin P. Klier | University at Buffalo BHEML**

- 185 clavicles
- 7 landmarks
- Full-power 1000-tree species model (the sex and side models use scikit-learn's default number of trees)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from scipy.spatial import procrustes
from imblearn.over_sampling import SMOTE
import pickle
import os

# BULLETPROOF PARSER
def load_morphofile(filepath):
    """Parse a landmark text file into specimen names and a coordinate array.

    A line starting with ``#`` (or ``'#``) opens a new specimen; its name is
    that line with leading/trailing ``#``, ``'`` and space characters removed.
    Each following line that splits into exactly three floats is stored as
    one landmark of the current specimen. Blank lines, non-numeric lines and
    numeric lines that do not hold exactly three values are skipped.
    Specimens with an empty name or with no landmarks are dropped, as are
    coordinate lines that appear before the first header.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A ``(names, landmarks)`` tuple: ``names`` is a list of specimen
        names and ``landmarks`` is an array of shape
        ``(n_specimens, n_landmarks, 3)`` in the same order.

    Raises:
        ValueError: If the file yields no specimen with landmarks, or if
            specimens do not all have the same number of landmarks.
        OSError: If the file cannot be opened.
    """
    names = []
    landmarks = []
    current_name = None
    current_lms = []

    with open(filepath, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue

            if stripped.startswith(('#', "'#")):
                # A header closes the previous specimen before opening a new one.
                if current_name and current_lms:
                    landmarks.append(np.array(current_lms))
                    names.append(current_name)
                current_name = stripped.strip("#' ")
                current_lms = []
                continue

            try:
                coords = [float(x) for x in stripped.split()]
            except ValueError:
                # Non-numeric line (metadata, labels, ...): not a landmark.
                continue
            if len(coords) == 3:
                current_lms.append(coords)

    # The last specimen has no trailing header to close it.
    if current_name and current_lms:
        landmarks.append(np.array(current_lms))
        names.append(current_name)

    if not landmarks:
        raise ValueError(f"No landmark data found in {filepath!r}")

    # np.stack needs identical shapes; report which specimen breaks that
    # instead of letting a generic shape error surface.
    expected = landmarks[0].shape[0]
    for specimen_name, lms in zip(names, landmarks):
        if lms.shape[0] != expected:
            raise ValueError(
                f"Specimen {specimen_name!r} has {lms.shape[0]} landmarks, "
                f"expected {expected} (as in {names[0]!r})"
            )

    return names, np.stack(landmarks)

# ←←← CHANGE THIS TO YOUR ACTUAL FILENAME AFTER UPLOADING
morphofile_path = 'MorphoFileClavicle_ANHP.txt'

# Read every specimen name and its landmark array from the uploaded file.
names, landmarks = load_morphofile(morphofile_path)
print(f"Loaded {len(names)} clavicles")

In [None]:
def parse_name(name, bone='clavicle'):
    """Split an underscore-delimited specimen name into species, sex and side.

    The name is split on ``_`` and located relative to the ``bone`` token:
    the token directly before it is the sex, the tokens between the first
    token and the sex token (joined with ``_``) are the genus/species, and
    the side is the last character of the final token.

    Args:
        name: Specimen name, e.g. ``"123_Homo_sapiens_M_clavicle_L"``.
        bone: Token that marks the bone in the name. Defaults to
            ``'clavicle'``.

    Returns:
        A ``(genus_species, sex, side)`` tuple of strings.

    Raises:
        ValueError: If ``bone`` is not one of the tokens, or if it is the
            first token so that no sex token can precede it.
    """
    parts = name.split('_')
    if bone not in parts:
        raise ValueError(f"Specimen name {name!r} has no {bone!r} token")

    bone_idx = parts.index(bone)
    if bone_idx == 0:
        # parts[bone_idx - 1] would wrap around to the last token and
        # silently return the wrong sex and species.
        raise ValueError(
            f"Specimen name {name!r} has no sex token before {bone!r}"
        )

    sex = parts[bone_idx - 1]
    side = parts[-1][-1]
    genus_species = '_'.join(parts[1:bone_idx - 1])
    return genus_species, sex, side

# Decode each specimen name once, then fan the triples out into the three
# parallel label lists used for training.
parsed_names = [parse_name(specimen) for specimen in names]
species_list = [species for species, _, _ in parsed_names]
sex_list = [sex_code for _, sex_code, _ in parsed_names]
side_list = [side_code for _, _, side_code in parsed_names]

# Quick sanity check of the label balance.
print(f"Sex: {sex_list.count('M')} M, {sex_list.count('F')} F")
print(f"Side: {side_list.count('L')} L, {side_list.count('R')} R")

In [None]:
# Procrustes alignment + PCA
# Reference shape: the landmark-wise mean of the raw configurations.
mean_shape = landmarks.mean(axis=0)

# Each specimen is aligned to the reference in a single pass; procrustes
# returns (standardised reference, fitted specimen, disparity) and only the
# fitted specimen is kept.
aligned = np.empty_like(landmarks)
for idx, specimen in enumerate(landmarks):
    aligned[idx] = procrustes(mean_shape, specimen)[1]

# One row per specimen with all landmark coordinates flattened, reduced to
# the first 10 principal components.
flat = aligned.reshape(aligned.shape[0], -1)
pca = PCA(n_components=10)
features = pca.fit_transform(flat)

In [None]:
# FULL-POWER MODELS
# Settings shared by all three forests.
rf_common = dict(class_weight='balanced', random_state=42)

# Species: encode the labels as integers, oversample with SMOTE, then fit
# the 1000-tree forest on the resampled data.
le_species = LabelEncoder()
y_species = le_species.fit_transform(species_list)

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(features, y_species)

model_species = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    n_jobs=-1,
    **rf_common,
).fit(X_res, y_res)

# Sex and side: fitted on the original (non-resampled) features with the raw
# string labels; no tree count is passed, so the library default applies.
model_sex = RandomForestClassifier(**rf_common).fit(features, sex_list)
model_side = RandomForestClassifier(**rf_common).fit(features, side_list)

print("All clavicle models trained!")

In [None]:
# FINAL SAVE — automatically creates correct folder structure
bone = "clavicle"
out_dir = f"models/{bone}"
os.makedirs(out_dir, exist_ok=True)

# File stem -> object to pickle; each is written to
# models/<bone>/<stem>_<bone>.pkl in this order.
artifacts = {
    "model_sex": model_sex,
    "model_side": model_side,
    "model_species": model_species,
    "le_species": le_species,
    "pca": pca,
    "mean_shape": mean_shape,
}
for stem, obj in artifacts.items():
    with open(f"{out_dir}/{stem}_{bone}.pkl", 'wb') as f:
        pickle.dump(obj, f)

print(f"All {bone} models saved correctly to models/{bone}/ — ready for GitHub + Streamlit!")