In [64]:
import pandas as pd

In [65]:
# Specify the file name and optionally the sheet name or index
file_name = '/Users/shawnpana/Documents/GitHub/sushihacks2025/backend/41598_2021_97043_MOESM2_ESM.xlsx'
df = pd.read_excel(file_name) # Reads the sheet named 'Sheet1'
# Or to read the first sheet by index:
# df = pd.read_excel(file_name, sheet_name=0)

In [66]:
# print(df.columns)
# print the number of rows in this dataframe
# print(len(df))

# make the columns lowercase
df.columns = [col.lower() for col in df.columns]
print(df.columns)

Index(['severity', 'mild', 'mod', 'sev', 'age', 'bmi', 'csa', 'pb', 'duration',
       'nrs', 'sex', 'side', 'diabetes', 'np', 'weakness'],
      dtype='object')


In [67]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

Unnamed: 0,severity,mild,mod,sev,age,bmi,csa,pb,duration,nrs,sex,side,diabetes,np,weakness
0,mild,mild,others,others,65,21.093750,10.0,1.9,3,2,1,1,0,0,0
1,severe,others,others,severe,68,20.415225,13.0,1.8,12,6,0,0,0,0,0
2,mild,mild,others,others,68,20.415225,13.0,1.5,12,3,0,1,0,0,0
3,mild,mild,others,others,63,28.507522,13.0,2.9,1,4,1,1,0,0,0
4,mild,mild,others,others,87,22.939751,12.0,2.0,1,2,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,mild,mild,others,others,60,22.189349,11.0,1.8,2,3,1,1,0,0,0
1033,severe,others,others,severe,49,24.196494,16.0,3.1,8,5,1,0,0,1,1
1034,mild,mild,others,others,41,23.808690,12.0,1.7,4,4,0,0,0,0,0
1035,mild,mild,others,others,67,26.986001,11.0,3.9,8,3,1,1,0,0,0


In [69]:
def to_numeric_severity(severity):
    if severity == 'mild':
        return 0
    elif severity == 'moderate':
        return 1
    elif severity == 'severe':
        return 2
    else:
        return -1  # Unknown severity
    


In [70]:
df['severity_numeric'] = df['severity'].apply(to_numeric_severity)

In [71]:
df.head()
df.drop(['severity', 'mild', 'mod', 'sev'], axis=1, inplace=True)

In [72]:
df.head()

Unnamed: 0,age,bmi,csa,pb,duration,nrs,sex,side,diabetes,np,weakness,severity_numeric
0,65,21.09375,10.0,1.9,3,2,1,1,0,0,0,0
1,68,20.415225,13.0,1.8,12,6,0,0,0,0,0,2
2,68,20.415225,13.0,1.5,12,3,0,1,0,0,0,0
3,63,28.507522,13.0,2.9,1,4,1,1,0,0,0,0
4,87,22.939751,12.0,2.0,1,2,1,0,0,0,0,0


In [73]:
df.dtypes

age                   int64
bmi                 float64
csa                 float64
pb                  float64
duration              int64
nrs                   int64
sex                   int64
side                  int64
diabetes              int64
np                    int64
weakness              int64
severity_numeric      int64
dtype: object

In [75]:
from sklearn.metrics import classification_report, confusion_matrix

target_col = "severity_numeric"

X = df.drop(columns=[target_col])
y = df[target_col].astype(int)

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=2/3, stratify=y_temp, random_state=42
)

model = XGBClassifier(
    objective="multi:softprob",  # probabilities for each class
    num_class=3,
    eval_metric=["mlogloss", "merror"],
    n_estimators=3000,           # large cap + early stopping will pick the best
    learning_rate=0.05,          # smaller LR + early stopping is robust
    max_depth=6,                 # tuneable
    min_child_weight=2,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    tree_method="hist",          # fast
    random_state=42,
    n_jobs=-1,
)

model.fit(
    X_train, y_train,
    # sample_weight=sample_weight,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
    # early_stopping_rounds=100
)

y_pred = model.predict(X_test)
print("\nClassification report (test):")
print(classification_report(y_test, y_pred, digits=4))

cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)



Classification report (test):
              precision    recall  f1-score   support

           0     0.7913    0.8922    0.8387       102
           1     0.6744    0.5273    0.5918        55
           2     0.8400    0.8235    0.8317        51

    accuracy                         0.7788       208
   macro avg     0.7686    0.7477    0.7541       208
weighted avg     0.7723    0.7788    0.7717       208

Confusion matrix:
 [[91 10  1]
 [19 29  7]
 [ 5  4 42]]


In [None]:
new_data = pd.DataFrame([{
    "age": 65,
    "bmi": 23.530366,
    "csa": 15.2,
    "pb": 60,
    "duration": 0,
    "nrs": 7,
    "sex": 0,        # e.g. male=1, female=0 (depends how you encoded it)
    "side": 0,       # left/right encoding
    "diabetes": 1,
    "np": 1,
    "weakness": 0
}])

pred_class = model.predict(new_data)

pred_class


array([1])

In [87]:
df[df['severity_numeric'] == 2]

Unnamed: 0,age,bmi,csa,pb,duration,nrs,sex,side,diabetes,np,weakness,severity_numeric
1,68,20.415225,13.0,1.8,12,6,0,0,0,0,0,2
7,64,24.919900,12.0,4.5,3,4,1,0,0,0,0,2
8,64,24.919900,15.0,3.4,3,4,1,1,0,1,0,2
26,58,25.558846,18.0,2.8,5,7,1,1,0,1,1,2
38,76,26.395803,13.0,2.3,2,7,0,0,0,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1021,56,23.530366,27.0,3.9,6,7,1,0,1,1,0,2
1023,37,27.041644,23.0,2.4,12,9,0,0,0,1,1,2
1027,64,26.666667,19.0,2.4,12,5,1,0,0,1,1,2
1029,71,29.242109,21.0,4.8,24,6,0,0,1,1,1,2
