# Baseline model for batch

In [1]:
import os
import pickle

import pandas as pd
import requests
from joblib import load, dump
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Read the spine dataset from the local data directory and preview the first rows.
df = pd.read_csv('./data/Dataset_spine.csv')
df.head()

Unnamed: 0,pelvic incidence,pelvic tilt,lumbar lordosis angle,sacral slope,pelvic radius,grade of spondylolisthesis,Class_att
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544,Abnormal
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Abnormal
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Abnormal
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Abnormal
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Abnormal


In [3]:
# Shuffle every row (frac=1 samples the whole frame); the fixed seed keeps the
# resulting order reproducible across runs.
df_shuffled = df.sample(frac=1, random_state=42)
# Preview up to 20 rows; the original row labels are still attached at this point.
df_shuffled.head(20)

Unnamed: 0,pelvic incidence,pelvic tilt,lumbar lordosis angle,sacral slope,pelvic radius,grade of spondylolisthesis,Class_att
289,44.430701,14.174264,32.243495,30.256437,131.717613,-3.604255,Normal
9,36.686353,5.010884,41.948751,31.675469,84.241415,0.664437,Abnormal
57,46.855781,15.351514,38.0,31.504267,116.250917,1.662706,Abnormal
60,74.377678,32.053104,78.772013,42.324573,143.56069,56.125906,Abnormal
25,54.12492,26.650489,35.329747,27.474432,121.447011,1.571205,Abnormal
63,77.690577,21.380645,64.429442,56.309932,114.818751,26.931841,Abnormal
92,85.352315,15.84491,71.66866,69.507405,124.419787,76.020603,Abnormal
184,81.056611,20.801492,91.784495,60.255119,125.430176,38.181782,Abnormal
244,63.0263,27.33624,51.605017,35.69006,114.506608,7.43987,Normal
46,48.332638,22.227784,36.181993,26.104854,117.384625,6.481709,Abnormal


In [4]:
# Replace the shuffled row labels with a fresh 0..n-1 index; drop=True discards
# the old labels instead of keeping them as a column.
df_shuffled = df_shuffled.reset_index(drop=True)
df_shuffled.head(20)

Unnamed: 0,pelvic incidence,pelvic tilt,lumbar lordosis angle,sacral slope,pelvic radius,grade of spondylolisthesis,Class_att
0,44.430701,14.174264,32.243495,30.256437,131.717613,-3.604255,Normal
1,36.686353,5.010884,41.948751,31.675469,84.241415,0.664437,Abnormal
2,46.855781,15.351514,38.0,31.504267,116.250917,1.662706,Abnormal
3,74.377678,32.053104,78.772013,42.324573,143.56069,56.125906,Abnormal
4,54.12492,26.650489,35.329747,27.474432,121.447011,1.571205,Abnormal
5,77.690577,21.380645,64.429442,56.309932,114.818751,26.931841,Abnormal
6,85.352315,15.84491,71.66866,69.507405,124.419787,76.020603,Abnormal
7,81.056611,20.801492,91.784495,60.255119,125.430176,38.181782,Abnormal
8,63.0263,27.33624,51.605017,35.69006,114.506608,7.43987,Normal
9,48.332638,22.227784,36.181993,26.104854,117.384625,6.481709,Abnormal


In [5]:
# Normalise the column names: lower-case them and turn every space into an underscore.
# NOTE(review): the recorded feature-name output shows names such as '_pelvic_tilt',
# which suggests the raw headers carry a leading space. Stripping before replacing
# would give cleaner names, but the current names end up in reference.csv, so
# confirm with its consumers before changing them.
df_shuffled.columns = [
    column_name.lower().replace(' ', '_')
    for column_name in df_shuffled.columns
]

In [7]:
# Encode the string class labels as integers: Abnormal -> 0, Normal -> 1.
# Run this cell once per kernel session: mapping already-encoded values yields
# NaN, and the integer cast then raises.
label_codes = {'Abnormal': 0, 'Normal': 1}
encoded_labels = df_shuffled['class_att'].map(label_codes)
df_shuffled['class_att'] = encoded_labels.astype(int)

In [12]:
# Every column except the last is a feature; the last column is the encoded label.
X = df_shuffled.iloc[:, :-1].to_numpy()
y = df_shuffled.iloc[:, -1].to_numpy()

# Hold out 20% of the rows for testing, keeping the class ratio equal in both
# parts (stratify) and the split reproducible (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Persist each split array to its own pickle file under ./data.
for pickle_path, split_array in (
    ('./data/X_train.pkl', X_train),
    ('./data/X_test.pkl', X_test),
    ('./data/y_train.pkl', y_train),
    ('./data/y_test.pkl', y_test),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(split_array, f)

In [13]:
# Fit a logistic regression with default hyper-parameters on the training split
# (fit returns the estimator itself, so the call can be chained).
log_reg = LogisticRegression().fit(X_train, y_train)

# Hard class predictions plus the predicted probability of class 1, taken from
# the second column of predict_proba.
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

# Hold-out metrics: accuracy and F1 use the hard predictions, ROC AUC uses the
# class-1 probabilities.
metrics = {
    "test_accuracy": accuracy_score(y_test, y_pred),
    "test_auc": roc_auc_score(y_test, y_proba),
    "test_f1": f1_score(y_test, y_pred),
}

In [14]:
# Show the hold-out accuracy, ROC AUC and F1 collected in the metrics dict.
print(metrics)

{'test_accuracy': 0.8225806451612904, 'test_auc': 0.9142857142857144, 'test_f1': 0.717948717948718}


In [17]:
# Feature names are every column except the last one (the label), matching the
# `iloc[:, :-1]` slice used to build X, so no feature count is hard-coded here.
feature_names = df_shuffled.columns[:-1]
feature_names

Index(['pelvic_incidence', '_pelvic_tilt', '_lumbar_lordosis_angle',
       '_sacral_slope', '_pelvic_radius', '_grade_of_spondylolisthesis'],
      dtype='object')

In [18]:
# Build one table holding the test features, the true labels and the model's
# predictions side by side.

# 1. Ensure X_test is a DataFrame. X was built from `.values`, so
#    train_test_split hands back a NumPy array and the column names have to be
#    re-attached; a DataFrame input is passed through as a copy.
if not isinstance(X_test, pd.DataFrame):
    # Every column except the last (the label) is a feature, mirroring the
    # `iloc[:, :-1]` slice that produced X; this avoids a hard-coded count.
    feature_names = df_shuffled.columns[:-1]
    X_test_df = pd.DataFrame(X_test, columns=feature_names)
else:
    X_test_df = X_test.copy()  # Use a copy to avoid modifying the original X_test

# 2. Convert y_test and y_pred to Series that share X_test_df's index so the
#    rows line up when concatenated.
y_test_series = pd.Series(y_test, name='true_label', index=X_test_df.index)
y_pred_series = pd.Series(y_pred, name='predicted_label', index=X_test_df.index)

# 3. Concatenate column-wise (axis=1): features, then true label, then prediction.
combined_df = pd.concat([X_test_df, y_test_series, y_pred_series], axis=1)

print("Combined DataFrame with X_test, y_test, and y_pred:")
print(combined_df.head()) # Display the first few rows
print(f"\nShape of the combined DataFrame: {combined_df.shape}")

Combined DataFrame with X_test, y_test, and y_pred:
   pelvic_incidence  _pelvic_tilt  _lumbar_lordosis_angle  _sacral_slope  \
0         38.505273     16.964297               35.112814      21.540976   
1         38.663257     12.986441               40.000000      25.676816   
2         70.399308     13.469986               61.200000      56.929322   
3         42.021386     -6.554948               67.900000      48.576334   
4         65.536003     24.157487               45.775170      41.378515   

   _pelvic_radius  _grade_of_spondylolisthesis  true_label  predicted_label  
0      127.632875                     7.986683           1                0  
1      124.914118                     2.703008           0                0  
2      102.337524                    25.538429           0                0  
3      111.585782                    27.338671           0                0  
4      136.440302                    16.378086           0                0  

Shape of the combined 

In [19]:
# Rich preview of the first rows of the combined features / labels / predictions frame.
combined_df.head()

Unnamed: 0,pelvic_incidence,_pelvic_tilt,_lumbar_lordosis_angle,_sacral_slope,_pelvic_radius,_grade_of_spondylolisthesis,true_label,predicted_label
0,38.505273,16.964297,35.112814,21.540976,127.632875,7.986683,1,0
1,38.663257,12.986441,40.0,25.676816,124.914118,2.703008,0,0
2,70.399308,13.469986,61.2,56.929322,102.337524,25.538429,0,0
3,42.021386,-6.554948,67.9,48.576334,111.585782,27.338671,0,0
4,65.536003,24.157487,45.77517,41.378515,136.440302,16.378086,0,0


# Dump model and reference data

In [20]:
# Persist the fitted model with joblib. The target directory is created first so
# the cell also works on a fresh checkout where ./models does not exist yet
# (git does not track empty directories).
os.makedirs('./models', exist_ok=True)
with open('./models/log_reg.bin', 'wb') as f_out:
    dump(log_reg, f_out)

In [21]:
# Export the combined test-set frame as the reference data file; index=False
# keeps the row index out of the CSV.
combined_df.to_csv('./data/reference.csv', index=False)