In [140]:
# Install the dependencies listed in requirements.txt into the kernel's environment.
# NOTE(review): the recorded output of this cell shows the build-dependency step for
# pandas==2.2.1 exiting with an error, so the install may not have completed —
# confirm the environment is usable before relying on the cells below.
%pip install -r requirements.txt

Collecting pandas==2.2.1 (from -r requirements.txt (line 1))
  Using cached pandas-2.2.1.tar.gz (4.4 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [54 lines of output]
      Collecting meson-python==0.13.1
        Using cached meson_python-0.13.1-py3-none-any.whl.metadata (4.1 kB)
      Collecting meson==1.2.1
        Using cached meson-1.2.1-py3-none-any.whl.metadata (1.7 kB)
      Collecting wheel
        Using cached wheel-0.46.3-py3-none-any.whl.metadata (2.4 kB)
      Collecting Cython==3.0.5
        Using cached Cython-3.0.5-py2.py3-none-any.whl.metadata (3.2 kB)
      Collecting numpy<=2.0.0.dev0,>1.22.4
        Using cached numpy-1.26.4.tar.gz (15.8 MB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting requirements to build wheel: finished with status 'done'
        Installing backend dependencies: started
        Installing backend dependencies: finished

In [141]:
# Importing Libraries
import pandas as pd                                                 # for Data Frames
from sklearn.model_selection import train_test_split                # for train-test split
from sklearn.preprocessing import StandardScaler, LabelEncoder      # for data preprocessing
from sklearn.metrics import (                                       # for evaluation metrics
    accuracy_score, roc_auc_score,
    precision_score, recall_score, f1_score,
    matthews_corrcoef
)

In [142]:
# Read the dataset from data.csv (relative to the working directory)
# and report how many rows and columns were loaded.
df = pd.read_csv("data.csv")
n_rows, n_cols = df.shape
print("df Shape:", (n_rows, n_cols))

df Shape: (569, 33)


In [143]:
# Dropping unwanted Columns.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns were already removed in place.
df.drop(columns=['id', 'Unnamed: 32'], inplace=True, errors='ignore')
print("df Shape after dropping columns:", df.shape)

# Encode target variable diagnosis.
# LabelEncoder assigns integer codes in sorted order of the labels it sees,
# so with labels 'B' and 'M' the result is B=0, M=1.
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

df Shape after dropping columns: (569, 31)


In [144]:
# Separate the target from the features, then hold out 20% of the rows for testing.
# random_state is fixed so the split is reproducible.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Capture the (scaled) test features together with their labels so they can be
# used later for metric calculations in the Streamlit app.
test_df = pd.DataFrame(X_test, columns=X.columns).assign(target=y_test.values)

# Write the test set to a CSV file in the project root (no index column).
test_df.to_csv('test_data.csv', index=False)
print("Test data saved with shape:", test_df.shape)

Test data saved with shape: (114, 31)


In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Initialize models.
# Keys are the model names; the training cell uses them both as report headings
# and (after sanitizing) as the file name of the saved model.
models = {
    "Logistic_Regression": LogisticRegression(),
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "K_Nearest_Neighbor": KNeighborsClassifier(n_neighbors=5),
    "Naive_Bayes_Gaussian": GaussianNB(),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` was dropped: the installed XGBoost reports it as
    # "not used" and emits a warning on every fit.
    "XGBoost": XGBClassifier(eval_metric='logloss')
}

# Container for per-model results (starts empty).
results = {}

In [146]:
# Save Models in Directory
import os, re
import joblib
from sklearn.metrics import auc  

# Make sure the "model" output directory is present; an existing directory is not an error.
os.makedirs(name="model", exist_ok=True)
def safe_filename(name: str) -> str:
    """Turn a model name into a lowercase string usable as a file name stem.

    Every maximal run of characters other than ASCII letters, digits,
    underscore or hyphen is collapsed into a single underscore, and the
    result is lowercased.
    """
    sanitized = re.sub(r"[^A-Za-z0-9_-]+", "_", name)
    return sanitized.lower()

# Train and Evaluate.
# For every registered model: fit on the training split, score on the test split,
# record the metrics in `results` (keyed by model name), print a report and
# persist the fitted model under model/<sanitized name>.pkl.
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # ROC AUC needs scores rather than hard labels: use the predicted
    # probability of class 1 (column index 1 of predict_proba).
    positive_proba = model.predict_proba(X_test)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_test, predictions),
        "auc": roc_auc_score(y_test, positive_proba),
        "precision": precision_score(y_test, predictions, pos_label=1),
        "recall": recall_score(y_test, predictions, pos_label=1),
        "f1": f1_score(y_test, predictions, pos_label=1),
        "mcc": matthews_corrcoef(y_test, predictions),
    }
    # Keep the numbers, not just the printout, so the models can be compared afterwards.
    results[name] = metrics

    print(f"\n\n--- {name} ---")
    # Labels are left-aligned in a 13-character column, values shown with 4 decimals.
    print("\n".join(f"{label:<13}: {value:.4f}" for label, value in metrics.items()))

    out_path = os.path.join("model", f"{safe_filename(name)}.pkl")
    joblib.dump(model, out_path)
    print(f"Saved model → {out_path}")


    



--- Logistic_Regression ---
accuracy     : 0.9737
auc          : 0.9974
precision    : 0.9762
recall       : 0.9535
f1           : 0.9647
mcc          : 0.9439
Saved model → model\logistic_regression.pkl


--- Decision_Tree ---
accuracy     : 0.9474
auc          : 0.9440
precision    : 0.9302
recall       : 0.9302
f1           : 0.9302
mcc          : 0.8880
Saved model → model\decision_tree.pkl


--- K_Nearest_Neighbor ---
accuracy     : 0.9474
auc          : 0.9820
precision    : 0.9302
recall       : 0.9302
f1           : 0.9302
mcc          : 0.8880
Saved model → model\k_nearest_neighbor.pkl


--- Naive_Bayes_Gaussian ---
accuracy     : 0.9649
auc          : 0.9974
precision    : 0.9756
recall       : 0.9302
f1           : 0.9524
mcc          : 0.9253
Saved model → model\naive_bayes_gaussian.pkl


--- Random_Forest ---
accuracy     : 0.9649
auc          : 0.9953
precision    : 0.9756
recall       : 0.9302
f1           : 0.9524
mcc          : 0.9253
Saved model → model\random_fores

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
