In [1]:
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Utility
import pickle
import warnings

In [2]:
# Fixed seed so the shuffled cross-validation split is reproducible across runs
random_seed = 42

# Read the diagnostics table from disk and show it as the cell's output.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which is the
# name pandas gives a header-less CSV column — presumably a saved row index;
# confirm before treating it as a feature.
df = pd.read_csv(filepath_or_buffer="diagnostics_data.csv")
df

Unnamed: 0,itching,skin rash,nodal skin eruptions,continuous sneezing,shivering,chills,joint pain,stomach pain,acidity,ulcers on tongue,...,blackheads,scurring,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
4916,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,Acne
4917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Urinary tract infection
4918,0,1,0,0,0,0,1,0,0,0,...,0,0,1,1,1,1,0,0,0,Psoriasis


In [3]:
# Prepare the features and target.
# "Unnamed: 0" is the placeholder name pandas assigns to a header-less CSV
# column — here apparently a saved row index. It carries no symptom
# information, so it is excluded from the features; errors="ignore" keeps the
# cell working if the column is absent.
X = df.drop(columns=["prognosis"]).drop(columns=["Unnamed: 0"], errors="ignore")
y = df['prognosis']

# Set up stratified cross-validation (each fold keeps the class proportions)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed)

# Initialize model
lr = LogisticRegression(max_iter=1000, class_weight='balanced')

# Lists to store the evaluation metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Cross-validation loop
for train_index, val_index in skf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # fit() re-estimates the model from scratch, so reusing `lr` across folds
    # does not leak information between them
    lr.fit(X_train, y_train)

    # Predictions
    y_pred = lr.predict(X_val)

    # Calculate metrics for this fold
    accuracy_scores.append(accuracy_score(y_val, y_pred))

    # The support-weighted multiclass averages are read from the report's
    # 'weighted avg' row; zero_division=0 scores classes with no predictions
    # as 0 instead of emitting a warning
    report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
    weighted_avg = report['weighted avg']
    precision_scores.append(weighted_avg['precision'])
    recall_scores.append(weighted_avg['recall'])
    f1_scores.append(weighted_avg['f1-score'])

# Print average metrics
# NOTE(review): if every metric comes out at exactly 1.0000, check for leakage
# (e.g. duplicated rows landing in both train and validation folds via
# df.duplicated()) before trusting these numbers.
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average Precision: {np.mean(precision_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average F1-Score: {np.mean(f1_scores):.4f}")

Average Accuracy: 1.0000
Average Precision: 1.0000
Average Recall: 1.0000
Average F1-Score: 1.0000


In [5]:
# Train the final model on the full dataset.
# Use the same hyperparameters as the cross-validated estimator: a default
# LogisticRegression() (max_iter=100, no class weighting) would be a different
# model from the one whose scores were reported.
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X, y)

# Save the trained model; the context manager closes the file handle so the
# pickle is fully flushed to disk even if dump() raises.
with open('disease_predictor.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)