In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load dataset
df = pd.read_csv('Dataset 2 _ Early-stage diabetes risk prediction dataset (ESDRPD).csv')

# Ensure no missing target
df = df.dropna(subset=['class'])

# Clean categorical columns by replacing unexpected or missing with 'No'
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna('No')
        df[col] = df[col].str.strip().replace('', 'No')  # Handle blank entries

# Convert categorical to numeric
X = df.drop('class', axis=1)
X = X.apply(lambda col: col.map({'Yes': 1, 'No': 0}) if col.dtypes == 'object' else col)

# After conversion, fill any remaining NaN (due to unexpected values not captured above)
X = X.fillna(0)

# Confirm no NaN remains
print("Any NaN in X after processing?", X.isnull().sum().sum())

y = df['class']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predictions and Evaluation
y_pred = model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Any NaN in X after processing? 0
Accuracy: 0.9134615384615384
Confusion Matrix:
 [[38  2]
 [ 7 57]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.95      0.89        40
           1       0.97      0.89      0.93        64

    accuracy                           0.91       104
   macro avg       0.91      0.92      0.91       104
weighted avg       0.92      0.91      0.91       104



In [7]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Dataset 2 _ Early-stage diabetes risk prediction dataset (ESDRPD).csv')

# Check column names carefully
print(df.columns)

# Display first few records to confirm target column name
print(df.head())

# Check for nulls in all columns
print(df.isnull().sum())

# Check unique values in columns to find target column
for col in df.columns:
    print(f"Unique values in {col}: {df[col].unique()}")


Index(['age', 'gender', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity', 'class'],
      dtype='object')
   age gender polyuria polydipsia sudden weight loss weakness polyphagia  \
0   40   Male       No        Yes                 No      Yes         No   
1   58   Male       No         No                 No      Yes         No   
2   41   Male      Yes         No                 No      Yes        Yes   
3   45   Male       No         No                Yes      Yes        Yes   
4   60   Male      Yes        Yes                Yes      Yes        Yes   

  genital thrush visual blurring itching irritability delayed healing  \
0             No              No     Yes           No             Yes   
1             No             Yes      No           No              No   
2             No            

In [3]:
print(df['class'].isnull().sum())


520


In [11]:
import joblib

# Save the trained model
joblib.dump(model, 'diabetes_model.pkl')

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# If you used a selector (example: SelectKBest, RFE, etc.)
# joblib.dump(selector, 'selector.pkl')


['scaler.pkl']