In [None]:
#importing libraries

import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
#scikit library
from sklearn.ensemble import RandomForestClassifier #first letter should be capitalised
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Printed only after every import above has succeeded, so seeing this message
# confirms the environment has all required libraries.
print("Libraries are Imported !")

In [None]:
# Load the dataset and take a first look at its structure and data quality.
df = pd.read_csv("diabetes.csv")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would add a stray "None" line.
df.info()

# Explicit missing values per column.
print(df.isnull().sum())

# Zeros per column. sum must be *called*: without the parentheses this printed
# the bound-method object instead of the counts.
print((df == 0).sum())

In [None]:
# Reload the raw data so this cell is idempotent: re-running it always starts
# from the file on disk, never from an already-imputed frame.
df = pd.read_csv('diabetes.csv')

# Columns where a value of 0 is treated as a missing measurement.
# 'Pregnancies' is deliberately NOT listed: zero pregnancies is a legitimate
# value, and imputing it with the median would corrupt real data.
# NOTE(review): 'Age' and 'DiabetesPedigreeFunction' were also dropped from
# this list on the assumption that they contain no zeros - confirm on the data.
cols_with_zero_invalid = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# mask() swaps the zeros for NaN while keeping the columns numeric (float).
# Replacing with pd.NA instead turns the columns into object dtype, on which
# median() is unreliable.
measurements = df[cols_with_zero_invalid].mask(df[cols_with_zero_invalid] == 0)

# Impute each column with its own median (NaNs are skipped when computing it).
# NOTE(review): the medians come from the full dataset, i.e. before the
# train/test split - consider fitting the imputation on the training rows only.
df[cols_with_zero_invalid] = measurements.fillna(measurements.median())

# Optional feature reduction, currently disabled:
#columns_to_drop = ['DiabetesPedigreeFunction', 'Pregnancies', 'SkinThickness', 'Insulin']
#df.drop(columns=columns_to_drop, axis=1, inplace=True, errors='ignore')

# Final column status
print("Remaining columns:", df.columns.tolist())
print("Missing values:\n", df.isna().sum())

# Split into features (X) and target (y)
X = df.drop(columns='Outcome')
y = df['Outcome']
print("Data prepared successfully.")
print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:

# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state so it is repeatable across runs.
split_parts = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts
print("Data Are Splitted ")


In [None]:
# Standardise the features for the SVM. The scaler is fitted on the training
# partition only; the same fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Random Forest: trained and evaluated on the unscaled features.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

In [None]:
# SVM with an RBF kernel: trained and evaluated on the standardised features.
svm_model = SVC(random_state=42, kernel='rbf').fit(X_train_scaled, y_train)
svm_preds = svm_model.predict(X_test_scaled)

In [None]:
# Accuracy of each model on the held-out test set.
rf_accuracy = accuracy_score(y_test, rf_preds)
svm_accuracy = accuracy_score(y_test, svm_preds)
print("Random Forest Accuracy:", rf_accuracy)
print("SVM Accuracy:", svm_accuracy)

In [None]:
# Per-class classification report for each model.
rf_report = classification_report(y_test, rf_preds)
svm_report = classification_report(y_test, svm_preds)
print("\nRandom Forest Classification Report:\n", rf_report)
print("\nSVM Classification Report:\n", svm_report)
# How to read the report columns:
#   precision - of the samples predicted as a class, the share that truly belong to it
#   recall    - of the samples that truly belong to a class, the share that was found
#   f1-score  - harmonic mean of precision and recall
#   support   - number of true instances of the class in y_test

In [None]:
# R2 / MAE / MSE computed on the predicted class labels of each model.
# NOTE(review): these are regression metrics applied to a classifier's output.
# Assuming Outcome is coded 0/1, MAE and MSE are both just the share of wrong
# predictions (1 - accuracy) - confirm whether this section is still wanted.
for heading, preds in (("\nRandom Forest Regression Metrics:", rf_preds),
                       ("\nSVM Regression Metrics:", svm_preds)):
    print(heading)
    print("R2 Score:", r2_score(y_test, preds))
    print("MAE:", mean_absolute_error(y_test, preds))
    print("MSE:", mean_squared_error(y_test, preds))

In [None]:
#-----------------------------------#Visualizations#-----------------------------------------
# Confusion Matrix Plot Function
def plot_conf_matrix(y_true, y_pred, title):
    """Show a confusion-matrix heatmap for one model.

    Parameters:
        y_true: true labels.
        y_pred: labels predicted by the model.
        title: text placed above the figure.

    Both axes are ticked 'No Diabetes' / 'Diabetes'; the x-axis is labelled
    'Predicted' and the y-axis 'Actual'. The figure is displayed immediately
    and nothing is returned.
    """
    class_names = ['No Diabetes', 'Diabetes']
    counts = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(5, 4))
    # fmt='d' renders the cell annotations as integers.
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)
    fig.tight_layout()
    plt.show()



# Confusion matrices: one figure per model, Random Forest first.
for chart_title, model_preds in (("Random Forest Confusion Matrix", rf_preds),
                                 ("SVM Confusion Matrix", svm_preds)):
    plot_conf_matrix(y_test, model_preds, chart_title)

In [None]:
# Bar chart comparing the test-set accuracy of the two models.
model_names = ['Random Forest', 'SVM']
accuracies = [accuracy_score(y_test, preds) for preds in (rf_preds, svm_preds)]

fig, ax = plt.subplots()
ax.bar(model_names, accuracies, color=['skyblue', 'lightgreen'])
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
plt.show()

In [None]:
#----------------------------------------Manual Testing---------------------------------------#
# Friendlier prompt text for some features; any feature not listed here is
# prompted for by its column name.
prompt_labels = {
    'Glucose': "Glucose Level",
    'BloodPressure': "Blood Pressure",
    'BMI': "BMI",
    'Age': "Age",
}

print("Enter the following values to predict diabetes status:")

# Ask for EVERY feature the models were trained on, in training column order.
# Collecting a fixed subset (previously only 4 values) makes scaler.transform()
# and predict() fail whenever the training frame has a different number of
# columns.
feature_names = list(X_train.columns)
user_values = [
    float(input(f"{prompt_labels.get(name, name)}: "))
    for name in feature_names
]

# Single input row, carrying the same column names as the training data so the
# scaler and the forest receive input in the layout they were fitted on.
user_data = pd.DataFrame([user_values], columns=feature_names)

# The SVM was trained on standardised features, so scale the row the same way.
user_data_scaled = scaler.transform(user_data)

# Predict with both models
rf_result = rf_model.predict(user_data)
svm_result = svm_model.predict(user_data_scaled)

# Display predictions (label 1 is reported as diabetic)
print("\n--- Prediction Results ---")
print("Random Forest:", "Diabetic" if rf_result[0] == 1 else "Not Diabetic")
print("SVM           :", "Diabetic" if svm_result[0] == 1 else "Not Diabetic")


In [None]:
# Persist the fitted objects with joblib (pickle-based files).
# joblib.dump does not create missing folders, so make sure the target
# directory exists first.
os.makedirs("models", exist_ok=True)

joblib.dump(svm_model, "models/svm_model.pkl")
joblib.dump(rf_model, "models/random_forest_model.pkl")
# The SVM was trained on standardised features, so the fitted scaler must be
# saved with it; without it the reloaded SVM cannot be given correctly scaled
# input.
joblib.dump(scaler, "models/scaler.pkl")
# NOTE: these are pickle files - only load them from a trusted source.