In [None]:
import pandas as pd

# Read the diabetes CSV (path is relative to the current working directory)
# into the DataFrame that the cells below operate on.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')


In [None]:
# Report the size of the dataset as a (rows, columns) tuple.
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)


In [None]:

# Count the missing (NaN/None) entries in each column.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)


In [None]:
# Descriptive statistics produced by DataFrame.describe() with its defaults.
summary_stats = df.describe()
print("Statistical summary:\n", summary_stats)


In [None]:
# Impute missing values with the mean of their own column.
# - numeric_only=True restricts the mean to numeric columns, so the call does
#   not raise if the file contains a non-numeric column (such columns are
#   simply left un-imputed).
# - The result is assigned back instead of using inplace=True, which keeps the
#   cell idempotent and avoids mutating an object other cells may have shown.
# NOTE(review): the means are computed over every row, i.e. before the
# train/test split performed later, so rows that end up in the test set
# contribute to the imputed values. Confirm this is acceptable.
df = df.fillna(df.mean(numeric_only=True))


In [None]:
# Separate the label from the predictors:
#   y -> the 'Outcome' column
#   X -> every other column of df
y = df['Outcome']
X = df.drop(columns=['Outcome'])


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing (70/30 split).
# - random_state=42 fixes the shuffle so the split is reproducible.
# - stratify=y preserves the class proportions of the target in both the
#   training and the testing partition, so the reported metrics are not skewed
#   by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, so nothing about
# the test set influences the transformation.
scaler = StandardScaler().fit(X_train)

# Apply the same training-set statistics to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report


In [None]:
# Instantiate the three classifiers that are compared below.
# The random forest draws bootstrap samples and feature subsets at random, so
# it is seeded (same seed as the train/test split) to make its scores
# reproducible across runs.
logistic_regression = LogisticRegression()
random_forest = RandomForestClassifier(random_state=42)
svm = SVC()

In [None]:
# Fit every classifier on the scaled training features and their labels.
for clf in (logistic_regression, random_forest, svm):
    clf.fit(X_train_scaled, y_train)

In [None]:

# Predict the test-set labels with each fitted model; the models are listed in
# the same order as the names on the left-hand side.
logistic_regression_preds, random_forest_preds, svm_preds = (
    fitted_model.predict(X_test_scaled)
    for fitted_model in (logistic_regression, random_forest, svm)
)

In [None]:
from sklearn.metrics import classification_report

# Print a classification report for each model's test-set predictions,
# preceded by a heading naming the model.
reports_to_print = (
    ("Logistic Regression:", logistic_regression_preds),
    ("Random Forest:", random_forest_preds),
    ("SVM:", svm_preds),
)
for heading, predicted_labels in reports_to_print:
    print(heading)
    print(classification_report(y_test, predicted_labels))


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the SVM.
# `gamma` is the kernel coefficient for the 'rbf', 'poly' and 'sigmoid'
# kernels; the linear kernel ignores it. The linear kernel therefore gets its
# own sub-grid, otherwise every linear candidate would be fitted once per
# gamma value with identical results.
c_values = [0.1, 1, 10, 100]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'C': c_values,
        'gamma': [0.1, 0.01, 0.001, 0.0001],
    },
]

# Exhaustive search with 5-fold cross-validation; n_jobs=-1 runs the fits in
# parallel on all available processors.
svm_grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
svm_grid_search.fit(X_train_scaled, y_train)

# Best combination found. It contains no 'gamma' entry when the linear kernel
# wins; the dict can still be unpacked directly into SVC(**best_params).
best_params = svm_grid_search.best_params_
print("Best Hyperparameters:", best_params)
# Output recorded from an earlier run (may not match the current data/split):
# Best Hyperparameters: {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}


In [None]:
# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so reuse that fitted estimator instead of training
# an identical SVC a second time.
svm_best = svm_grid_search.best_estimator_

# Evaluate the tuned SVM on the held-out test set.
best_preds = svm_best.predict(X_test_scaled)
print(classification_report(y_test, best_preds))