# Import the data analytics libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
from sklearn.svm import SVC

In [6]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [7]:
from sklearn.feature_selection import RFE

# Load the whole dataset

In [8]:
# NOTE(review): absolute, machine-specific Windows path. The recorded output
# right below this cell shows it failing with FileNotFoundError; point this at
# the local copy of diabetes.csv before running the rest of the notebook.
df = pd.read_csv('F:/Team Tech Flu/DDS MODEL/Dataset/diabetes.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'F:/Team Tech Flu/DDS MODEL/Dataset/diabetes.csv'

In [None]:
# Display the loaded DataFrame as this cell's output.
df

# Analyze the data to check for null values

In [None]:
# Count the missing entries in each column.
df.isnull().sum()

# Information about the data

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

In [None]:
# Preview the first rows of the dataset (head() defaults to five rows).
df.head()

# Summary statistics of the data: count, mean, std, min, quartiles and max

In [None]:
# Show per-column descriptive statistics.
df.describe()

# Divide input columns and output columns

In [None]:
# Features: every column except the label.
X = df.drop(columns=['Outcome'])
# Target: the label column on its own.
Y = df.loc[:, 'Outcome']

In [None]:
# Display the feature matrix as this cell's output.
X

In [None]:
# Display the target column as this cell's output.
Y

# Normalization: Rescale the features with Min-Max scaling.

In [None]:
# Normalize all features using Min-Max Scaler
# NOTE(review): the scaler is fitted on the full dataset (train and test rows
# together), and `X_scaled` is not referenced by any later cell visible in
# this file.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split: Split the dataset into training and testing sets

In [None]:
# Split first (stratified on the label, fixed seed) so the held-out rows never
# influence any preprocessing statistics.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

# Scale the splits here: the split above works on the raw `X`, so without this
# step no scaling would reach the model. The scaler is fitted on the training
# rows only and its min/max are reused for the test rows to avoid leaking test
# information. Results are wrapped back into DataFrames so column names and
# row indices are preserved for later cells.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
def remove_outliers_iqr(data, factor=1.5):
    """Drop every row that holds an IQR outlier in any numeric column.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR`` of its own column, where ``Q1``/``Q3`` are the 25th
    and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        data: pandas DataFrame to filter. Non-numeric columns are kept in the
            result but are ignored when looking for outliers.
        factor: Multiplier applied to the IQR to build the fences. The
            default of 1.5 is the conventional Tukey fence.

    Returns:
        A DataFrame with the same columns as ``data`` containing only the
        rows without outliers; the original index labels are preserved.
    """
    # Quantiles and ordering comparisons are only meaningful for numeric
    # columns; text/categorical columns would make them raise.
    numeric = data.select_dtypes(include='number')

    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1

    # Define bounds for outliers
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    # A row is dropped as soon as one of its numeric values is out of bounds.
    is_outlier = ((numeric < lower_bound) | (numeric > upper_bound)).any(axis=1)
    return data[~is_outlier]

# Remove outliers
# NOTE(review): this filters the full `df` (label column included) after the
# train/test split has already been made, and `diabetes_data_cleaned` is not
# referenced by any later cell visible in this file, so it has no effect on
# the model trained below.
diabetes_data_cleaned = remove_outliers_iqr(df)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Apply PCA
# Project the features onto 5 principal components. The projection is fitted
# on the training split only and then applied unchanged to the test split.
pca = PCA(n_components=5)  # You can choose the number of components
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Feature Engineering:
#### Feature Selection using RFE: Use Recursive Feature Elimination to select the best features.

In [None]:
# Linear-kernel SVM that serves as the estimator passed to RFE.
model = SVC(kernel='linear')

In [None]:
# NOTE(review): n_features_to_select=5 equals the 5 components produced by the
# PCA step, so RFE keeps every component and eliminates nothing; choose a
# smaller value if actual feature selection is intended.
rfe = RFE(estimator=model, n_features_to_select=5)

In [None]:
# Fit RFE on the PCA-transformed training data; `fit.support_` is used by the
# following cells as the boolean mask of retained components.
fit = rfe.fit(X_train_pca, Y_train)

In [None]:
# Translate the RFE boolean mask into the positions of the kept PCA components.
component_positions = np.arange(X_train_pca.shape[1])
selected_features = component_positions[fit.support_]
print("Selected PCA Features: ", selected_features)

In [None]:
# Train the final linear SVM on the RFE-retained PCA components only.
selected_train = X_train_pca[:, fit.support_]
classifier = SVC(kernel='linear')
classifier.fit(selected_train, Y_train)

In [None]:
# Predict the test labels from the same subset of PCA components used in training.
selected_test = X_test_pca[:, fit.support_]
Y_pred = classifier.predict(selected_test)

In [None]:
# Compute all test-set metrics first, then report them.
confusion = confusion_matrix(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)

print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", report)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Draw the confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
class_labels = ['No Diabetes', 'Diabetes']
plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()