In [None]:
# Install the notebook's third-party dependencies with the %pip line magic.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
# Later cells call TPOTClassifier(verbosity=..., warm_start=...) and .export();
# confirm which TPOT release provides that API and pin it here — TODO confirm.
%pip install numpy scipy scikit-learn pandas joblib torch deap update_checker tqdm stopit xgboost 
%pip install dask[delayed] dask[dataframe] dask-ml 
%pip install scikit-mdr skrebate
%pip install tpot
# Plotting libraries used by the exploration cells.
%pip install seaborn matplotlib
%pip install setuptools

In [None]:
import pandas as pd
from scipy.io import arff

# Location of the ARFF file to read (resolved against the working directory)
arff_path = 'dataset'

# loadarff returns a pair; it is unpacked here into `data` and `meta`
data, meta = arff.loadarff(arff_path)

# Wrap `data` in a DataFrame for the pandas-based cells that follow
dataset = pd.DataFrame(data)

# Preview the top rows as a quick sanity check of the load
dataset.head()

# Data Exploration

In [None]:
# Count the missing values in each column of the loaded dataset
null_data = dataset.isna().sum()
null_data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Pairwise correlation between the columns of the dataset
corr = dataset.corr()

# Draw the matrix as an annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Correlation Heatmap of Diabetes Dataset')
plt.show()

In [None]:
# Take the 'Outcome' column of the correlation matrix, ordered ascending
correlation_matrix = dataset.corr()
outcome_corr = correlation_matrix['Outcome'].sort_values()

# Show each column's correlation with 'Outcome'
outcome_corr

In [None]:
# Drop columns whose correlation with 'Outcome' is weaker than 0.15 in magnitude.
# The absolute value is compared so that a strongly negative correlation
# (e.g. -0.6), which is as informative as a positive one, is not discarded.
weak_columns = outcome_corr[outcome_corr.abs() < 0.15].index
filtered_dataset = dataset.drop(columns=weak_columns)

# Show the new dataframe information. .info() prints and returns None, so it is
# called on its own line instead of being bundled into a tuple with the preview.
filtered_dataset.info()
filtered_dataset.head()

In [None]:
# Box plot of every remaining feature (target excluded) to make outliers visible
feature_frame = filtered_dataset.drop(columns=["Outcome"])

fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=feature_frame, palette="Set2", ax=ax)
ax.set_title('Box Plots for Numerical Features in the Diabetes Dataset')
plt.xticks(rotation=45)  # tilt the x labels so they stay readable
plt.show()

In [None]:
# Perform IQR capping on the feature columns that survived the correlation filter.
# The 'Outcome' target is deliberately left untouched: when one class makes up
# fewer than 25% of the rows its IQR is 0, and capping would overwrite every
# label of that class with the majority label.
feature_columns = [col for col in filtered_dataset.columns if col != 'Outcome']

Q1 = filtered_dataset[feature_columns].quantile(0.25)
Q3 = filtered_dataset[feature_columns].quantile(0.75)
IQR = Q3 - Q1

# Standard 1.5 * IQR fences, one value per feature column
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Capping the outliers: clip aligns the per-column bounds with the columns
# (axis=1) and limits both tails in a single pass.
capped_dataset = filtered_dataset.copy()
capped_dataset[feature_columns] = filtered_dataset[feature_columns].clip(
    lower=lower_bound, upper=upper_bound, axis=1
)

# NOTE(review): the fences are computed on the full dataset, before the
# train/test split performed later — confirm this is acceptable for evaluation.

# Provide basic descriptive statistics to compare the effect
capped_dataset.describe()

In [None]:
# Re-draw the feature box plots, this time on the capped data
capped_features = capped_dataset.drop(columns=["Outcome"])

fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=capped_features, palette="Set2", ax=ax)
ax.set_title('Box Plots for Numerical Features in the Diabetes Dataset')
plt.xticks(rotation=45)  # tilt the x labels so they stay readable
plt.show()

In [None]:
# Draw a 2x2 grid of scatter plots relating pairs of variables in the capped data
fig, axs = plt.subplots(2, 2, figsize=(14, 12))

# One entry per panel, in row-major order:
# (x column, y column, title, x-axis label, y-axis label)
panels = [
    ('Glucose', 'Outcome', 'Glucose Levels vs. Diabetes Outcome',
     'Glucose', 'Outcome (0 = No Diabetes, 1 = Diabetes)'),
    ('Age', 'Glucose', 'Age vs. Glucose Levels', 'Age', 'Glucose'),
    ('BMI', 'Glucose', 'BMI vs. Glucose Levels', 'BMI', 'Glucose'),
    ('Pregnancies', 'Age', 'Pregnancies vs. Age', 'Pregnancies', 'Age'),
]

# axs.flat walks the grid row by row, matching the order of `panels`
for ax, (x_col, y_col, title, x_label, y_label) in zip(axs.flat, panels):
    sns.scatterplot(data=capped_dataset, x=x_col, y=y_col, ax=ax, alpha=0.6)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


# TPOT Implementation

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns from the 'Outcome' target
X = capped_dataset.drop('Outcome', axis=1)
Y = capped_dataset['Outcome']

# 80/20 split. A fixed random_state makes the split (and so the reported test
# score) reproducible across kernel restarts; stratify=Y keeps the class
# proportions of 'Outcome' the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=42, stratify=Y
)


## Parameters for the TPOT search

In [None]:
# Size of the search
generations = 25
population_size = 250

# Number of cross-validation folds
cv = 10

# Variation rates; the crossover rate is whatever the mutation rate leaves over
mutation_rate = 0.7
crossover_rate = 1.0 - mutation_rate

# Run-time behaviour
random_state = 42
verbosity = 2
warm_start = False
n_jobs = 12

In [None]:
# Echo the configured values, one "label: value" line each, in a fixed order
run_settings = {
    "Generations": generations,
    "Population Size": population_size,
    "CV Folds": cv,
    "Mutation Rate": mutation_rate,
    "Crossover Rate": crossover_rate,
    "Random State": random_state,
    "Verbosity Level": verbosity,
    "Warm Start": warm_start,
    "Number of Jobs": n_jobs,
}
for label, value in run_settings.items():
    print(f"{label}: {value}")

In [None]:
from tpot import TPOTClassifier

# Collect the previously defined settings into one keyword mapping
tpot_settings = dict(
    generations=generations,
    population_size=population_size,
    cv=cv,
    mutation_rate=mutation_rate,
    crossover_rate=crossover_rate,
    random_state=random_state,
    verbosity=verbosity,
    warm_start=warm_start,
    n_jobs=n_jobs,
)
pipeline_optimizer = TPOTClassifier(**tpot_settings)

# Fit the optimizer on the training split
pipeline_optimizer.fit(X_train, y_train)

# Score the fitted optimizer on the held-out split and print the result
test_score = pipeline_optimizer.score(X_test, y_test)
print(test_score)

# Write the optimized pipeline out as a Python script
pipeline_optimizer.export('tpot_exported_pipeline_25_250_7.py')
