In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw dataset from the Colab working directory.
data = pd.read_csv(filepath_or_buffer="/content/diabetes_prediction_dataset.csv")

In [None]:
# Work on an independent copy so the frame loaded above is left untouched.
df = data.copy(deep=True)
df.head(n=5)

In [None]:
# Summarize column dtypes and non-null counts of the working frame.
df.info()

In [None]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Handle duplicates: drop fully duplicated rows in place, then re-count
# duplicates as a sanity check (the count should now be 0).
df.drop_duplicates(inplace=True)
df.duplicated().sum()

In [None]:
# Report the number of distinct (non-null) values in every column.
for column, distinct in df.nunique().items():
  print(f"Column: {column}, Distinct Values: {distinct}")

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

In [None]:
# Remove the rare 'Other' gender category (the original note quotes a share of
# 0.00195% -- TODO confirm against the data).
# .copy() makes the filtered result an independent frame: later cells assign
# into its columns, which on a bare boolean-indexed slice triggers
# SettingWithCopyWarning (currently hidden by warnings.filterwarnings).
df = df[df['gender'] != 'Other'].copy()

In [None]:
# Bar chart of how many rows fall into each diabetes class.
sns.countplot(data=df, x='diabetes')

In [None]:
import plotly.graph_objects as go

def create_diabetes_countplot(data):
    """Show an interactive bar chart of the class frequencies in ``data['diabetes']``.

    Parameters
    ----------
    data : DataFrame-like
        Must expose a 'diabetes' column; its ``value_counts()`` supplies the
        bar positions and heights.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    class_counts = data['diabetes'].value_counts()

    # One bar per class, labelled with its count; two fixed colours are supplied.
    bars = go.Bar(
        x=class_counts.index,
        y=class_counts.values,
        marker={'color': ['#1f77b4', '#ff7f0e']},
        text=class_counts.values,
        textposition='auto',
    )

    layout_options = {
        'title': 'Diabetes Count Distribution',
        'xaxis_title': 'Diabetes (0 = No, 1 = Yes)',
        'yaxis_title': 'Count',
        'template': 'plotly_white',
        'title_font': {'size': 20},
    }

    fig = go.Figure(bars)
    fig.update_layout(**layout_options)
    fig.show()

# Render the class-count bar chart for the working frame `df`.
create_diabetes_countplot(df)


In [None]:
# Histogram grid of the frame's columns on a 10x10 inch figure.
df.hist(figsize = (10,10))
plt.show()

In [None]:
import plotly.graph_objects as go

# Interactive histogram of the 'age' column.
age_histogram = go.Histogram(
    x=df['age'],
    nbinsx=30,
    marker={'color': '#1f77b4'},  # plain blue bars
    name='Age Distribution',
)

fig = go.Figure(data=[age_histogram])

# Titles, clean white template, and a gap between bars (bargap).
fig.update_layout(
    title='Age Distribution',
    xaxis_title='Age',
    yaxis_title='Count',
    template='plotly_white',
    title_font={'size': 20},
    bargap=0.2,
)

fig.show()

In [None]:
import plotly.graph_objects as go
import numpy as np

def create_age_distribution_plot(data):
    """Show a histogram of ``data['age']`` with a fitted normal curve on top.

    The overlay trace keeps its legend label 'KDE', but it is not a kernel
    density estimate: it is the normal density built from the sample mean and
    standard deviation of the column, rescaled to count units.

    Parameters
    ----------
    data : DataFrame-like
        Must expose a numeric 'age' column. Every value is read from this
        argument (the previous version drew the histogram from the global
        ``df`` and ignored ``data`` for that trace).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    ages = data['age']
    age_mean = ages.mean()
    age_std = ages.std()
    age_min = ages.min()
    age_max = ages.max()

    fig = go.Figure()

    # Histogram of the raw ages.
    fig.add_trace(go.Histogram(
        x=ages,
        nbinsx=30,
        marker=dict(color='#1f77b4'),
        name='Age Distribution',
        opacity=0.7
    ))

    # Normal density evaluated on 100 evenly spaced points across the observed range.
    x_values = np.linspace(age_min, age_max, 100)
    density = np.exp(-0.5 * ((x_values - age_mean) / age_std) ** 2) / (age_std * np.sqrt(2 * np.pi))

    # Convert density to expected counts: n * bin width, with the bin width
    # approximated as range / 30.
    # NOTE(review): plotly may choose fewer than `nbinsx` bins, in which case the
    # curve height is only approximate -- confirm against the rendered figure.
    count_scale = ages.count() * (age_max - age_min) / 30

    fig.add_trace(go.Scatter(
        x=x_values,
        y=density * count_scale,
        mode='lines',
        name='KDE',
        line=dict(color='#ff7f0e', width=2)
    ))

    # Titles, clean white template, and a gap between bars.
    fig.update_layout(
        title='Age Distribution',
        xaxis_title='Age',
        yaxis_title='Count',
        template='plotly_white',
        title_font=dict(size=20),
        bargap=0.2
    )

    fig.show()

# Render the age histogram with its overlay curve for the working frame `df`.
create_age_distribution_plot(df)

In [None]:
import plotly.graph_objects as go
import numpy as np

# Histogram of BMI with a normal-curve overlay (the overlay trace is labelled 'KDE').
bmi = df['bmi']

bmi_histogram = go.Histogram(
    x=bmi,
    nbinsx=30,
    marker={'color': '#1f77b4'},
    name='BMI Distribution',
    opacity=0.7,
)

# Normal density from the sample mean/std, evaluated on 100 points over the observed range.
x_values = np.linspace(bmi.min(), bmi.max(), 100)
z_scores = (x_values - bmi.mean()) / bmi.std()
kde_y = np.exp(-0.5 * z_scores ** 2) / (bmi.std() * np.sqrt(2 * np.pi))

# Rescale the density to count units: n * (range / 30).
bmi_curve = go.Scatter(
    x=x_values,
    y=kde_y * (bmi.count() * (max(bmi) - min(bmi)) / 30),
    mode='lines',
    name='KDE',
    line={'color': '#ff7f0e', 'width': 2},
)

fig = go.Figure(data=[bmi_histogram, bmi_curve])

fig.update_layout(
    title='BMI Distribution',
    xaxis_title='BMI',
    yaxis_title='Count',
    template='plotly_white',
    title_font={'size': 20},
    bargap=0.2,
)

fig.show()


In [None]:
# Row count per gender category.
sns.countplot(data=df, x='gender')
plt.title('Gender Distribution')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use the tidy white-grid background.
sns.set_style('whitegrid')

# Colour palette for the charts.
palette = sns.color_palette("Set2")

# One count plot per binary column, all drawn at the same figure size.
binary_columns = ('hypertension', 'heart_disease', 'diabetes')
for col in binary_columns:
    label = col.capitalize()
    plt.figure(figsize=(8, 4))
    sns.countplot(data=df, x=col, palette=palette)
    plt.title(f'{label} Distribution', fontsize=14, weight='bold')
    plt.xlabel(label, fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use the white-grid background.
sns.set_style('whitegrid')

# Colour palette for the bars.
palette = sns.color_palette("Set3")

# Count plot of the smoking-history categories.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='smoking_history', palette=palette)

# Title and axis labels.
plt.title('Smoking History Distribution', fontsize=16, weight='bold')
plt.xlabel('Smoking History', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)  # rotate and right-align the tick labels
plt.yticks(fontsize=12)

# Tighten the layout so the elements do not crowd each other, then render.
plt.tight_layout()
plt.show()


In [None]:
# Re-inspect dtypes and non-null counts after the filtering steps above.
df.info()

In [None]:
# Diabetes class counts split by gender.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='gender', hue='diabetes')
plt.title('Diabetes Distribution by Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.legend(title='Diabetes')
plt.show()

In [None]:

# Age against BMI, points coloured by diabetes status.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='age', y='bmi', hue='diabetes', palette='viridis')
plt.title('Age vs. BMI (colored by Diabetes)')
plt.xlabel('Age')
plt.ylabel('BMI')
plt.show()


# Blood glucose level distribution for each diabetes class.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='diabetes', y='blood_glucose_level', palette='Set2')
plt.title('Blood Glucose Level by Diabetes Status')
plt.xlabel('Diabetes')
plt.ylabel('Blood Glucose Level')
plt.show()


# HbA1c level per gender, each violin split by diabetes status.
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='gender', y='HbA1c_level', hue='diabetes', split=True, palette='Set3')
plt.title('HbA1c Level by Gender and Diabetes Status')
plt.xlabel('Gender')
plt.ylabel('HbA1c Level')
plt.show()

In [None]:
# Descriptive statistics for the numerical features.
numerical_features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# describe() is not the last expression of the cell, so a bare call produces no
# output; print it explicitly so the table is actually shown.
print("Descriptive Statistics:")
print(df[numerical_features].describe())

# Skewness and kurtosis of each numerical feature.
print("\nSkewness:")
print(df[numerical_features].skew())
print("\nKurtosis:")
print(df[numerical_features].kurt())

# Frequency distribution of every string-typed (object) column.
for column in df.select_dtypes(include=['object']).columns:
  print(f"\nFrequency Distribution of {column}:")
  print(df[column].value_counts())

In [None]:
# Re-inspect dtypes and non-null counts before the statistical tests.
df.info()

In [None]:
import statsmodels.formula.api as sm
import statsmodels.api as sma # Importing the statsmodels.api

# Fit an OLS model of blood glucose level on diabetes status and print its
# type-2 ANOVA table.
glucose_formula = 'blood_glucose_level ~ diabetes'
model = sm.ols(formula=glucose_formula, data=df).fit()
anova_table = sma.stats.anova_lm(model, typ=2)
print(anova_table)

In [None]:
# Correlation matrix of the numeric columns. This assignment previously existed
# only inside commented-out code, so the heatmap below failed with a NameError
# on a fresh kernel. numeric_only=True skips the string columns.
correlation_matrix = df.corr(numeric_only=True)

# Interactive correlation heatmap.
# Assumes plotly.graph_objects has been imported as `go` by an earlier cell.
fig = go.Figure(data=go.Heatmap(
                   z=correlation_matrix.values,
                   x=correlation_matrix.columns,
                   y=correlation_matrix.columns,
                   colorscale='Viridis',
                   text=correlation_matrix.values,  # cell annotations
                   texttemplate="%{text:.2f}",      # two decimals per cell
                   hoverinfo='x+y+z'))

fig.update_layout(
    title='Correlation Matrix Heatmap (Interactive)',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    width=800,
    height=600
)

fig.show()

In [None]:
# Numerical features inspected for outliers and distribution shape.
numerical_features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# First pass: one box plot per feature to expose outliers.
for feature in numerical_features:
  plt.figure(figsize=(8, 6))
  sns.boxplot(data=df, x=feature)
  plt.title(f'Box Plot of {feature}')
  plt.show()

# Second pass: histogram with a KDE overlay per feature.
for feature in numerical_features:
  plt.figure(figsize=(8, 6))
  sns.histplot(data=df, x=feature, kde=True)
  plt.title(f'Distribution of {feature}')
  plt.show()

In [None]:

# Flag rows lying outside the 1.5 * IQR fences, feature by feature.
for feature in numerical_features:
  values = df[feature]
  Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
  IQR = Q3 - Q1
  lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
  outliers = df[(values < lower_bound) | (values > upper_bound)]
  print(f"Outliers for {feature}:")
  print(outliers)


In [None]:
# Side-by-side box plots of the frame's columns on one axis.
plt.figure(figsize=(10, 6))
sns.boxplot(palette='Set3', data=df)
plt.title('Outlier Detection')
plt.show()


In [None]:
import plotly.express as px
import pandas as pd

# Reshape to long format (one row per column/value pair) for plotting.
# Only numeric columns are melted: at this point 'gender' and 'smoking_history'
# still hold strings, and mixing them into 'Values' would put text on the
# numeric axis of the box plot.
df_melt = df.select_dtypes(include='number').melt(var_name='Columns', value_name='Values')

# One box per column in a single figure.
fig = px.box(df_melt, x='Columns', y='Values', title="Boxplot of All Columns")
fig.show()

In [None]:
# Quartiles and interquartile range of blood glucose level.
glucose = df['blood_glucose_level']
Q1 = glucose.quantile(0.25)
Q3 = glucose.quantile(0.75)
IQR = Q3 - Q1

# Lower and upper limits at 1.5 * IQR beyond the quartiles.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose glucose value falls outside the limits.
outliers = df[(glucose < lower_bound) | (glucose > upper_bound)]

print(f"Outlier dəyərlər:\n{outliers}")

In [None]:
# Replace glucose values outside [lower_bound, upper_bound] with the column median.
# NOTE(review): the bounds come from the preceding cell; very high glucose readings
# may be informative for the diabetes target -- confirm this replacement is intended.
median_glucose = df['blood_glucose_level'].median()
outside_fences = (df['blood_glucose_level'] < lower_bound) | (df['blood_glucose_level'] > upper_bound)
df['blood_glucose_level'] = np.where(outside_fences, median_glucose, df['blood_glucose_level'])


In [None]:
# Re-inspect dtypes and non-null counts after the glucose replacement.
df.info()

In [None]:
def recategorize_smoking(smoking_status):
    """Collapse a raw smoking-history label into one of three groups.

    Mapping:
      * 'never', 'No Info'              -> 'non-smoker'
      * 'ever', 'former', 'not current' -> 'past_smoker'
      * 'current'                       -> 'current'

    Any other value -- including the already-collapsed labels 'non-smoker' and
    'past_smoker' -- is returned unchanged. The previous version fell through
    and returned None for such values, so applying the mapping a second time
    (for example by re-running the cell) silently wiped the column.

    NOTE(review): 'No Info' is treated as a non-smoker; confirm that is intended.

    Parameters
    ----------
    smoking_status : object
        A single value from the 'smoking_history' column.

    Returns
    -------
    object
        The collapsed label, or the input itself when it is not a raw label.
    """
    if smoking_status in ('never', 'No Info'):
        return 'non-smoker'
    if smoking_status in ('ever', 'former', 'not current'):
        return 'past_smoker'
    # 'current', already-collapsed labels and unknown values pass through as-is.
    return smoking_status

# Map every smoking-history value through recategorize_smoking.
df['smoking_history'] = df['smoking_history'].map(recategorize_smoking)

# Show how many rows ended up in each group.
smoking_counts = df['smoking_history'].value_counts()
print(smoking_counts)

In [None]:
def perform_one_hot_encoding(df, column_name):
    """Return a copy of ``df`` with ``column_name`` replaced by indicator columns.

    The indicator columns are named ``<column_name>_<value>`` and are appended
    after the remaining original columns. The input frame is not modified.
    """
    # Indicator columns for the chosen column, prefixed with its name.
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)

    # Everything except the encoded column, followed by the indicators.
    remaining_columns = df.drop(column_name, axis=1)
    encoded = pd.concat([remaining_columns, indicator_columns], axis=1)
    return encoded

# One-hot encode 'gender' and then 'smoking_history' of the raw frame.
# The result is stored under a new name instead of overwriting `data`:
# reassigning `data` dropped the source columns, so re-running the cell raised
# a KeyError and the raw frame could no longer be recovered.
# NOTE(review): nothing later in the visible notebook reads this encoded frame
# (the modelling pipeline one-hot encodes `df` itself) -- confirm whether this
# cell is still needed.
data_encoded = perform_one_hot_encoding(data, 'gender')
data_encoded = perform_one_hot_encoding(data_encoded, 'smoking_history')

In [None]:
# Class imbalance: absolute counts of each diabetes class.
diabetes_counts = df['diabetes'].value_counts()
print("Diabetes Class Counts:\n", diabetes_counts)

# Bar chart of the same counts.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='diabetes')
plt.title('Distribution of Diabetes')
plt.xlabel('Diabetes (0 = No, 1 = Yes)')
plt.ylabel('Count')
plt.show()

# Share of each class as a percentage of all rows.
n_rows = len(df)
diabetes_percentage = diabetes_counts / n_rows * 100
print("\nPercentage of Diabetes Classes:\n", diabetes_percentage)

In [None]:
# Import necessary libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Columns to standardize and columns to one-hot encode.
scaled_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'hypertension', 'heart_disease']
encoded_columns = ['gender', 'smoking_history']

# Preprocessor: StandardScaler on the first group, OneHotEncoder on the second.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), scaled_columns),
    ('cat', OneHotEncoder(), encoded_columns),
])

# Features are every column except the target; the target is 'diabetes'.
y = df['diabetes']
X = df.drop(columns='diabetes')

In [None]:
# Import necessary libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.pipeline import Pipeline as imbPipeline # Import imbPipeline from imblearn.pipeline
from sklearn.ensemble import RandomForestClassifier

# `over` and `under` were referenced by the pipeline but never defined anywhere
# in the notebook (NameError on a fresh kernel); define the resamplers here.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Preprocessor: standardize the numeric columns, one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level','hypertension','heart_disease']),
        ('cat', OneHotEncoder(), ['gender','smoking_history'])
    ])

# Resampling steps.
# NOTE(review): the sampling ratios are assumptions, not values recovered from the
# original notebook. A float strategy is the desired minority/majority ratio after
# resampling; SMOTE raises if the data is already more balanced than its target,
# so tune both numbers to the actual class ratio.
over = SMOTE(sampling_strategy=0.2, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

# Pipeline: preprocess -> oversample minority -> undersample majority -> classify.
# The imblearn Pipeline is needed because a plain sklearn Pipeline does not accept
# sampler steps. random_state makes the forest reproducible across runs.
clf = imbPipeline(steps=[('preprocessor', preprocessor),
                      ('over', over),
                      ('under', under),
                      ('classifier', RandomForestClassifier(random_state=42))])

In [None]:
# Random-forest hyperparameters to search and the candidate values for each.
rf_search_space = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Prefix each key with the pipeline step name so it targets the 'classifier' step.
param_grid = {f'classifier__{name}': values for name, values in rf_search_space.items()}

In [None]:
# Import necessary libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.pipeline import Pipeline as imbPipeline # Import imbPipeline from imblearn.pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV # Import GridSearchCV

# train_test_split was only imported in a later cell, so this cell raised a
# NameError on a fresh kernel; import it where it is first used.
from sklearn.model_selection import train_test_split

# Grid search over `param_grid` with 5-fold cross-validation.
# NOTE(review): no `scoring` is given, so the estimator's default score is used;
# for an imbalanced target a metric such as F1 or ROC AUC may be preferable.
grid_search = GridSearchCV(clf, param_grid, cv=5)

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit every parameter combination on the training split.
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best Parameters: ", grid_search.best_params_)

In [None]:
# prompt: # Evaluate the model
# andplot

# Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Best pipeline found by the grid search.
best_estimator = grid_search.best_estimator_

# Hard class predictions for the threshold-based metrics.
y_pred = best_estimator.predict(X_test)

# Predicted probability of the positive class (second probability column).
# ROC AUC is a ranking metric: feeding it hard 0/1 labels, as before, collapses
# the curve to a single operating point and misstates the score.
y_score = best_estimator.predict_proba(X_test)[:, 1]

# Evaluation metrics on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC:", roc_auc)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Annotated heatmap of the confusion matrix (integer cell labels).
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, cmap='Blues', annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Plot ROC curve
from sklearn.metrics import roc_curve, auc
# Positive-class probabilities -> ROC points -> area under the curve.
y_probs = best_estimator.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# ROC curve together with the diagonal reference line.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', linewidth=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()