# EDA

In [26]:
# Import important libraries + storing our data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

data = pd.read_csv("sample_data/diabetes_prediction_dataset.csv")

In [None]:
# Taking a quick look at the data we are dealing with
data

In [None]:
data.shape

# we got 100k rows with 9 columns (9 features)
# the data is kinda huge

In [None]:
# looking at the first 5 records

data.head()

In [None]:
# looking at the last 5 records

data.tail()

In [None]:
# listing all features to see them

list(data.columns)
# we can see multiple features , and the diabetes (our target) is the last one

In [None]:
# cheking the data type of each feature , checking the numerical/caterogical

data.dtypes


# quick notes :
# 1. age could've been int not float , because it doesnt need it
# 2. hypertension/heart_disease/diabetes , all these columns are stored 'int' as a numerical value , but in fact its not , those are yes/no question !

In [None]:
# lets see some quick stats on the data

data.describe()

In [None]:
# i wanna see how many patients has diabetes and how many does not from the data set we got
diabetes_counts = data['diabetes'].value_counts()
diabetes_counts

# so only 8500 from the 100k has diabetese , which is imbalance
# we must try to make it more balance

In [None]:
# looking if there are some missing values...

data.isna().sum()

# There are no missing values , great news l !

In [None]:
# Calculate correlation with diabetes for all numerical features
correlation_with_diabetes = data.corr(numeric_only=True)['diabetes'].sort_values(ascending=False)

# Display the correlation values
for feature, corr_value in correlation_with_diabetes.items():
    print(f"{feature:25s}: {corr_value:6.3f}")

# Create a visualization
plt.figure(figsize=(10, 6))
bars = plt.barh(correlation_with_diabetes.index, correlation_with_diabetes.values, color='steelblue')
plt.xlabel('Correlation Coefficient')
plt.title('Correlation of Features with Diabetes Diagnosis')
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)

# Add value labels to bars
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.01, bar.get_y() + bar.get_height()/2,
             f'{width:.3f}', ha='left', va='center')

plt.tight_layout()
plt.show()

# Now if the correlation is high (this means it has a string relation with the diabetes)
#, this means that the feature is important and affects the our target !
# we can see that the blood_glucose_level  affects the target the most

Now i will walk through each column , and make some stuff on it , stay w me :)



In [None]:
# visualize using bar char since its caterogical, so we could easily check

data['gender'].value_counts().plot(kind='bar')

plt.title('Gender Distribution')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()

In [None]:
# Create a cross-tabulation (contingency table)
cross_tab = pd.crosstab(data['gender'], data['diabetes'])

# Plot the stacked bar chart
cross_tab.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Relationship between gender and diabetes')
plt.xlabel('Categories in Column1')
plt.ylabel('Count')
plt.legend(title='Column2')
plt.show()

#from the shape , the gender doesnt really matter !

In [None]:
# lets go the Age now , which will be more intresting ig
# lets make the quick stats on it
data["age"].describe()

# '''
# 1. the average age is 41 , not that old
# 2. max age is 80 , which is ok
# 3. the min age is 0.08 (8 months) which seems to be an outlier


In [None]:
# taking a quick look at the age distribution

plt.figure(figsize=(10, 6))
sns.histplot(data['age'], kde=True, bins=30)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# the data has a small left skewness as we can see
# we can see that there is so many people at age 80 , which is obviously not an outlier

In [None]:
# take a look at the skewness
from scipy.stats import skew, kurtosis
age_skew = skew(data['age'])
print(f"Skewness: {age_skew:.3f}")

# it actually has a very small negative skewness (more young people)

In [None]:
# outlier detection with IQR
Q1 = data['age'].quantile(0.25)
Q3 = data['age'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = data[(data['age'] < lower_bound) | (data['age'] > upper_bound)]
print(f"Number of outliers: {len(outliers)}")

# we got 0 outliers , which was kinda clear from the disribtuion shape

In [None]:
# lets move to next feature (hypertension)
# lets see how many patients has hypertension
hypertension_counts = data['hypertension'].value_counts()
hypertension_counts

# so we got ~7500 out of 100k has hypertension

In [None]:
# bar char , same as previous but more clear to see

plt.figure(figsize=(8, 5))
sns.countplot(x='hypertension', data=data)
plt.title('Distribution of Hypertension Cases')
plt.xlabel('Hypertension (0 = No, 1 = Yes)')
plt.ylabel('Count')
plt.show()

In [None]:
# correlation with the target (diabetes)
hypertension_diabetes_corr = data['hypertension'].corr(data['diabetes'])
hypertension_diabetes_corr

# its positive , but not strong relation

In [None]:
# lets see how much does it affect (by actual numbers)
hypertension_diabetes = pd.crosstab(data['hypertension'], data['diabetes'])
hypertension_diabetes

# hmm , not much actually , as expected from the correlation !

In [None]:
# the heart_disease has almost the same as the hypertension , so i will skip it :)
# lets move to smoking_history and see whats going on here !
data['smoking_history'].unique()


In [None]:
# DISTRIBUTION ACROSS CATEGORIES
# Why: See how many people fall into each smoking categor
smoking_counts = data['smoking_history'].value_counts()
smoking_counts


In [None]:
# ISUALIZE DISTRIBUTION
# Why: Get a clear visual understanding of the smoking categories
plt.figure(figsize=(12, 6))
sns.countplot(y='smoking_history', data=data, order=data['smoking_history'].value_counts().index)
plt.title('Distribution of Smoking History Categories')
plt.xlabel('Count')
plt.ylabel('Smoking History')
plt.show()

In [None]:
# RELATIONSHIP WITH DIABETES
# Why: Understand how smoking history relates to diabetes risk
diabetes_by_smoking = data.groupby('smoking_history')['diabetes'].mean() * 100

diabetes_by_smoking.round(2)



In [None]:
# VISUALIZE DIABETES RELATIONSHIP
# why: See the pattern of diabetes risk across smoking categories
plt.figure(figsize=(12, 6))
sns.barplot(x='smoking_history', y='diabetes', data=data, errorbar=None)
plt.title('Diabetes Prevalence by Smoking History')
plt.xlabel('Smoking History')
plt.ylabel('Diabetes Prevalence')
plt.xticks(rotation=45)
plt.show()

# we can clearly see that the formers are the most who have diabetes which makes sense (in a medical way)

In [None]:
# now lets move on bmi
# BASIC STATISTICS
# Why: Understand the distribution of BMI values in your dataset
data['bmi'].describe()

# now the average healthy bmi is 18-25 !
# we got 10 min , which is wierd !
# and the max is 95 !! its kinda obvious that this is an outlier

In [None]:
# taking a deep look at the bmi distribution

plt.figure(figsize=(10, 6))
sns.histplot(data['bmi'], kde=True, bins=30)
plt.title('bmi Distribution')
plt.xlabel('bmi')
plt.ylabel('Frequency')
plt.show()

# we can surely tell that there are some outliers hiding here !

In [None]:
# see the shape of the BMI distribution using box plot
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 2)
sns.boxplot(x=data['bmi'])
plt.title('BMI Box Plot')

plt.tight_layout()
plt.show()

# and holyy outleirs

In [None]:
# outlier detection with IQR
Q1 = data['bmi'].quantile(0.25)
Q3 = data['bmi'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = data[(data['bmi'] < lower_bound) | (data['bmi'] > upper_bound)]
print(f"Number of outliers: {len(outliers)}")

# aaand as its expected , theres alot of outliers , and we must deal with them !

In [None]:
# lets move to the next featrue (HbA1c_level)
# take a quick look at the stats
data["HbA1c_level"].describe()

In [None]:
# see the shape of the HbA1c distribution
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.histplot(data['HbA1c_level'], bins=30)
plt.title('HbA1c Distribution')
plt.xlabel('HbA1c (%)')

plt.tight_layout()
plt.show()

# i cant really decide anything based on this shape

In [None]:
# lets do a box plot

plt.subplot(1, 2, 2)
sns.boxplot(x=data['HbA1c_level'])
plt.title('HbA1c Box Plot')
plt.xlabel('HbA1c (%)')

plt.tight_layout()
plt.show()

# i can see some outliers


In [None]:
# outlier detection with IQR
Q1 = data['HbA1c_level'].quantile(0.25)
Q3 = data['HbA1c_level'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = data[(data['HbA1c_level'] < lower_bound) | (data['HbA1c_level'] > upper_bound)]
print(f"Number of outliers: {len(outliers)}")

# there are 1315 outlier !
# i asked gpt , he said its normal because the person who got diabetes will have > 6.5%
# so i will see what i should do in the proccessing phase

In [None]:
# calculate correlation
hba1c_diabetes_corr = data['HbA1c_level'].corr(data['diabetes'])
hba1c_diabetes_corr

# they got a strong relation

In [None]:
# lets move to the next featrue (blood_glucose_level)
# take a quick look at the stats
data["blood_glucose_level"].describe()

In [None]:
# see the shape of the blood_glucose_level distribution
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.histplot(data['blood_glucose_level'], bins=30)
plt.title('blood_glucose_level Distribution')
plt.xlabel('blood_glucose_level')

plt.tight_layout()
plt.show()

# i cant really decide anything based on this shape also

In [None]:
# lets do a box plot

plt.subplot(1, 2, 2)
sns.boxplot(x=data['blood_glucose_level'])
plt.title('blood_glucose_level Box Plot')
plt.xlabel('blood_glucose_level ')

plt.tight_layout()
plt.show()

# i can see some outliers here also

In [None]:
# outlier detection with IQR
Q1 = data['blood_glucose_level'].quantile(0.25)
Q3 = data['blood_glucose_level'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = data[(data['blood_glucose_level'] < lower_bound) | (data['blood_glucose_level'] > upper_bound)]
print(f"Number of outliers: {len(outliers)}")

# there are 2038 outlier !
# i need to go deep on the outliers  , and check if they are really outliers or not !!

In [None]:
# BASIC STATS OF THE OUTLIERS THEMSELVES
print("=== BASIC OUTLIER ANALYSIS ===")
print(f"Number of outliers: {len(outliers)}")
print(f"Percentage of total data: {(len(outliers)/len(data))*100:.2f}%")
print("\n")

# CHECK WHERE THEY ARE (High or Low)
print("=== OUTLIER DIRECTION ===")
high_outliers = outliers[outliers['blood_glucose_level'] > upper_bound]
low_outliers = outliers[outliers['blood_glucose_level'] < lower_bound]
print(f"High outliers (> {upper_bound:.1f}): {len(high_outliers)}")
print(f"Low outliers (< {lower_bound:.1f}): {len(low_outliers)}")
print("\n")

# CHECK THEIR DIABETES STATUS
print("=== DIABETES STATUS OF OUTLIERS ===")
print("Diabetes prevalence in outliers:")
print(outliers['diabetes'].value_counts())


# alr alr alr , these are not outliers , and i asked gpt again , as we can see , all outliers has diabetes !
# , he said > 247.5 is very normal blood glucose level for a person who have diabetes !
# thats a great sign !

# Data Proccessing

Now based on the EDA , i will be doing a few things here !

## 1. Removing Duplicated

We remove duplicated rows , to avoid overfitting !

In [34]:
duplicates = data.duplicated().sum()
data = data.drop_duplicates()

## 2. Fixing some Data types

In [36]:
data['age'] = data['age'].astype(int)

binary_columns = ['hypertension', 'heart_disease', 'diabetes']
for col in binary_columns:
    data[col] = data[col].astype('category')


## 3. Dealing with outliers

In [37]:
reasonable_min_age = 1
reasonable_max_age = 120

age_outliers = data[(data['age'] < reasonable_min_age) | (data['age'] > reasonable_max_age)]

data['age'] = data['age'].clip(lower=reasonable_min_age, upper=reasonable_max_age)

reasonable_min_bmi = 12
reasonable_max_bmi = 60

bmi_outliers = data[(data['bmi'] < reasonable_min_bmi) | (data['bmi'] > reasonable_max_bmi)]

data['bmi'] = data['bmi'].clip(lower=reasonable_min_bmi, upper=reasonable_max_bmi)


## 4. Create a new Features

In [38]:
data['glucose_high'] = (data['blood_glucose_level'] > 200).astype(int)

glucose_high_counts = data['glucose_high'].value_counts()

glucose_high_corr = data['glucose_high'].corr(data['diabetes'])

data['diabetes_by_hba1c'] = (data['HbA1c_level'] >= 6.5).astype(int)

accuracy = (data['diabetes_by_hba1c'] == data['diabetes']).mean()

disagreements = data[data['diabetes_by_hba1c'] != data['diabetes']]


# Model Training

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

data_encoded = pd.get_dummies(data, columns=["gender", "smoking_history"])
data_encoded.dropna(inplace=True)
X = data_encoded.drop("diabetes", axis=1)
y = data_encoded["diabetes"]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(x_train, y_train)


In [28]:
y_predict = clf.predict(x_test)
type(y_predict)

numpy.ndarray

In [29]:
pd.DataFrame(y_predict).to_csv('predictied.csv')

# Evaluate

In [30]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_predict)

0.97015

In [31]:
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_predict).ravel()

In [32]:
print(f'True Negative: {tn}')
print(f'False Positive: {fp}')
print(f'False Negative: {fn}')
print(f'True Positive: {tp}')

True Negative: 18225
False Positive: 67
False Negative: 530
True Positive: 1178
