In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    paths_in_folder = [os.path.join(folder, name) for name in names]
    for file_path in paths_in_folder:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory into a DataFrame.
csv_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(csv_path)

# Descriptive Statistics

In [None]:
# Preview the first few rows to check that the file was parsed as expected.
data.head()

In [None]:
# Print the frame summary (column names, dtypes, non-null counts) to spot missing values.
data.info()

In [None]:
# Columns analysed as numerical features in the statistics and plots below.
numerical_columns = ['age', 'avg_glucose_level', 'bmi']

In [None]:
# Report mean, median and mode for each numerical column, one block per column.
for col_name in numerical_columns:
    values = data[col_name]
    # mode() can return several values on a tie; only the first returned one is shown.
    first_mode = values.mode().values[0]
    summary_lines = [
        f"Column: {col_name}",
        f"Mean: {values.mean():.2f}",
        f"Median: {values.median():.2f}",
        f"Mode: {first_mode}\n",
    ]
    print("\n".join(summary_lines))

In [None]:
# Frequency of each smoking-status category.
data["smoking_status"].value_counts()

In [None]:
# Frequency table for each categorical / binary column, each preceded by a blank line.
print("\nCount of Categorical Variables:")
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
    'hypertension',
    'heart_disease',
    'stroke',
]
for categorical_column in categorical_columns:
    print("\n", data[categorical_column].value_counts())

In [None]:
# Count plots of each categorical feature, with bars split by the stroke outcome.
# Each entry pairs the column to plot with the title of its figure.
countplot_specs = [
    ('gender', 'Gender vs. Stroke'),
    ('work_type', 'Work Type vs. Stroke'),
    ('ever_married', 'Ever Married vs. Stroke'),
    ('Residence_type', 'Residence Type vs. Stroke'),
    ('smoking_status', 'Smoking Status vs. Stroke'),
    ('hypertension', 'Hypertension vs. Stroke'),
    ('heart_disease', 'Heart Disease vs. Stroke'),
]

for feature, plot_title in countplot_specs:
    # One small figure per feature so the plots render one below the other.
    plt.figure(figsize=(5, 3))
    sns.countplot(x=feature, hue='stroke', data=data)
    plt.title(plot_title)
    plt.show()

In [None]:
# Box plots of the numerical columns, drawn side by side in a single figure.
numeric_frame = data[numerical_columns]
plt.figure(figsize=(8, 6))
sns.boxplot(data=numeric_frame)
plt.title('Box Plot for Numerical Variables')
plt.show()

> #  **Range**

In [None]:
# Extremes of the age column and the range (max - min) between them.
age_max = data['age'].max()
age_min = data['age'].min()
print('max Age :', age_max)
print('Min Age :', age_min)
print('Range :', age_max - age_min)

## Inference
-------
The ages in the dataset span a wide range, from a minimum of 0.08 years (approximately 1 month) to a maximum of 82.0 years. The resulting range of 81.92 years indicates significant variability in the ages of the individuals represented in the dataset.

This diverse age range is important for understanding the potential impact of age on stroke prediction. The dataset includes individuals ranging from infants to elderly adults, encompassing various life stages and age-related health conditions. Analyzing stroke occurrences across such a broad age spectrum can provide valuable insights into the relationship between age and stroke risk.

> #  **Interquartile Range (IQR)**

In [None]:
 # Interquartile range of BMI: third quartile minus first quartile.
 bmi_values = data['bmi']
 bmi_upper_quartile = bmi_values.quantile(0.75)
 bmi_lower_quartile = bmi_values.quantile(0.25)
 bmi_upper_quartile - bmi_lower_quartile

## Inference
-------
The interquartile range (IQR) for the 'bmi' (Body Mass Index) variable in the dataset is approximately 9.60 units. The IQR is a measure of statistical dispersion and represents the range of values that encompass the middle 50% of the data.

A larger IQR indicates greater variability or spread in the data. In this context, a 9.60-unit IQR for BMI suggests significant variation in BMI values among the individuals represented in the dataset.

The IQR is useful for understanding the central distribution of BMI values and helps identify potential outliers or extreme values that lie outside the middle 50% range. By focusing on the middle 50% of the data, the IQR provides a more robust representation of the typical BMI values observed in the dataset, minimizing the influence of extreme values.

> #  **Variance**

In [None]:
# Spread of the average glucose level, measured by the variance from Series.var().
glucose_levels = data['avg_glucose_level']
avg_glucose_level_variance = glucose_levels.var()
print(f"Variance of avg_glucose_level: {avg_glucose_level_variance:.2f}")

## Inference
-------
The variance of the 'avg_glucose_level' variable in the dataset is calculated to be approximately 2050.60. Variance is a statistical measure that quantifies the spread or dispersion of values in a dataset around the mean.

In the context of 'avg_glucose_level', a variance of 2050.60 suggests significant variability in the glucose level values among the individuals represented in the dataset. This variability may be indicative of diverse glucose level patterns across the population and can be relevant for understanding the distribution of glucose levels and potential implications for health and disease risk.

In [None]:
# Spread of BMI, measured by the variance from Series.var().
bmi_values = data['bmi']
bmi_variance = bmi_values.var()
print(f"Variance of bmi: {bmi_variance:.2f}")

## Inference 
------
The variance of the 'bmi' (Body Mass Index) variable in the dataset is calculated to be approximately 61.69. Variance is a statistical measure that quantifies the spread or dispersion of values in a dataset around the mean.

For the 'bmi' variable, a variance of 61.69 indicates that there is notable variability in BMI values among the individuals represented in the dataset. This variability suggests that there are diverse body mass index values, reflecting different body compositions and weights within the population under study.

> #  **Standard Deviation**

In [None]:
# Standard deviation of the average glucose level (same units as the column itself).
glucose_levels = data['avg_glucose_level']
avg_glucose_level_standard_deviation = glucose_levels.std()
print(f"standard deviation of avg_glucose_level: {avg_glucose_level_standard_deviation:.2f}")

## Inference
-------
The standard deviation of the 'avg_glucose_level' variable in the dataset is calculated to be approximately 45.28. The standard deviation is a statistical measure that quantifies the dispersion or variability of values in a dataset around the mean.

For the 'avg_glucose_level' variable, a standard deviation of 45.28 indicates that the glucose level values among the individuals in the dataset exhibit considerable variability around the mean glucose level. This variability suggests that there is a notable spread of glucose levels across the population under study.

A higher standard deviation in 'avg_glucose_level' implies that the glucose level values are more widely scattered from the mean, indicating a broader range of glucose levels among the individuals. This variability is important for understanding the distribution of glucose levels and may have clinical implications, particularly in the context of health conditions related to glucose regulation, such as diabetes or metabolic disorders.

In [None]:
# Standard deviation of BMI (same units as the column itself).
bmi_values = data['bmi']
bmi_standard_deviation = bmi_values.std()
print(f"standard deviation of bmi: {bmi_standard_deviation:.2f}")

> #  **Coefficient of Variation**

In [None]:
# Coefficient of variation = standard deviation / mean, shown as a ratio and as a percentage.
glucose_levels = data['avg_glucose_level']
glucose_cv = glucose_levels.std() / glucose_levels.mean()
print('coefficient of variation of avg_glucose_level:', glucose_cv)
print('% of variation:', glucose_cv * 100)

## Inference
-----
The coefficient of variation and percentage of variation for the 'avg_glucose_level' variable in the dataset are calculated to be approximately 0.43 and 42.66%, respectively.

The coefficient of variation (CV) is a statistical measure that represents the relative variability of a dataset in relation to its mean. It is calculated by dividing the standard deviation by the mean. In this case, the coefficient of variation for 'avg_glucose_level' is approximately 0.43, indicating that the standard deviation is 43% of the mean glucose level. A higher CV value suggests a relatively higher dispersion or spread of values around the mean, indicating greater variability in glucose level measurements.

These measures are useful in assessing the consistency and stability of glucose level measurements within the dataset. A higher coefficient of variation and percentage of variation can imply that the glucose levels among individuals have significant variability around the mean, indicating diverse glucose level patterns.

In [None]:
# Coefficient of variation = standard deviation / mean, shown as a ratio and as a percentage.
bmi_values = data['bmi']
bmi_cv = bmi_values.std() / bmi_values.mean()
print('coefficient of variation of bmi:', bmi_cv)
print('% of variation:', bmi_cv * 100)

> #  **Describe**

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns.
data.describe(include='all')

> #  **Skewness**

In [None]:
# Skewness of every numeric column.
# numeric_only=True is needed because the frame also holds categorical text
# columns (e.g. gender, work_type): since pandas 2.0 the default is
# numeric_only=False, and DataFrame.skew() then raises on those columns.
data.skew(numeric_only=True)

## Inference
-----
The values provided represent the skewness of different variables in the dataset. Skewness is a measure of the asymmetry of the probability distribution of a variable. A positive skewness value indicates that the distribution is skewed to the right (tail is longer on the right side), while a negative skewness value indicates a left-skewed distribution (tail is longer on the left side). A skewness value close to zero indicates a relatively symmetric distribution.

Here are the inferences for each variable based on their skewness values:

id: The 'id' variable has a skewness value of approximately -0.02, meaning its distribution is essentially symmetric. Because 'id' is only a record identifier, its skewness carries no analytical meaning and can be ignored.

age: The 'age' variable has a skewness value of approximately -0.14, suggesting a very slightly left-skewed distribution. This implies a slightly longer tail toward younger ages, with the bulk of individuals concentrated at middle and older ages.

hypertension: The 'hypertension' variable has a skewness value of approximately 2.72, indicating a highly right-skewed distribution. This suggests that there might be a higher proportion of individuals without hypertension compared to those with hypertension in the dataset.

heart_disease: The 'heart_disease' variable has a skewness value of approximately 3.95, indicating a highly right-skewed distribution. This suggests that there might be a higher proportion of individuals without heart disease compared to those with heart disease in the dataset.

avg_glucose_level: The 'avg_glucose_level' variable has a skewness value of approximately 1.57, indicating a moderately right-skewed distribution. This suggests that the dataset might have a concentration of lower average glucose levels with a tail extending towards higher glucose levels.

bmi: The 'bmi' (Body Mass Index) variable has a skewness value of approximately 1.06, indicating a moderately right-skewed distribution. This suggests that the dataset might have a concentration of lower BMI values with a tail extending towards higher BMI values.

stroke: The 'stroke' variable (target variable) has a skewness value of approximately 4.19, indicating a highly right-skewed distribution. This suggests that there might be a higher proportion of individuals without stroke compared to those with stroke in the dataset.

In [None]:
# Skewness restricted to the numerical feature columns.
numeric_frame = data[numerical_columns]
skewness = numeric_frame.skew(numeric_only=True)
print("\nSkewness:", skewness, sep="\n")

> #  **Kurtosis**

In [None]:
# Kurtosis of every numeric column.
# numeric_only=True is needed because the frame also holds categorical text
# columns (e.g. gender, work_type): since pandas 2.0 the default is
# numeric_only=False, and DataFrame.kurt() then raises on those columns.
data.kurt(numeric_only=True)

## Inference
-----
The values provided represent the kurtosis of different variables in the dataset. Kurtosis is a measure of how heavy the tails of a variable's distribution are (often loosely described as its peakedness or flatness) compared to a normal distribution. pandas reports excess kurtosis (Fisher's definition), for which a normal distribution scores zero. A positive kurtosis value indicates a relatively more peaked distribution with heavier tails, while a negative kurtosis value indicates a flatter distribution with lighter tails. A kurtosis value close to zero indicates a distribution similar to a normal distribution.

Here are the inferences for each variable based on their kurtosis values:

id: The 'id' variable has a kurtosis value of approximately -1.21, indicating a slightly flatter distribution compared to a normal distribution. This implies that the distribution of 'id' values has lighter tails and is less peaked.

age: The 'age' variable has a kurtosis value of approximately -0.99, indicating a distribution that is slightly flatter than a normal distribution. This suggests that the distribution of ages in the dataset has lighter tails and is less peaked.

hypertension: The 'hypertension' variable has a kurtosis value of approximately 5.38, indicating a distribution that is more peaked with heavier tails compared to a normal distribution. Because hypertension is a binary (0/1) variable, this high value reflects class imbalance (relatively few individuals have hypertension) rather than genuinely extreme values.

heart_disease: The 'heart_disease' variable has a kurtosis value of approximately 13.59, indicating a highly peaked distribution with very heavy tails. Because heart disease status is a binary (0/1) variable, this very high value reflects strong class imbalance (few individuals have heart disease) rather than genuinely extreme values.

avg_glucose_level: The 'avg_glucose_level' variable has a kurtosis value of approximately 1.68, indicating a distribution that is slightly more peaked with heavier tails compared to a normal distribution. This suggests that the distribution of average glucose levels has some extreme values.

bmi: The 'bmi' (Body Mass Index) variable has a kurtosis value of approximately 3.36, indicating a distribution that is more peaked with heavier tails compared to a normal distribution. This suggests that the distribution of BMI values has more extreme values.

stroke: The 'stroke' variable (target variable) has a kurtosis value of approximately 15.59, indicating a highly peaked distribution with very heavy tails. Because stroke occurrence is a binary (0/1) variable, this very high value reflects strong class imbalance (stroke cases are rare in the dataset) rather than genuinely extreme values.

In [None]:
# Kurtosis restricted to the numerical feature columns.
numeric_frame = data[numerical_columns]
Kurtosis = numeric_frame.kurt(numeric_only=True)
print(Kurtosis)

> #  **Correlation**

In [None]:
# Pairwise correlations between the numerical features and the stroke target.
correlation_columns = [*numerical_columns, 'stroke']
correlation_matrix = data[correlation_columns].corr()
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")

## Inference
-----
The correlation matrix shown above provides valuable insights into the relationships between different numerical variables in the dataset. Correlation measures the strength and direction of linear relationships between pairs of variables. The values in the matrix range from -1 to 1, where:

- A positive value indicates a positive correlation, meaning that when one variable increases, the other tends to increase as well.
- A negative value indicates a negative correlation, meaning that when one variable increases, the other tends to decrease.
- A value close to 0 indicates a weak or no linear correlation between the variables.

Here are the inferences based on the correlation matrix:

Age vs. Avg_glucose_level: The correlation coefficient between 'age' and 'avg_glucose_level' is approximately 0.24, indicating a weak positive correlation. This suggests that there is a slight tendency for glucose levels to increase slightly with age. However, the correlation is not strong, indicating that age is not a major predictor of average glucose level in the dataset.

Age vs. BMI: The correlation coefficient between 'age' and 'bmi' is approximately 0.33, indicating a weak positive correlation. This suggests that there is a slight tendency for BMI to increase slightly with age. However, as with 'avg_glucose_level', the correlation is not strong, indicating that age is not a major predictor of BMI in the dataset.

Age vs. Stroke: The correlation coefficient between 'age' and 'stroke' is approximately 0.25, indicating a weak positive correlation. This suggests that there is a slight tendency for stroke occurrence to increase slightly with age. Again, the correlation is not strong, indicating that age is not the sole determinant of stroke risk in the dataset.

Avg_glucose_level vs. BMI: The correlation coefficient between 'avg_glucose_level' and 'bmi' is approximately 0.18, indicating a weak positive correlation. This suggests that there is a slight tendency for average glucose levels to increase slightly with BMI. However, the correlation is not strong, indicating that BMI is not a major predictor of average glucose level in the dataset.

Avg_glucose_level vs. Stroke: The correlation coefficient between 'avg_glucose_level' and 'stroke' is approximately 0.13, indicating a weak positive correlation. This suggests that there is a slight tendency for stroke occurrence to increase slightly with higher average glucose levels. However, the correlation is not strong, indicating that average glucose level is not the sole determinant of stroke risk in the dataset.

BMI vs. Stroke: The correlation coefficient between 'bmi' and 'stroke' is approximately 0.04, indicating a very weak positive correlation. This suggests that there is a very slight tendency for stroke occurrence to increase slightly with higher BMI. However, the correlation is extremely weak, indicating that BMI is not a strong predictor of stroke risk in the dataset.

In [None]:
# Annotated heatmap of correlation_matrix; cells show the coefficient to two decimals.
heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Correlation matrix over every numeric column.
# numeric_only=True is needed because the frame also holds categorical text
# columns (e.g. gender, work_type): since pandas 2.0 the default is
# numeric_only=False, and DataFrame.corr() then fails converting them to float.
data.corr(numeric_only=True)

In [None]:
# Heatmap of the correlations between all numeric columns.
# numeric_only=True keeps the categorical text columns out of corr(); with the
# pandas >= 2.0 default (numeric_only=False) the call fails on them.
full_correlation = data.corr(numeric_only=True)
sns.heatmap(full_correlation, annot=True)
plt.title('Correlation Heatmap (All Numeric Columns)')
plt.show()

> #  **Covariance**

In [None]:
# Covariance matrix over every numeric column.
# numeric_only=True is needed because the frame also holds categorical text
# columns (e.g. gender, work_type): since pandas 2.0 the default is
# numeric_only=False, and DataFrame.cov() then fails converting them to float.
data.cov(numeric_only=True)

In [None]:
# Covariance between the numerical feature columns, shown as an annotated heatmap.
numeric_frame = data[numerical_columns]
cov_matrix = numeric_frame.cov()

heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.figure(figsize=(8, 6))
sns.heatmap(cov_matrix, **heatmap_options)
plt.title('Covariance Matrix Heatmap')
plt.show()

In [None]:
# Heatmap of the covariances between all numeric columns.
# numeric_only=True keeps the categorical text columns out of cov(); with the
# pandas >= 2.0 default (numeric_only=False) the call fails on them.
full_covariance = data.cov(numeric_only=True)
sns.heatmap(full_covariance, annot=True)
plt.title('Covariance Heatmap (All Numeric Columns)')
plt.show()