<a href="https://colab.research.google.com/github/Anna-Bialer-Tsypin/Jhon-Bryce/blob/main/stroke_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go
import scipy.stats as stats
from plotly.subplots import make_subplots

In [None]:
# Notebook-wide display configuration.

# Route DataFrame/Series .plot() calls through Plotly
pd.set_option("plotting.backend", "plotly")

# Show floats with two decimals in both NumPy and pandas output
np.set_printoptions(precision=2, suppress=True)
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", '{:.2f}'.format)

# Loading data

In [None]:
# Download the latest version of the dataset with kagglehub; the returned
# value is the local directory that holds the downloaded files
path = kagglehub.dataset_download("shriyashjagtap/stroke-diagnosis-and-health-metrics-data")

# Show where the dataset files were stored locally
print("Path to dataset files:", path)

# List the downloaded files so the CSV file name can be read off
print(os.listdir(path))

Path to dataset files: /root/.cache/kagglehub/datasets/shriyashjagtap/stroke-diagnosis-and-health-metrics-data/versions/1
['stroke_data.csv']


In [None]:
# Load the dataset into a Pandas DataFrame.
# Pick the CSV explicitly (sorted for determinism) instead of trusting that the
# first directory entry returned by os.listdir() happens to be the data file.
csv_files = sorted(name for name in os.listdir(path) if name.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {path}")
df = pd.read_csv(os.path.join(path, csv_files[0]))

# Normalize column names to lower case (returns a new frame, so the cell is
# safe to re-run)
df = df.rename(columns=str.lower)

# Data Preprocessing

In [None]:
# Preview the loaded DataFrame
df

Unnamed: 0,age,gender,ses,hypertension,heart_disease,bmi,avg_glucose,diabetes,smoking_status,stroke
0,66.79,Male,Medium,1,0,25.84,96.43,1,Current,1
1,86.39,Female,Medium,1,1,32.53,133.35,0,Never,1
2,76.16,Female,Medium,1,0,40.68,111.49,0,Never,0
3,72.48,Female,Low,0,1,33.00,125.69,0,Former,0
4,59.88,Male,Low,0,0,26.06,123.22,1,Never,1
...,...,...,...,...,...,...,...,...,...,...
9995,80.65,Female,Medium,1,1,28.28,128.64,0,Former,1
9996,82.61,Female,Medium,1,1,31.15,105.03,1,Current,1
9997,86.02,Female,High,1,1,32.64,147.17,0,Never,1
9998,67.39,Female,Medium,0,1,29.95,112.48,0,Former,0


In [None]:
# Column dtypes, non-null counts and memory footprint of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             10000 non-null  float64
 1   gender          10000 non-null  object 
 2   ses             10000 non-null  object 
 3   hypertension    10000 non-null  int64  
 4   heart_disease   10000 non-null  int64  
 5   bmi             10000 non-null  float64
 6   avg_glucose     10000 non-null  float64
 7   diabetes        10000 non-null  int64  
 8   smoking_status  10000 non-null  object 
 9   stroke          10000 non-null  int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


In [None]:
# Evaluation of nulls: number of missing values per column
df.isnull().sum()

Unnamed: 0,0
age,0
gender,0
ses,0
hypertension,0
heart_disease,0
bmi,0
avg_glucose,0
diabetes,0
smoking_status,0
stroke,0


In [None]:
# Evaluation of duplicates: number of rows that repeat an earlier row in full

df.duplicated().sum()

np.int64(0)

In [None]:
# Initial evaluation of unique values: number of distinct values per column
df.nunique()

Unnamed: 0,0
age,10000
gender,2
ses,3
hypertension,2
heart_disease,2
bmi,10000
avg_glucose,10000
diabetes,2
smoking_status,3
stroke,2




*   We have no missing values and no duplicates



In [None]:
# Evaluate unique values per column.
# Compute the unique values once and print from that mapping instead of
# calling .unique() a second time for every column.
unique_values = {col: df[col].unique() for col in df.columns}
for col, values in unique_values.items():
    print(f"{col}: {values}")

age: [66.79 86.39 76.16 ... 86.02 67.39 62.17]
gender: ['Male' 'Female']
ses: ['Medium' 'Low' 'High']
hypertension: [1 0]
heart_disease: [0 1]
bmi: [25.84 32.53 40.68 ... 32.64 29.95 32.05]
avg_glucose: [ 96.43 133.35 111.49 ... 147.17 112.48  77.47]
diabetes: [1 0]
smoking_status: ['Current' 'Never' 'Former']
stroke: [1 0]


In [None]:
# Change gender into 0/1.
# Guarded so the cell can be re-run: once 'gender' has been replaced by
# 'is_male' there is nothing left to encode and the drop would raise KeyError.
if 'gender' in df.columns:
    df['is_male'] = df['gender'].str.lower().map({'male': 1, 'female': 0})
    # Any label other than male/female maps to NaN; fail loudly rather than
    # carrying silent missing values into the analysis.
    assert df['is_male'].notna().all(), "Unexpected value in 'gender' column"
    df = df.drop(columns=['gender'])

# All binary columns: exactly 2 unique values, either numeric or categorical
binary_cols = [
    col for col in df.columns
    if df[col].nunique() == 2
]

# Categorical columns: object or category dtype with more than 2 unique values
categorical_cols = [
    col for col in df.select_dtypes(include=['object', 'category']).columns
    if df[col].nunique() > 2
]

# Numerical columns: numeric dtype excluding the binary ones
numerical_cols = [
    col for col in df.select_dtypes(include=['number']).columns
    if col not in binary_cols
]

# Change categorical columns from object to category
df[categorical_cols] = df[categorical_cols].astype('category')

print("Binary columns:", binary_cols)
print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

Binary columns: ['hypertension', 'heart_disease', 'diabetes', 'stroke', 'is_male']
Categorical columns: ['ses', 'smoking_status']
Numerical columns: ['age', 'bmi', 'avg_glucose']


In [None]:
# Shrink numeric columns to the smallest dtype that can hold their values:
# first try an integer downcast, then a float downcast on whatever is left.
numeric_columns = df.select_dtypes(include=['number']).columns
integer_pass = df[numeric_columns].apply(pd.to_numeric, downcast='integer')
df[numeric_columns] = integer_pass.apply(pd.to_numeric, downcast='float')

In [None]:
# Re-inspect column dtypes and memory usage of the current frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             10000 non-null  float32 
 1   ses             10000 non-null  category
 2   hypertension    10000 non-null  int8    
 3   heart_disease   10000 non-null  int8    
 4   bmi             10000 non-null  float32 
 5   avg_glucose     10000 non-null  float32 
 6   diabetes        10000 non-null  int8    
 7   smoking_status  10000 non-null  category
 8   stroke          10000 non-null  int8    
 9   is_male         10000 non-null  int8    
dtypes: category(2), float32(3), int8(5)
memory usage: 185.9 KB


# Distribution of data

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals
df.describe().round(2)

Unnamed: 0,age,hypertension,heart_disease,bmi,avg_glucose,diabetes,stroke,is_male
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,69.76,0.61,0.3,28.04,109.12,0.2,0.3,0.5
std,9.87,0.49,0.46,4.9,16.42,0.4,0.46,0.5
min,27.71,0.0,0.0,15.04,45.57,0.0,0.0,0.0
25%,63.13,0.0,0.0,24.66,98.19,0.0,0.0,0.0
50%,69.8,1.0,0.0,28.07,109.02,0.0,0.0,0.0
75%,76.4,1.0,1.0,31.39,120.34,0.0,1.0,1.0
max,99.42,1.0,1.0,47.5,176.18,1.0,1.0,1.0


In [None]:
# Boolean masks reused by every statistic below
had_stroke = df['stroke'] == 1
has_diabetes = df['diabetes'] == 1
has_heart_disease = df['heart_disease'] == 1

# ---- 1. Share of the whole dataset that had a stroke (percent) ----
total_population = len(df)
stroke_count = int(had_stroke.sum())
stroke_rate = stroke_count / total_population * 100

# ---- 2. Gender split among stroke patients (percent of stroke cases) ----
stroke_by_gender = df.loc[had_stroke, 'is_male'].value_counts(normalize=True) * 100
stroke_women = stroke_by_gender.get(0, 0)
stroke_men = stroke_by_gender.get(1, 0)

# ---- 3. Stroke rate among patients with diabetes ----
diabetes_total = int(has_diabetes.sum())
diabetes_stroke = int((has_diabetes & had_stroke).sum())
if diabetes_total > 0:
    stroke_in_diabetes = (diabetes_stroke / diabetes_total) * 100
else:
    stroke_in_diabetes = 0

# ---- 4. Stroke rate among patients with heart disease ----
heart_total = int(has_heart_disease.sum())
heart_stroke = int((has_heart_disease & had_stroke).sum())
if heart_total > 0:
    stroke_in_heart_disease = (heart_stroke / heart_total) * 100
else:
    stroke_in_heart_disease = 0

# ---- Report ----
print(f"* Approximately {stroke_rate:.1f}% of the population experiences a stroke, "
      f"with {stroke_women:.1f}% being women and {stroke_men:.1f}% being men.")

print(f"* Among patients, {stroke_in_diabetes:.1f}% of individuals with diabetes and "
      f"{stroke_in_heart_disease:.1f}% of those with heart disease have suffered a stroke.")

* Approximately 29.8% of the population experiences a stroke, with 50.0% being women and 50.0% being men.
* Among patients, 52.8% of individuals with diabetes and 48.8% of those with heart disease have suffered a stroke.


In [None]:
# One histogram (with a box-plot margin) per continuous column
histogram_options = dict(marginal='box', nbins=30, width=700, height=400)
for col in numerical_cols:
    fig = px.histogram(df, x=col, title=f'Distribution of {col}', **histogram_options)
    fig.show()

# Data Evaluation
* There are 10000 values in the data set
* There are no missing data
* Numerical data seems as normal distriburion
* There are no extreme cases of outliers
* Male and Female are distributed equally throught the data set.
* Main age group in data set is 45-95 (total age group is between 27-99)

In [None]:
# Bar chart of category frequencies for every multi-level categorical column
for col in categorical_cols:
    # Name the index after the column and the values 'count' so the resulting
    # two-column frame is labelled [col, 'count']
    counts = df[col].value_counts().rename_axis(col).reset_index(name='count')

    fig = px.bar(counts, x=col, y='count', title=f'Category Counts for {col}')
    fig.update_layout(dict(xaxis_title=col, yaxis_title='Count', width=700, height=400))
    fig.show()

In [None]:
# Bar chart of the 0/1 frequencies for every binary column
bar_layout = dict(yaxis_title='Count', width=700, height=400)
for col in binary_cols:
    value_table = df[col].value_counts().reset_index()
    # Label the two columns explicitly as [col, 'count']
    counts = value_table.set_axis([col, 'count'], axis=1)

    fig = px.bar(counts, x=col, y='count', title=f'Binary Counts for {col}')
    fig.update_layout(xaxis_title=col, **bar_layout)
    fig.show()

# Evaluating Correlations

In [None]:
# Correlation heatmap over all numeric (and boolean) columns

# Boolean columns are cast to 0/1 integers; astype() returns a new frame, so
# the original df is left untouched
bool_cols = df.select_dtypes(include='bool').columns
df_corr_ready = (
    df.astype({col: int for col in bool_cols})
      # categorical/object columns cannot enter a Pearson correlation
      .select_dtypes(exclude=['object', 'category'])
)

# Pairwise correlation matrix
correlation_matrix = df_corr_ready.corr()

# Heatmap with a diverging scale fixed to [-1, 1]
heatmap_options = dict(
    text_auto=".2f",
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
    title="Correlation Matrix (Including Boolean Columns)",
)
fig = px.imshow(correlation_matrix, **heatmap_options)
fig.update_layout(width=800, height=700, xaxis_title="Variable", yaxis_title="Variable")
fig.show()

In [None]:
# Show the correlation matrix as a table
correlation_matrix

Unnamed: 0,age,hypertension,heart_disease,bmi,avg_glucose,diabetes,stroke,is_male
age,1.0,0.42,0.11,-0.0,0.14,0.13,0.36,0.01
hypertension,0.42,1.0,0.05,-0.0,0.31,0.06,0.39,0.01
heart_disease,0.11,0.05,1.0,-0.0,0.3,0.02,0.27,-0.01
bmi,-0.0,-0.0,-0.0,1.0,0.01,-0.0,0.08,-0.0
avg_glucose,0.14,0.31,0.3,0.01,1.0,0.01,0.22,0.01
diabetes,0.13,0.06,0.02,-0.0,0.01,1.0,0.25,-0.01
stroke,0.36,0.39,0.27,0.08,0.22,0.25,1.0,0.0
is_male,0.01,0.01,-0.01,-0.0,0.01,-0.01,0.0,1.0


In [None]:
# Joint distribution of stroke and SES (all cells together sum to 1)
ct_ses = pd.crosstab(index=df['stroke'], columns=df['ses'], normalize=True)

# Long format: one row per (stroke, SES) pair
ct_ses_wide = ct_ses.reset_index()
ct_ses_reset = ct_ses_wide.melt(id_vars='stroke', var_name='SES', value_name='Proportion')

# Heatmap of the joint proportions
ses_heatmap_layout = dict(
    title='Stroke vs Socioeconomic Status (Normalized)',
    yaxis_title='Stroke',
    xaxis_title='SES',
    width=800,
    height=600,
)
fig = px.density_heatmap(
    ct_ses_reset, x='SES', y='stroke', z='Proportion',
    color_continuous_scale='Blues', text_auto=True,
)
fig.update_layout(**ses_heatmap_layout)
fig.show()

In [None]:
# Show the stroke x SES crosstab as a table
ct_ses

ses,High,Low,Medium
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.14,0.21,0.35
1,0.06,0.08,0.16


In [None]:
# Stroke proportions within each SES group (every SES column sums to 1)
ses_stroke_share = pd.crosstab(df['stroke'], df['ses'], normalize='columns')

# Transpose so SES becomes the row key, then reshape to long format
ct_ses_col = (
    ses_stroke_share
    .T
    .reset_index()
    .melt(id_vars='ses', var_name='Stroke', value_name='Proportion')
)

ses_bar_options = dict(
    x='ses',
    y='Proportion',
    color='Stroke',
    title='Proportion of Stroke by SES Group (Stacked)',
    barmode='stack',
    text_auto='.2f',
)
fig = px.bar(ct_ses_col, **ses_bar_options)
fig.update_layout(dict(xaxis_title='SES', yaxis_title='Proportion', width=800, height=600))

fig.show()

# Smoking Status effect on stroke



In [None]:
# Joint distribution of stroke and smoking status (all cells sum to 1)
ct_ss = pd.crosstab(df['stroke'], df['smoking_status'], normalize=True)

# Reset index and reshape to long format for plotting
ct_ss_reset = ct_ss.reset_index().melt(id_vars='stroke', var_name='smoking_status', value_name='Proportion')

# Plot
fig = px.density_heatmap(
    ct_ss_reset,
    x='smoking_status',
    y='stroke',
    z='Proportion',
    color_continuous_scale='Blues',
    text_auto=True
)
# The x axis shows smoking status; the previous 'SES' label was a leftover
# from the SES heatmap cell
fig.update_layout(title='Stroke vs Smoking Status (Normalized)', yaxis_title='Stroke', xaxis_title='Smoking Status',width=800, height=600)
fig.show()

In [None]:
# Show the stroke x smoking-status crosstab as a table
ct_ss


smoking_status,Current,Former,Never
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.13,0.15,0.43
1,0.08,0.05,0.17


In [None]:
# Stroke proportions within each smoking-status group (each column sums to 1)
smoking_stroke_share = pd.crosstab(df['stroke'], df['smoking_status'], normalize='columns')

# Transpose so smoking status becomes the row key, then go to long format
smoking_share_by_group = smoking_stroke_share.T.reset_index()
ct_ss_col = smoking_share_by_group.melt(
    id_vars='smoking_status',
    var_name='Stroke',
    value_name='Proportion',
)

fig = px.bar(
    ct_ss_col,
    x='smoking_status', y='Proportion', color='Stroke',
    title='Proportion of Stroke by Smoking Status Group (Stacked)',
    barmode='stack', text_auto='.2f',
)
smoking_bar_layout = dict(xaxis_title='Smoking Status', yaxis_title='Proportion', width=800, height=600)
fig.update_layout(**smoking_bar_layout)

fig.show()

In [None]:
# Show the long-format table of stroke proportions per smoking-status group
ct_ss_col

Unnamed: 0,smoking_status,Stroke,Proportion
0,Current,0,0.63
1,Former,0,0.73
2,Never,0,0.72
3,Current,1,0.37
4,Former,1,0.27
5,Never,1,0.28


# Figures demonstrating correlations found

In [None]:
# Box plot of age split by stroke outcome, with every observation drawn
fig = (
    px.box(df, x='stroke', y='age', points='all', title='Age Distribution by Stroke')
    .update_layout(width=800, height=600)  # custom figure size
)
fig.show()

In [None]:
# Violin plot (with inner box) of average glucose split by stroke outcome
violin_options = dict(x='stroke', y='avg_glucose', box=True, points='all', title='Avg Glucose by Stroke')
fig = px.violin(df, **violin_options)

# Custom figure size
fig.update_layout(dict(width=800, height=600))

fig.show()

In [None]:
# Proportion of stroke within each category of a binary variable.
# Stored under its own name: `stroke_rate` already holds the overall stroke
# percentage (a float) computed earlier, and rebinding it to a DataFrame would
# break that earlier print cell if it were re-run.
stroke_rate_by_diabetes = df.groupby('diabetes')['stroke'].mean().reset_index()

fig = px.bar(stroke_rate_by_diabetes, x='diabetes', y='stroke', title='Stroke Rate by Diabetes')
# Set custom figure size
fig.update_layout(width=800, height=600)

# Show the figure
fig.show()

In [None]:
# Overlaid, density-normalized age histograms for stroke vs. no stroke
fig = (
    px.histogram(
        df,
        x='age',
        color='stroke',
        barmode='overlay',
        histnorm='probability density',
    )
    .update_layout(title='Age Distribution by Stroke', width=800, height=600)
)
fig.show()

In [None]:
# Overlaid, density-normalized BMI histograms for stroke vs. no stroke
bmi_stroke_options = dict(x='bmi', color='stroke', barmode='overlay', histnorm='probability density')
fig = px.histogram(df, **bmi_stroke_options)
fig.update_layout(dict(title='BMI Distribution by Stroke', width=800, height=600))
fig.show()

In [None]:
# Overlaid, density-normalized BMI histograms split by heart-disease status
fig = px.histogram(
    df,
    x='bmi',
    color='heart_disease',
    barmode='overlay',
    histnorm='probability density',
)
bmi_heart_layout = dict(title='BMI Distribution by heart disease', width=800, height=600)
fig.update_layout(**bmi_heart_layout)
fig.show()

In [None]:
# Overlaid, density-normalized glucose histograms for stroke vs. no stroke
fig = (
    px.histogram(df, x='avg_glucose', color='stroke',
                 barmode='overlay', histnorm='probability density')
    .update_layout(title='Average Glucose Distribution by Stroke', width=800, height=600)
)
fig.show()

# Additional Gender exploration


In [None]:
# Age distribution, one overlaid histogram per gender
axis_labels = {'age': 'Age', 'is_male': 'Gender'}
fig = px.histogram(
    df, x='age', color='is_male',
    barmode='overlay',  # 'group' would draw the bars side by side instead
    nbins=30,
    title='Age Distribution by Gender',
    labels=axis_labels,
).update_layout(width=800, height=500)
fig.show()

In [None]:
# Step 1: bucket ages into left-closed intervals ([0, 40), [40, 50), ...)
age_bins = [0, 40, 50, 60, 70, 80, 120]
age_labels = ['<40', '40–49', '50–59', '60–69', '70–79', '80+']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)

# Store gender as a boolean flag from here on
df['is_male'] = df['is_male'].astype(bool)

# Step 2: mean of the 0/1 stroke flag per (age group, gender) = stroke rate
age_gender_groups = df.groupby(['age_group', 'is_male'], observed=True)
stroke_rates = age_gender_groups['stroke'].mean().reset_index()

# Step 3: grouped bars, one bar per gender inside each age group
stroke_bar_options = dict(
    x='age_group',
    y='stroke',
    color='is_male',
    barmode='group',
    title='Stroke Rate by Age Group and Gender',
    labels={'stroke': 'Stroke Rate', 'age_group': 'Age Group'},
)
fig = px.bar(stroke_rates, **stroke_bar_options)
fig.update_layout(dict(width=800, height=600))
fig.show()

In [None]:
# Heart-disease rate = mean of the 0/1 flag per (age group, gender)
heart_by_group = df.groupby(['age_group', 'is_male'], observed=True)['heart_disease']
disease_rates = heart_by_group.mean().reset_index()

# Grouped bars: one bar per gender inside each age group
fig = px.bar(
    disease_rates,
    x='age_group', y='heart_disease', color='is_male',
    barmode='group',
    title='Heart disease Rate by Age Group and Gender',
    labels={'heart_disease': 'Heart disease Rate', 'age_group': 'Age Group'},
).update_layout(width=800, height=600)
fig.show()

In [None]:
# Diabetes rate = mean of the 0/1 flag per (age group, gender)
diabetes_rates = (
    df.groupby(['age_group', 'is_male'], observed=True)['diabetes']
      .mean()
      .reset_index()
)

# Grouped bars: one bar per gender inside each age group
diabetes_bar_options = dict(
    x='age_group',
    y='diabetes',
    color='is_male',
    barmode='group',
    title='Diabetes Rate by Age Group and Gender',
    labels={'diabetes': 'Diabetes Rate', 'age_group': 'Age Group'},
)
fig = px.bar(diabetes_rates, **diabetes_bar_options)
fig.update_layout(dict(width=800, height=600))
fig.show()

In [None]:
# Hypertension rate = mean of the 0/1 flag per (age group, gender)
grouping_keys = ['age_group', 'is_male']
hypertension_means = df.groupby(grouping_keys, observed=True)['hypertension'].mean()
hypertension_rates = hypertension_means.reset_index()

# Grouped bars: one bar per gender inside each age group
hypertension_labels = {'hypertension': 'Hypertension Rate', 'age_group': 'Age Group'}
fig = px.bar(
    hypertension_rates,
    x='age_group',
    y='hypertension',
    color='is_male',
    barmode='group',
    title='Hypertension Rate by Age Group and Gender',
    labels=hypertension_labels,
)
fig.update_layout(**dict(width=800, height=600))
fig.show()

In [None]:
# Summary statistics of every numeric/boolean measure per (age group, gender).
# 'age' is already represented by age_group, and 'is_male' is a grouping key,
# so neither is aggregated.
numeric_cols_for_grouped = (
    df.select_dtypes(include=['number', 'bool'])
      .columns
      .drop(['age', 'is_male'], errors='ignore')
)

# observed=True keeps only age-group/gender combinations present in the data,
# consistent with the grouped-rate cells above
grouped = (
    df.groupby(['age_group', 'is_male'], observed=True)[numeric_cols_for_grouped]
      .agg(['sum', 'mean', 'median', 'std'])
)

# Keep the index in the file: it carries the age_group / is_male keys, without
# which the rows of the CSV cannot be told apart
grouped.to_csv('grouped_summary.csv')





# Data distribution via scatter plots

In [None]:
# Age vs. average glucose, colored by stroke, one panel per smoking status
px.scatter(
    df,
    x='age',
    y='avg_glucose',
    color='stroke',
    facet_col='smoking_status',
)

In [None]:
# Age vs. average glucose, colored by stroke, one panel per SES level
px.scatter(
    df,
    x='age',
    y='avg_glucose',
    color='stroke',
    facet_col='ses',
)

In [None]:
# Age vs. BMI, colored by stroke, one panel per SES level
px.scatter(
    df,
    x='age',
    y='bmi',
    color='stroke',
    facet_col='ses',
)

In [None]:
# Age vs. average glucose, colored by stroke, one panel per gender flag
px.scatter(
    df,
    x='age',
    y='avg_glucose',
    color='stroke',
    facet_col='is_male',
)

# A note about scatter plots
* Although there is no apparent linear relationship in the data, coloring the stroke patients reveals the presence of two distinct populations.


# Conclusion

* Approximately 29.8% of the individuals in the dataset have experienced a stroke; of these stroke patients, 50.0% are women and 50.0% are men.

* Among patients, 52.8% of individuals with diabetes and 48.8% of those with heart disease have suffered a stroke.

* Both increasing age (r = 0.36) and hypertension (r = 0.39) show a clear positive correlation with the occurrence of stroke.

* No significant correlation between gender and stroke occurrence was identified.

* Stroke incidence is positively correlated with older age, hypertension, and heart disease. A weaker correlation was noted between stroke and both average glucose levels and diabetes.

* Although we would expect higher average glucose levels among diabetic patients, the observed weaker correlation could be explained by effective treatment and dietary management practices, as many diabetic patients control blood sugar levels through medication and lifestyle adjustments.

* Surprisingly, virtually no correlation was found between BMI and either heart disease (r ≈ 0.00) or stroke (r = 0.08). This may be because BMI is computed only from weight and height and does not differentiate between muscle mass and body fat.

* Most individuals who experienced a stroke belonged to the medium socioeconomic status (SES) group, followed by the low SES group. However, this pattern mirrors the overall distribution of SES in the dataset — with 51% of individuals classified as medium SES, 29% as low, and 20% as high. When controlling for group sizes, the proportion of stroke cases within each SES group does not show substantial variation. Therefore, SES does not appear to be a major determinant of stroke risk in this population.

* Among individuals who suffered a stroke, the majority were never smokers (17% of the population), followed by current smokers (8%) and former smokers (5%). This reflects the general smoking distribution in the population, where never smokers form the largest group. However, when normalized by smoking status, a clearer pattern emerges: current smokers have the highest stroke rate (37%), compared to 28% in never smokers and 27% in former smokers. This suggests that current smoking is associated with a higher relative risk of stroke compared to those who quit or never smoked. The similar stroke rates among former and never smokers may indicate potential recovery of risk following smoking cessation.

# Conclusions from Gender Analysis

* Gender Distribution:
The distribution of males and females across all age groups appears relatively balanced, with no significant gender imbalances observed.

* Stroke Rates:
Stroke prevalence is notably higher among men aged 40–49. In older age groups, stroke rates become comparable between men and women, indicating no substantial gender differences.

* Heart Disease Trends:
Heart disease is more prevalent among men aged 40–49 and remains slightly more common among men aged 50–69. However, for individuals aged 70 and above, women experience higher rates of heart disease.

* Diabetes Patterns:
Diabetes is more prevalent among women in the age groups 40–49 and 70+. Conversely, men exhibit higher diabetes prevalence between ages 50–69.

* Hypertension Patterns:
Hypertension is more frequent among men aged 40–49. In older age categories, gender differences diminish, resulting in a more balanced distribution.

* Men tend to experience health issues at younger ages compared to women, aligning with established knowledge that women generally have longer lifespans. The observed increase in health issues among older women may reflect higher mortality rates among men, leading to healthier men remaining in the older study groups, thus influencing the data distribution.



# Optional additional ways to lower data memory

In [None]:
# NOTE(review): the lines below are a disabled sketch, not runnable as-is on
# this dataset. hypertension / diabetes / stroke / heart_disease already hold
# 0/1 integers, so the .map({...}) calls keyed on string labels would map every
# value to NaN, which .astype(bool) then turns into True. For those columns
# only a plain .astype(bool) (as on the hypertension line) would apply.
# df['is_male'] = df['gender'].str.lower().map({'male': 1, 'female': 0}).astype(bool)
# df['hypertension'] = df['hypertension'].astype(bool)
# df['diabetes'] = df['diabetes'].map({'diabetes': 1, 'no_diabetes': 0}).astype(bool)
# df['stroke'] = df['stroke'].map({'stroke': 1, 'no_stroke': 0}).astype(bool)
# df['heart_disease'] = df['heart_disease'].map({'heart_disease': 1, 'no_heart_disease': 0}).astype(bool)
# df.drop(columns=['gender'], inplace=True)
# df = pd.get_dummies(df, columns=['smoking_status', 'ses'], drop_first=True, dtype=int)

# A quick experiment with the ydata-profiling library

In [None]:
!pip install ydata_profiling

Collecting ydata_profiling
  Downloading ydata_profiling-4.16.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata_profiling)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata_profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata_profiling)
  Downloading phik-0.12.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata_profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata_profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata_profiling)
  Downloading dacite-1.9.2-py3-none-any.whl.metadata (17 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata_profiling)
  Downloading pywavelets-1.

In [None]:
# Imported here rather than in the top import cell because the package is
# only installed by the pip cell directly above
from ydata_profiling import ProfileReport
# Build an automated profiling report for the current state of df
profile=ProfileReport(df,title='Profiling Report')

In [None]:
# Display the profiling report as the cell output
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:00<00:01,  6.12it/s][A
 20%|██        | 2/10 [00:00<00:01,  5.51it/s][A
 50%|█████     | 5/10 [00:00<00:00, 11.07it/s][A
100%|██████████| 10/10 [00:00<00:00, 14.56it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

