In [53]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency

In [10]:
diabetes_dataset_to_load = Path("Diabetes/Resources/diabetes_prediction_dataset.csv")
diabetes_dataset = pd.read_csv(diabetes_dataset_to_load)
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [11]:
diabetes_dataset.dtypes

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

## FEMALE DIABETES & HEART DISEASE Vs. FEMALE NO DIABETES & NO HEART DISEASE (BMI)

In [31]:
female_diabetes_hd = diabetes_dataset[(diabetes_dataset["gender"] == "Female") & (diabetes_dataset["heart_disease"] == 1) & (diabetes_dataset["diabetes"] == 1)]

print(female_diabetes_hd.head())

size = female_diabetes_hd.count()

print(size)


      gender   age  hypertension  heart_disease smoking_history    bmi  \
618   Female  59.0             0              1           never  60.26   
1022  Female  62.0             1              1          former  44.23   
1135  Female  62.0             1              1           never  43.16   
1274  Female  76.0             0              1          former  25.68   
1277  Female  67.0             1              1         current  28.52   

      HbA1c_level  blood_glucose_level  diabetes  
618           8.8                  145         1  
1022          8.2                  145         1  
1135          8.8                  280         1  
1274          9.0                  260         1  
1277          6.5                  200         1  
gender                 526
age                    526
hypertension           526
heart_disease          526
smoking_history        526
bmi                    526
HbA1c_level            526
blood_glucose_level    526
diabetes               526
dtype:

In [32]:
female_ndiabetes_nhd = diabetes_dataset[(diabetes_dataset["gender"] == "Female") & (diabetes_dataset["heart_disease"] == 0) & (diabetes_dataset["diabetes"] == 0)]
print(female_ndiabetes_nhd.head())

size2 = female_ndiabetes_nhd.count()

print(size2)

   gender   age  hypertension  heart_disease smoking_history    bmi  \
1  Female  54.0             0              0         No Info  27.32   
3  Female  36.0             0              0         current  23.45   
5  Female  20.0             0              0           never  27.32   
7  Female  79.0             0              0         No Info  23.86   
9  Female  32.0             0              0           never  27.32   

   HbA1c_level  blood_glucose_level  diabetes  
1          6.6                   80         0  
3          5.0                  155         0  
5          6.6                   85         0  
7          5.7                   85         0  
9          5.0                  100         0  
gender                 53055
age                    53055
hypertension           53055
heart_disease          53055
smoking_history        53055
bmi                    53055
HbA1c_level            53055
blood_glucose_level    53055
diabetes               53055
dtype: int64


In [20]:
average_bmi_female_diabetes_hd = female_diabetes_hd["bmi"].mean()
average_bmi_female_diabetes_hd

31.214733840304184

In [43]:
variance_1 = np.var(female_diabetes_hd["bmi"])
print(variance_1)

variance_2 = np.var(female_ndiabetes_nhd["bmi"])
print(variance_2)

54.77507473904487
43.75931786316361


In [22]:
average_bmi_female_ndiabetes_nhd = female_ndiabetes_nhd["bmi"].mean()
average_bmi_female_ndiabetes_nhd

26.99850758646687

In [37]:
t_statistic = ttest_ind(female_diabetes_hd["bmi"], female_ndiabetes_nhd["bmi"], equal_var=False)
t_statistic

TtestResult(statistic=13.001760605741776, pvalue=9.09708986464371e-34, df=533.3332443395307)

## FEMALE DIABETES VS MALE DIABETES (Average Age)

In [49]:
female_diabetes = diabetes_dataset[(diabetes_dataset["gender"] == "Female") & (diabetes_dataset["diabetes"] == 1)]
print(female_diabetes.head())

female_diabetes_size = (female_diabetes.count())
print(female_diabetes_size)

female_age_average = female_diabetes["age"].mean()
print(female_age_average)

    gender   age  hypertension  heart_disease smoking_history    bmi  \
6   Female  44.0             0              0           never  19.31   
53  Female  53.0             0              0          former  27.32   
59  Female  67.0             0              0           never  63.48   
87  Female  36.0             0              0         current  32.27   
94  Female  60.0             0              0           never  27.32   

    HbA1c_level  blood_glucose_level  diabetes  
6           6.5                  200         1  
53          7.0                  159         1  
59          8.8                  155         1  
87          6.2                  220         1  
94          7.5                  300         1  
gender                 4461
age                    4461
hypertension           4461
heart_disease          4461
smoking_history        4461
bmi                    4461
HbA1c_level            4461
blood_glucose_level    4461
diabetes               4461
dtype: int64
60.99349

In [50]:
male_diabetes = diabetes_dataset[(diabetes_dataset["gender"] == "Male") & (diabetes_dataset["diabetes"] == 1)]
print(male_diabetes.head())

male_diabetes_size = (male_diabetes.count())
print(male_diabetes_size)

male_age_average = male_diabetes["age"].mean()
print(male_age_average)

   gender   age  hypertension  heart_disease smoking_history    bmi  \
26   Male  67.0             0              1     not current  27.32   
38   Male  50.0             1              0         current  27.32   
40   Male  73.0             0              0          former  25.91   
55   Male  50.0             0              0          former  37.16   
81   Male  57.0             0              0         No Info  27.32   

    HbA1c_level  blood_glucose_level  diabetes  
26          6.5                  200         1  
38          5.7                  260         1  
40          9.0                  160         1  
55          9.0                  159         1  
81          8.2                  126         1  
gender                 4039
age                    4039
hypertension           4039
heart_disease          4039
smoking_history        4039
bmi                    4039
HbA1c_level            4039
blood_glucose_level    4039
diabetes               4039
dtype: int64
60.89477593463

In [52]:
fvariance_1 = np.var(female_diabetes["age"])
print(fvariance_1)

mvariance_2 = np.var(male_diabetes["age"])
print(mvariance_2)

222.05129152146074
200.2075463659864


In [51]:
t_statistic = ttest_ind(male_diabetes["age"], female_diabetes["age"], equal_var=False)
t_statistic

TtestResult(statistic=-0.3131814061917626, pvalue=0.7541505473311234, df=8478.756688315229)

## Diabetes Predicting Heart Disease

In [69]:
diabetes_hd = diabetes_dataset[(diabetes_dataset["diabetes"] == 1) & (diabetes_dataset["heart_disease"] == 1)]
print(diabetes_hd)

diabetes_nhd = diabetes_dataset[(diabetes_dataset["diabetes"] == 1) & (diabetes_dataset["heart_disease"] == 0)]
print(diabetes_nhd)

       gender   age  hypertension  heart_disease smoking_history    bmi  \
26       Male  67.0             0              1     not current  27.32   
242      Male  57.0             1              1     not current  27.77   
361      Male  80.0             0              1          former  24.36   
566      Male  75.0             0              1     not current  28.12   
590      Male  69.0             0              1          former  24.10   
...       ...   ...           ...            ...             ...    ...   
99723    Male  61.0             1              1          former  35.78   
99740    Male  80.0             0              1           never  25.76   
99826    Male  63.0             0              1         No Info  27.32   
99935  Female  65.0             1              1           never  33.55   
99938    Male  55.0             0              1          former  30.42   

       HbA1c_level  blood_glucose_level  diabetes  
26             6.5                  200        

In [70]:
related = chi2_contingency(diabetes_hd, diabetes_nhd)
related


TypeError: '<' not supported between instances of 'str' and 'int'