# Diabetes Analysis

### Author: Ly Duc Trung

### Link: https://github.com/DucTrung1802/DS/blob/main/GLM_Report/diabetes_analysis.ipynb

### Dataset Link: https://www.archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators

## 0. Install and import libraries

### Check R version

In [None]:
# Show detailed version information for the R session running this notebook.
R.Version()

### 0.1. Install

In [None]:
# INSTALL THESE PACKAGES IN CONDA TERMINAL
# conda install r-corrplot
# conda install r-rcompanion

### 0.2. Import

In [None]:
library(corrplot)
library(rcompanion)

## 1. Import Data

In [None]:
# Load the raw dataset from the CSV file in the working directory.
input_data <- read.csv(file = "diabetes_full_data.csv")

In [None]:
# Working copy of the raw data as a data frame; every later step starts here.
df <- data.frame(input_data)

In [None]:
# Display the data frame for a first visual check.
df

## 2. Data Preprocessing

### 2.0. Standardize column names

In [None]:
# List the column names as read from the CSV.
colnames(df)
# Column "X" should be changed to be a more meaningful name.

In [None]:
# Rename the uninformative "X" column to "Index".
# The column is matched by name rather than by position, so no other column
# is overwritten if "X" is missing or is not the first column; in that case
# the statement is a no-op.
colnames(df)[colnames(df) == "X"] <- "Index"

In [None]:
# List the column names again to confirm the rename.
colnames(df)

### 2.1. Mismatched data types

In [None]:
# Print the structure of df (column names, types and first values).
str(df)

In [None]:
# All columns have correct data type

### 2.2. Mixed data values

In [None]:
# Example: (female, woman, man, male) => (male, female)

In [None]:
# This dataset contains only numeric data => no mixed data values.

### 2.3. Missing data

#### 2.3.1 Null data

In [None]:
# Total number of NA cells across the whole data frame.
sum(is.na(df))

In [None]:
# This dataset has no missing values

#### 2.3.2 Missing data in specific columns

In [None]:
# In the "Age" column, a value of 14 encodes
# "Don’t know / Refused / Missing" (_AGEG5YR - BRFSS), so we check whether it occurs.

In [None]:
# Number of rows whose Age equals 14 (NA comparisons are not counted).
sum(df$Age == 14, na.rm = TRUE)

In [None]:
# Conclusion: The dataset has no missing data.

In [None]:
# Print the structure of df once more after the data-quality checks.
str(df)

## 3. Exploratory data analysis (EDA)

### 3.1. Response variable analysis

In [None]:
# Bar plot of the frequency of each Diabetes_binary value.
barplot(table(df$Diabetes_binary))

### 3.2. Numeric variables analysis

In [None]:
# Numeric variables (3): BMI, MentHlth, PhysHlth

#### 3.2.0. Overview

In [None]:
# One box plot per numeric variable, laid out side by side (1 row x 3 columns).
par(mfrow = c(1, 3))
for (var_name in c("BMI", "MentHlth", "PhysHlth")) {
  boxplot(df[[var_name]], main = var_name)
}

In [None]:
# One histogram per numeric variable, stacked vertically (3 rows x 1 column).
par(mfrow = c(3, 1))
hist_xlabs <- c(
  BMI = "BMI score",
  MentHlth = "Number of day with Mental Health problem in last 30 days",
  PhysHlth = "Number of day with Physical Health problem in last 30 days"
)
for (var_name in names(hist_xlabs)) {
  # The variable name is the panel title; the x-axis label comes from the map.
  hist(df[[var_name]], main = var_name, xlab = hist_xlabs[[var_name]])
}

#### 3.2.1. BMI

In [None]:
# Summary statistics (min, quartiles, mean, max) of BMI.
summary(df$BMI)

In [None]:
# The maximum BMI value is 98, which is abnormally high.

In [None]:
# Box plot of BMI to visualise its spread and outliers.
boxplot(df$BMI)

In [None]:
# We see that dataset has several outliers in "BMI" column.

In [None]:
# Outlier fences for BMI: 1.5 * IQR below the first quartile and above the
# third quartile.
BMI_quartiles <- quantile(df$BMI, probs = c(0.25, 0.75))
BMI_q1 <- BMI_quartiles[1]
BMI_q3 <- BMI_quartiles[2]
BMI_iqr <- BMI_q3 - BMI_q1
BMI_lower_bound <- BMI_q1 - 1.5 * BMI_iqr
BMI_upper_bound <- BMI_q3 + 1.5 * BMI_iqr

In [None]:
# Keep the rows whose BMI is NOT inside the [lower, upper] fence interval.
BMI_outlier_df <- df[!(df$BMI >= BMI_lower_bound & df$BMI <= BMI_upper_bound), ]

In [None]:
# Share of rows flagged as BMI outliers, in percent.
nrow(BMI_outlier_df) / nrow(df) * 100

In [None]:
# The outliers account for approximately 4% of the rows.
# Extreme BMI values (higher than 40) could affect the model.

# To standardize BMI, the WHO classification is used.
# Reference: https://www.who.int/europe/news-room/fact-sheets/item/a-healthy-lifestyle---who-recommendations

# |   BMI     | Nutritional Status | Categorical |
# |-----------|--------------------|-------------|
# | < 18.5    | Underweight        |      1      |
# | 18.5–24.9 | Normal weight      |      2      |
# | 25.0–29.9 | Pre-obesity        |      3      |
# | 30.0–34.9 | Obesity class I    |      4      |
# | 35.0–39.9 | Obesity class II   |      5      |
# | ≥ 40.0    | Obesity class III  |      6      |

# Build df_std_1: a copy of df with a zero-filled StdBMI placeholder column
# inserted immediately after the BMI column.
new_column <- numeric(nrow(df))
BMI_index <- which(names(df) == "BMI")
df_std_1 <- cbind(
  df[, seq_len(BMI_index)],
  StdBMI = new_column,
  df[, (BMI_index + 1):ncol(df)]
)

In [None]:
# Display df_std_1 with the placeholder StdBMI column.
df_std_1

In [None]:
# Encode BMI as the six WHO classes:
#   1: BMI < 18.5, 2: [18.5, 25), 3: [25, 30), 4: [30, 35), 5: [35, 40),
#   6: BMI >= 40.
# findInterval() counts how many break points are <= each BMI value using
# left-closed intervals, so adding 1 yields the class code directly.
# This replaces six `df[cond, ]$col = value` assignments, which raise an
# error whenever a class has no matching rows and rescan the frame each time.
BMI_breaks <- c(18.5, 25, 30, 35, 40)
df_std_1$StdBMI <- findInterval(df_std_1$BMI, BMI_breaks) + 1

In [None]:
# Display df_std_1 after StdBMI has been filled in.
df_std_1

In [None]:
# Number of rows in each StdBMI class.
table(df_std_1$StdBMI)

In [None]:
# Bar plot of the StdBMI class frequencies.
barplot(table(df_std_1$StdBMI), main="BMI groups (WHO recommendation)")

In [None]:
# Print the structure of df_std_1 to check the new StdBMI column.
str(df_std_1)

In [None]:
# Comment: StdBMI data are categorical.

In [None]:
# We must now validate that this standardization is meaningful with respect to the response variable.

In [None]:
# Contingency table: StdBMI class (rows) by Diabetes_binary (columns).
table(df_std_1$StdBMI, df_std_1$Diabetes_binary)

In [None]:
# Pearson chi-squared test of independence between StdBMI class and
# Diabetes_binary, computed on their contingency table.
chisq.test(table(df_std_1$StdBMI, df_std_1$Diabetes_binary))

In [None]:
# Comment: A p-value close to 0 shows that there is a difference between diabetes rates among BMI groups.

#### BMI output: df_std_1

In [None]:
# Print the structure of df_std_1, the output of the BMI step.
str(df_std_1)

#### 3.2.2. MentHlth

In [None]:
# Summary statistics (min, quartiles, mean, max) of MentHlth.
summary(df_std_1$MentHlth)

In [None]:
# Box plot of MentHlth.
boxplot(df_std_1$MentHlth)

In [None]:
# Bar plot of how often each distinct MentHlth value occurs.
barplot(table(df_std_1$MentHlth))

In [None]:
# Zero values overwhelmingly dominate all other values.
# Create a binary feature indicating whether a person has had a mental health problem.

In [None]:
# Build df_std_2: a copy of df_std_1 with a zero-filled BoolMentHlth
# placeholder column inserted immediately after the MentHlth column.
new_column <- numeric(nrow(df_std_1))
MentHlth_index <- which(names(df_std_1) == "MentHlth")
df_std_2 <- cbind(
  df_std_1[, seq_len(MentHlth_index)],
  BoolMentHlth = new_column,
  df_std_1[, (MentHlth_index + 1):ncol(df_std_1)]
)

In [None]:
# BoolMentHlth is 1 when MentHlth > 0 and 0 otherwise.
# The whole column is computed in one vectorised step; the previous
# `df[cond, ]$col = 1` form raises an error when no row satisfies the
# condition. as.numeric() keeps the column stored as a double (0/1).
df_std_2$BoolMentHlth <- as.numeric(df_std_2$MentHlth > 0)

In [None]:
# Bar plot of the BoolMentHlth frequencies (0 vs 1).
barplot(table(df_std_2$BoolMentHlth), main="BoolMentHlth")

In [None]:
# Chi-squared test of independence between BoolMentHlth and Diabetes_binary.
# NOTE(review): for a 2 x 2 table chisq.test() applies Yates' continuity
# correction by default (correct = TRUE) — confirm this is intended.
chisq.test(table(df_std_2$BoolMentHlth, df_std_2$Diabetes_binary))

In [None]:
# Comment: A p-value close to 0 shows that there is a difference between diabetes rates among BoolMentHlth groups.

#### MentHlth output: df_std_2

In [None]:
# Print the structure of df_std_2, the output of the MentHlth step.
str(df_std_2)

#### 3.2.3. PhysHlth

In [None]:
# Summary statistics (min, quartiles, mean, max) of PhysHlth.
summary(df_std_2$PhysHlth)

In [None]:
# Box plot of PhysHlth.
# Reads from df_std_2 (the output of the MentHlth step), like the other
# cells of this section, instead of the earlier df_std_1.
boxplot(df_std_2$PhysHlth)

In [None]:
# Bar plot of how often each distinct PhysHlth value occurs.
barplot(table(df_std_2$PhysHlth))

In [None]:
# Zero values overwhelmingly dominate all other values.
# Create a binary feature indicating whether a person has had a physical health problem.

In [None]:
# Build df_std_3: a copy of df_std_2 with a zero-filled BoolPhysHlth
# placeholder column inserted immediately after the PhysHlth column.
new_column <- numeric(nrow(df_std_2))
PhysHlth_index <- which(names(df_std_2) == "PhysHlth")
df_std_3 <- cbind(
  df_std_2[, seq_len(PhysHlth_index)],
  BoolPhysHlth = new_column,
  df_std_2[, (PhysHlth_index + 1):ncol(df_std_2)]
)

In [None]:
# BoolPhysHlth is 1 when PhysHlth > 0 and 0 otherwise.
# The whole column is computed in one vectorised step; the previous
# `df[cond, ]$col = 1` form raises an error when no row satisfies the
# condition. as.numeric() keeps the column stored as a double (0/1).
df_std_3$BoolPhysHlth <- as.numeric(df_std_3$PhysHlth > 0)

In [None]:
# Bar plot of the BoolPhysHlth frequencies (0 vs 1).
barplot(table(df_std_3$BoolPhysHlth), main="BoolPhysHlth")

In [None]:
# Chi-squared test of independence between BoolPhysHlth and Diabetes_binary.
# NOTE(review): for a 2 x 2 table chisq.test() applies Yates' continuity
# correction by default (correct = TRUE) — confirm this is intended.
chisq.test(table(df_std_3$BoolPhysHlth, df_std_3$Diabetes_binary))

In [None]:
# Comment: A p-value close to 0 shows that there is a difference between diabetes rates among BoolPhysHlth groups.

#### PhysHlth output: df_std_3

In [None]:
# Print the structure of df_std_3, the output of the PhysHlth step.
str(df_std_3)

### 3.3. Categorical variables analysis (except "StdBMI", "BoolMentHlth", "BoolPhysHlth")

In [None]:
# Categorical variables (18): HighBP, HighChol, CholCheck, Smoker, Stroke,
# HeartDiseaseorAttack, PhysActivity, Fruits, Veggies, HvyAlcoholConsump,
# AnyHealthcare, NoDocbcCost, GenHlth, DiffWalk, Sex, Age, Education, Income.

In [None]:
# Plot the first 9 categorical variables:
# HighBP, HighChol, CholCheck, Smoker, Stroke,
# HeartDiseaseorAttack, PhysActivity, Fruits, Veggies.
par(mfrow = c(3, 3))  # 3 x 3 grid, one panel per variable
for (var_name in c("HighBP", "HighChol", "CholCheck", "Smoker", "Stroke",
                   "HeartDiseaseorAttack", "PhysActivity", "Fruits",
                   "Veggies")) {
  # horiz = TRUE is spelled out because the shorthand T is an ordinary
  # variable that can be reassigned.
  barplot(table(df[[var_name]]), horiz = TRUE, main = var_name)
}

In [None]:
# Plot the last 9 categorical variables:
# HvyAlcoholConsump, AnyHealthcare, NoDocbcCost, GenHlth,
# DiffWalk, Sex, Age, Education, Income.
par(mfrow = c(3, 3))  # 3 x 3 grid, one panel per variable
for (var_name in c("HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost",
                   "GenHlth", "DiffWalk", "Sex", "Age", "Education",
                   "Income")) {
  # horiz = TRUE is spelled out because the shorthand T is an ordinary
  # variable that can be reassigned.
  barplot(table(df[[var_name]]), horiz = TRUE, main = var_name)
}

### 3.4. Correlation analysis

In [None]:
# Names of the numeric columns.
numeric_col <- c(
  "BMI",
  "MentHlth",
  "PhysHlth"
)

In [None]:
# Number of numeric columns listed.
length(numeric_col)

In [None]:
# Names of the binary (0/1) columns, including the derived Bool* flags.
binary_col <- c(
  "HighBP", "HighChol", "CholCheck", "Smoker",
  "Stroke", "HeartDiseaseorAttack", "PhysActivity", "Fruits",
  "Veggies", "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost",
  "BoolMentHlth", "BoolPhysHlth", "DiffWalk", "Sex"
)

In [None]:
# Number of binary columns listed.
length(binary_col)

In [None]:
# Names of the ordinal columns, including the derived StdBMI class.
ordinal_col <- c(
  "StdBMI",
  "GenHlth",
  "Age",
  "Education",
  "Income"
)

In [None]:
# Number of ordinal columns listed.
length(ordinal_col)

In [None]:
# Name of the response (target) column.
response_col <- "Diabetes_binary"

#### 3.4.1. Numeric variable analysis

In [None]:
# Correlation matrix of the numeric columns, drawn as numbers
# (upper triangle only).
corrplot(cor(df[numeric_col]), method = "number", type = "upper")

In [None]:
# Same correlation matrix of the numeric columns, drawn as circles
# (upper triangle only).
corrplot(cor(df[numeric_col]), method = "circle", type = "upper")

In [None]:
# Comment: We see no multicollinearity here.

#### 3.4.2. Numeric variable analysis vs response variable

In [None]:
# Correlation estimate between BMI and the response; cor.test() uses the
# Pearson method by default, and only the estimate is extracted.
cor.test(df_std_3[["BMI"]], df_std_3[["Diabetes_binary"]])[["estimate"]]

In [None]:
# Kendall rank correlation test between the StdBMI class and the response.
# NOTE(review): base R's Kendall computation is reportedly O(n^2), so this may
# be slow on the full dataset — confirm the runtime is acceptable.
cor.test(df_std_3$StdBMI, df_std_3$Diabetes_binary, method = "kendall")

## 4. Model building

## TEST

In [None]:
# Two identical 0/1 toy vectors used to try out cor.test().
x <- c(0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0)
y <- x

In [None]:
# Correlation test on the toy vectors x and y (Pearson method by default).
cor.test(x,y)

In [None]:
# 2 x 3 toy contingency table, written row by row.
data <- matrix(
  c(6, 8, 12,
    9, 5, 10),
  nrow = 2,
  byrow = TRUE
)

In [None]:
# Cramér's V (rcompanion) for the toy contingency table.
cramerV(data)