# `Predicting Stroke Occurance Using KNN Classification` #
Gael Hernandez Palmer, Asha Octoman, Christina Yang, Julius Brooker

## Introduction ##

#### *Background Information:* #### 
...

#### *Exploratory Question:* ####
...

#### *Dataset:* ####
...


## Preliminary Data Analysis ##

In [1]:
# loading packages
library(repr)
library(tidyverse)
library(tidymodels)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔

In [2]:
# loading data
stroke <- read_csv("data/stroke-data.csv")
colnames(stroke)

Parsed with column specification:
cols(
  id = [32mcol_double()[39m,
  gender = [31mcol_character()[39m,
  age = [32mcol_double()[39m,
  hypertension = [32mcol_double()[39m,
  heart_disease = [32mcol_double()[39m,
  ever_married = [31mcol_character()[39m,
  work_type = [31mcol_character()[39m,
  Residence_type = [31mcol_character()[39m,
  avg_glucose_level = [32mcol_double()[39m,
  bmi = [31mcol_character()[39m,
  smoking_status = [31mcol_character()[39m,
  stroke = [32mcol_double()[39m
)



In [51]:
# cleaning and wrangling
stroke_clean <- stroke %>%
    select(gender, age, hypertension, heart_disease, avg_glucose_level, bmi, smoking_status, stroke) %>%
    mutate(gender = as_factor(gender), 
           bmi = as.numeric(bmi),
           smoking_status = as_factor(smoking_status),
           stroke = as_factor(stroke))

“Problem with `mutate()` input `bmi`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `bmi` is `as.numeric(bmi)`.”
“NAs introduced by coercion”


In [52]:
head(stroke_clean)

gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>
Male,67,0,1,228.69,36.6,formerly smoked,1
Female,61,0,0,202.21,,never smoked,1
Male,80,0,1,105.92,32.5,never smoked,1
Female,49,0,0,171.23,34.4,smokes,1
Female,79,1,0,174.12,24.0,never smoked,1
Male,81,0,0,186.21,29.0,formerly smoked,1


In [53]:
num_obs <- nrow(stroke_clean)
stroke_count <- stroke_clean %>%
  group_by(stroke) %>%
  summarize(
    count = n(),
    percentage = n() / num_obs * 100
  )
stroke_count

`summarise()` ungrouping output (override with `.groups` argument)



stroke,count,percentage
<fct>,<int>,<dbl>
0,4861,95.127202
1,249,4.872798


In [54]:
# data splitting
set.seed(1)

stroke_split <- initial_split(stroke_clean, prop = 0.75, strata = stroke)
stroke_train <- training(stroke_split)
stroke_test <- testing(stroke_split)

In [55]:
stroke_train_proportions <- stroke_train %>%
                      group_by(stroke) %>%
                      summarize(n = n()) %>%
                      mutate(percent = 100*n/nrow(stroke_train))
stroke_test_proportions <- stroke_test %>%
                      group_by(stroke) %>%
                      summarize(n = n()) %>%
                      mutate(percent = 100*n/nrow(stroke_test))
stroke_train_proportions
stroke_test_proportions

`summarise()` ungrouping output (override with `.groups` argument)

`summarise()` ungrouping output (override with `.groups` argument)



stroke,n,percent
<fct>,<int>,<dbl>
0,3656,95.382207
1,177,4.617793


stroke,n,percent
<fct>,<int>,<dbl>
0,1205,94.361785
1,72,5.638215


In [56]:
# data analysis and visualization

In [57]:
stroke_recipe <- recipe(stroke ~ ., data = stroke_train) %>%
    step_rm(gender, smoking_status) %>%
    step_scale(all_predictors()) %>%
    step_center(all_predictors())

In [58]:
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) %>%
  set_engine("kknn") %>%
  set_mode("classification")

In [59]:
# fit the knn model
knn_fit <- workflow() %>%
  add_recipe(stroke_recipe) %>%
  add_model(knn_spec) %>%
  fit(data = stroke_train)

In [60]:
# get predictions on the validation data
validation_predicted <- predict(knn_fit, stroke_test) %>%
  bind_cols(stroke_test)

ERROR: Error: Can't recycle `..1` (size 1233) to match `..2` (size 1277).


In [None]:
# compute the accuracy
acc <- validation_predicted %>%
  metrics(truth = stroke, estimate = .pred_class) %>%
  filter(.metric == "accuracy") %>%
  select(.estimate) %>%
  pull()

acc

## Methods ##
...

## Expected Outcomes ##
...

## Significance ##
...