# Chapter 1 

## Exercise 1

This simple data analysis pipeline uses a [decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning) model to classify flowers from the famous [iris dataset](https://archive.ics.uci.edu/ml/datasets/iris).

Questions:

1. Look through the example pipeline. Mark which cells belong to which categories in this categorization:
    - Data preparation and loading
    - Modeling
    - Evaluation
    - Presentation
2. Try to determine the pieces of the code that are **unique** to this specific dataset. Discuss how you would generalize this pipeline to other datasets.

In [None]:
# Cell 1
library(tidyverse)
library(broom)
library(modelr)
library(rpart)
library(caret)

In [None]:
# Cell 2
# Read the iris measurements. Column names are passed explicitly, so the first
# line of the file is treated as data rather than as a header. Each row gets an
# `id` holding its original position, and the species label is stripped of its
# 'Iris-' prefix and stored as a factor.
iris <- read_csv(
    '../data/iris.data',
    col_names=c(
        'Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species'
    )
) %>%
    rowid_to_column('id') %>%
    mutate(Species=as.factor(str_remove(Species, 'Iris-')))
print(iris)

In [None]:
# Cell 3
# Fix the RNG seed so the row shuffle below is reproducible.
set.seed(42)

# Draw a random permutation of the row indices and reorder the table with it.
sample_ix <- sample(nrow(iris))
iris <- iris[sample_ix, ]
head(iris)  # the `id` column shows the rows are no longer in file order

# Drop the row id now that the shuffle has been shown; otherwise the
# `Species ~ .` formula used for the model would pick it up as a predictor.
iris <- iris %>%
    select(-id)
head(iris)

In [None]:
# Partition the shuffled rows into an 80% `train` and a 20% `test` resample.
iris_split <- resample_partition(iris, c(train=0.8, test=0.2))

# Number of rows per species in the training partition.
iris_split$train %>%
    as_tibble() %>%
    count(Species)

# Number of rows per species in the test partition.
iris_split$test %>%
    as_tibble() %>%
    count(Species)

In [None]:
# Fit a classification tree that predicts Species from every other column of
# the training partition (the resample is materialised as a tibble first).
fitted_tree <- rpart(
    Species ~ .,
    data=as_tibble(iris_split$train),
    method='class'
)
fitted_tree

In [None]:
# Score the held-out test rows: `Prediction` is the class predicted by the
# fitted tree, and `tree_failure` is TRUE wherever it differs from the true
# Species.
iris_fitted <- iris_split$test %>%
    as_tibble() %>%
    mutate(
        Prediction=predict(fitted_tree, newdata=., type='class'),
        tree_failure=Species != Prediction
    )

In [None]:
# caret's confusionMatrix() expects the predicted classes as `data` and the
# true classes as `reference`. Both are passed by name so they cannot be
# swapped; swapping them transposes the table and mixes up the per-class
# statistics (e.g. sensitivity vs. positive predictive value).
confusionMatrix(data=iris_fitted$Prediction, reference=iris_fitted$Species)

In [None]:
# Set the repr plot-size options (width 12, height 5) used when figures are
# rendered.
options(repr.plot.width=12, repr.plot.height=5)

# Test rows in petal space, drawn as open circles and coloured by whether the
# tree misclassified them.
ggplot(iris_fitted, aes(x=Petal.Width, y=Petal.Length, color=tree_failure)) +
    geom_point(shape=1)

# The same rows in sepal space.
ggplot(iris_fitted, aes(x=Sepal.Width, y=Sepal.Length, color=tree_failure)) +
    geom_point(shape=1)