# CS6140 Assignments

**Instructions**
1. In each assignment cell, look for the block:
 ```
  #BEGIN YOUR CODE
  raise NotImplementedError.new()
  #END YOUR CODE
 ```
1. Replace this block with your solution.
1. Test your solution by running the cells following your block (indicated by ##TEST##)
1. Click the "Validate" button above to validate the work.

**Notes**
* You may add other cells and functions as needed
* Keep all code in the same notebook
* In order to receive credit, code must "Validate" on the JupyterHub server

---

# Assignment 2: Decision Trees (1)

---
**Setup**

In [3]:
require 'test/unit/assertions'
require 'daru'
require 'distribution'
require 'json'

include Test::Unit::Assertions

## Loads data files
## Loads a dataset stored as "<prefix>.header" (comma-separated column names)
## and "<prefix>.csv" (one comma-separated example per line).
##
## Returns a hash with three entries:
##   "classes"  => hash mapping each label to the number of rows carrying it
##   "features" => every header name except the last one
##   "data"     => array of rows shaped {"features" => {name => value}, "label" => int}
##
## Rows are stored sparsely: a feature whose value parses to zero is left out.
def read_sparse_data_from_csv prefix
  data = []
  classes = Hash.new {|h,k| h[k] = 0}
  header = File.read(prefix + ".header").chomp.split(",")

  # Block form of File.open closes the handle as soon as the block returns.
  File.open(prefix + ".csv") do |file|
    file.each_line do |line|
      cells = line.chomp.split(",")
      # A blank line splits into an empty array; skip it.
      next if cells.empty?

      row = {"features" => {}}
      header.each_with_index do |name, col|
        value = cells[col].to_f
        if name == "label"
          row["label"] = value.to_i
        elsif !value.zero?
          row["features"][name] = value
        end
      end

      classes[row["label"]] += 1
      data << row
    end
  end

  {"classes" => classes, "features" => header[0, header.size - 1], "data" => data}
end

"if(window['d3'] === undefined ||\n   window['Nyaplot'] === undefined){\n    var path = {\"d3\":\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min\",\"downloadable\":\"http://cdn.rawgit.com/domitry/d3-downloadable/master/d3-downloadable\"};\n\n\n\n    var shim = {\"d3\":{\"exports\":\"d3\"},\"downloadable\":{\"exports\":\"downloadable\"}};\n\n    require.config({paths: path, shim:shim});\n\n\nrequire(['d3'], function(d3){window['d3']=d3;console.log('finished loading d3');require(['downloadable'], function(downloadable){window['downloadable']=downloadable;console.log('finished loading downloadable');\n\n\tvar script = d3.select(\"head\")\n\t    .append(\"script\")\n\t    .attr(\"src\", \"http://cdn.rawgit.com/domitry/Nyaplotjs/master/release/nyaplot.js\")\n\t    .attr(\"async\", true);\n\n\tscript[0][0].onload = script[0][0].onreadystatechange = function(){\n\n\n\t    var event = document.createEvent(\"HTMLEvents\");\n\t    event.initEvent(\"load_nyaplot\",false,false);\n\t    win

:read_sparse_data_from_csv

In [3]:
#Preview 2 lines from the Iris dataset
# read_sparse_data_from_csv reads "iris.header" and "iris.csv";
# [0,2] slices two rows starting at index 0.
iris = read_sparse_data_from_csv "iris"
iris["data"][0,2]

[{"features"=>{"sepal_length"=>7.0, "sepal_width"=>3.2, "petal_length"=>4.7, "petal_width"=>1.4}, "label"=>1}, {"features"=>{"sepal_length"=>5.7, "sepal_width"=>3.0, "petal_length"=>4.2, "petal_width"=>1.2}, "label"=>1}]

In [4]:
#Preview 2 lines from the Spambase dataset
# read_sparse_data_from_csv reads "spambase.header" and "spambase.csv";
# [0,2] slices two rows starting at index 0.
spambase = read_sparse_data_from_csv "spambase"
spambase["data"][0,2]

[{"features"=>{"word_freq_our"=>0.27, "word_freq_mail"=>0.83, "word_freq_you"=>0.27, "word_freq_your"=>0.27, "word_freq_font"=>8.58, "char_freq_["=>0.092, "char_freq_$"=>0.185, "char_freq_#"=>0.232, "capital_run_length_average"=>7.313, "capital_run_length_longest"=>99.0, "capital_run_length_total"=>607.0}, "label"=>1}, {"features"=>{"word_freq_your"=>0.9, "word_freq_george"=>0.9, "word_freq_data"=>0.9, "char_freq_["=>0.14, "capital_run_length_average"=>3.472, "capital_run_length_longest"=>28.0, "capital_run_length_total"=>125.0}, "label"=>0}]

---
## Question 1.1 (10 Points)
Write a function that calculates the class distribution, $p(c)$, for all classes $c$ for a dataset. 

In [None]:
# Computes the empirical class distribution p(c) of a dataset.
#
# dataset: enumerable of rows, each a hash with a "label" entry.
# Returns a hash mapping every label that occurs to its relative frequency
# (a Float). An empty dataset yields an empty hash. Looking up a label that
# never occurred returns 0 and leaves the hash unchanged, so the number of
# entries always equals the number of observed classes.
def class_distribution dataset
  # BEGIN YOUR CODE
  counts = Hash.new(0)
  dataset.each { |row| counts[row["label"]] += 1 }

  total = counts.values.sum.to_f

  # Plain default value (no default block): reads never insert keys.
  distribution = Hash.new(0)
  counts.each { |label, count| distribution[label] = count / total }
  distribution
  #END YOUR CODE
end

In [None]:
### TESTS ###
# Check that there are three classes
# class_distribution returns one entry per distinct label, so its size is the class count.
t1_iris_dist = class_distribution iris["data"]
t1_iris_num_classes = 3
assert_equal(t1_iris_num_classes, t1_iris_dist.size)

In [None]:
### TESTS ###
# Check that each class has a probability 1/3
t1_iris_dist = class_distribution iris["data"]
t1_iris_num_classes = 3
t1_iris_num_classes.times do |cls|
  # assert_in_delta takes (expected, actual, delta); labels are 0, 1, 2.
  assert_in_delta 0.33333, t1_iris_dist[cls], 1e-4
end

In [None]:
### TESTS ###
# Check that the Spambase dataset has exactly two classes
t1_spambase_dist = class_distribution spambase["data"]
t1_spambase_num_classes = 2
assert_equal(t1_spambase_num_classes, t1_spambase_dist.size)

In [None]:
### TESTS ###
# Check the approximate class proportions of the Spambase dataset
# (assert_in_delta takes expected, actual, delta)
t1_spambase_dist = class_distribution spambase["data"]
assert_in_delta 0.6, t1_spambase_dist[0], 1e-1
assert_in_delta 0.4, t1_spambase_dist[1], 1e-1

## Question 2 (20 Points)
Adapting your code for calculating the entropy from [Assignment 1](../assignment-1/assignment-1.ipynb), use input from a hash table rather than an array. 

In [9]:
# Shannon entropy (natural log) of a distribution given as a hash.
#
# dist: hash mapping class => non-negative weight. The weights may be
#       probabilities or raw counts; they are normalized by their sum.
# Returns a Float. An empty hash, or one whose weights sum to zero, gives 0.0.
# Classes with zero weight contribute nothing (0 * log 0 is taken as 0).
def entropy dist
  # BEGIN YOUR CODE
  total = dist.each_value.reduce(0.0) { |acc, weight| acc + weight }
  return 0.0 if total.zero?

  dist.each_value.reduce(0.0) do |h, weight|
    prob = weight / total
    # Skip only non-positive probabilities: Math.log is undefined there,
    # while any positive probability, however small, carries entropy.
    prob > 0 ? h - prob * Math.log(prob) : h
  end
  #END YOUR CODE
end

:entropy

In [None]:
### TESTS ###
#Checks the class entropy for Iris dataset
t2_iris_dist = class_distribution iris["data"]
t2_iris_entropy = entropy t2_iris_dist
# assert_equal's third argument is a failure message, not a tolerance;
# assert_in_delta is the float comparison that honours the 1e-4 delta.
assert_in_delta 1.0986122886681096, t2_iris_entropy, 1e-4

In [None]:
### TESTS ###
#Checks the class entropy for Spambase dataset
t2_spambase_dist = class_distribution spambase["data"]
t2_spambase_entropy = entropy t2_spambase_dist
# assert_equal's third argument is a failure message, not a tolerance;
# assert_in_delta is the float comparison that honours the 1e-4 delta.
assert_in_delta 0.6705230209876485, t2_spambase_entropy, 1e-4

In [None]:
### TESTS ###
# Checks that code handles a class with zero members of a class
t2_zero_dist = {0 => 0.0, 1 => 107.0}
t2_zero_entropy = entropy t2_zero_dist
# assert_equal's third argument is a failure message, not a tolerance;
# assert_in_delta is the float comparison that honours the 1e-4 delta.
assert_in_delta 0.0, t2_zero_entropy, 1e-4

## Question 3.1 (5 points)

Implement information gain given an initial entropy $H_0$ and a hash of splits, where each key of the hash is a split criterion and the corresponding value is an array of all examples matching that split. Call the ```class_distribution``` and ```entropy``` functions you have defined above.

The formula for information gain is as follows:

## $IG(Q, V) = H_0 - \sum_{v\in V} \frac{|v|}{\sum_{v'\in V} |v'|} H(c\mid v) $ 

In [1]:
# Information gain of a partition relative to an initial entropy.
#
# h0:     entropy before the split.
# splits: hash mapping a split criterion to the array of examples in that
#         partition (each example a hash with a "label" entry).
# Returns h0 minus the size-weighted sum of the partition entropies. When
# there are no examples at all (no partitions, or only empty ones) nothing
# can be gained or lost, so h0 is returned unchanged.
def information_gain h0, splits
  # BEGIN YOUR CODE
  total = splits.each_value.sum(&:length)
  # Guard: without it, all-empty partitions would divide by zero below.
  return h0 if total.zero?

  splits.each_value.reduce(h0) do |gain, rows|
    gain - entropy(class_distribution(rows)) * rows.length / total
  end
  #END YOUR CODE
end

:information_gain

In [4]:
### TEST: Random Split ###
# Assigns every iris row to "l" or "r" with probability 0.5 each, independent of its label.
t3_random_split = iris["data"].group_by {|row| rand > 0.5 ? "l" : "r"}
# Class entropy of the full iris dataset; also reused by the perfect-split test cell.
t3_entropy = entropy(class_distribution(iris["data"]))


NameError: undefined local variable or method `iris' for main:Object

## Question 3.2 (5 points)

The test above verifies the information gain when the iris data is split with uniform probability. What should the information gain be here if we were to do this for the **spambase** dataset?

In [None]:
## Make this function return a constant equal to your answer to the question above.
# Answer to Question 3.2, returned as a constant as the prompt requires.
#
# A uniformly random split is independent of the class label, so each side
# keeps (in expectation) the class distribution of the whole dataset. The
# weighted entropy after the split therefore equals the entropy before it,
# and the information gain is zero, for spambase as for any other dataset.
def t3_random_information_gain_spambase
  # BEGIN YOUR CODE
  0.0
  #END YOUR CODE
end

In [None]:
### TEST (Hidden) ###

## Question 3.3 (10 points)

The test below verifies the information gain of the iris data when split by class value. What should the information gain be here if we were to do this for the **spambase** dataset?

In [None]:
### TEST: Perfect split (by class)
# Grouping by label makes every partition contain a single class.
t3_perfect_split = iris["data"].group_by {|row| row["label"]}
# t3_entropy is defined in the random-split test cell, which must have been run first.
t3_perfect_information_gain = information_gain t3_entropy, t3_perfect_split

In [5]:
## Make this function return a constant equal to your answer to the question above.
# Answer to Question 3.3, returned as a constant as the prompt requires.
#
# Splitting by class label leaves every partition with a single class, so
# each partition has entropy 0 and the information gain equals the initial
# class entropy of the dataset. For spambase that entropy is the value
# checked by the Question 2 spambase test.
def t3_perfect_information_gain_spambase
  # BEGIN YOUR CODE
  0.6705230209876485
  #END YOUR CODE
end

:t3_perfect_information_gain_spambase

In [None]:
### TEST (Hidden) ###

## Question 4 (20 points)
Considering only the figure below, which shows the iris dataset colored by class, what value of petal_length would have the best information gain?

In [10]:
# Scatter plot of the iris rows: petal_length on the x axis, sepal_width on
# the y axis, points colored by class label.
# NOTE(review): df receives the return value of .plot, not the DataFrame itself.
d = iris["data"]
df = Daru::DataFrame.new({
  x1: d.collect{|r| r["features"]["petal_length"]}, 
  x2: d.collect{|r| r["features"]["sepal_width"]}, 
  label: d.collect{|r| r["label"]}
}).to_category(:label).plot(type: :scatter, x: :x1, y: :x2, categorized: {by: :label, method: :color}) do |plot, diagram|
  # Axis titles are looked up by header position (index 2 and index 1);
  # presumably petal_length and sepal_width — confirm against iris.header.
  plot.x_label iris["features"][2]
  plot.y_label iris["features"][1]
  plot.legend true
end

In [6]:
## Make this function return a constant equal to your answer to the question above.
# Searches for the petal_length threshold with the highest information gain
# on the iris dataset.
#
# Sweeps thresholds 0.00, 0.01, ..., 7.50 and splits the rows into
# "l" (petal_length < threshold) and "r" (otherwise). Returns the first
# threshold that reaches the maximum gain, or 0.0 if no threshold has a
# positive gain.
#
# NOTE(review): the prompt asks for a constant; this computes the value from
# "iris.header" / "iris.csv" on every call instead.
def t4_best_split_for_petal_length_guess
  # BEGIN YOUR CODE
  iris = read_sparse_data_from_csv "iris"
  rows = iris["data"]
  h0 = entropy(class_distribution(rows))

  best_threshold = 0.0
  best_gain = 0.0
  # Derive each threshold from an integer step so that rounding error does
  # not accumulate across the sweep (repeated += 0.01 drifts).
  (0..750).each do |step|
    threshold = step / 100.0
    groups = rows.group_by do |row|
      # Rows are sparse: a missing feature means 0.0, hence the to_f.
      row["features"]["petal_length"].to_f < threshold ? "l" : "r"
    end
    gain = information_gain h0, groups
    if gain > best_gain
      best_threshold = threshold
      best_gain = gain
    end
  end
  best_threshold
  #END YOUR CODE
end

:t4_best_split_for_petal_length_guess

In [7]:
### TEST (Hidden) ###

## Question 5 (20 points)
Implement a function which takes a feature name and a real-valued threshold and splits data into two groups: 
* Strictly less than the threshold
* Greater than or equal to the threshold.

This function returns a structure that can be used to calculate the information gain. An example is as follows:
```
{"petal_length < X1":[
        {"features":{"sepal_length":4.8,"sepal_width":3.1,"petal_length":1.6,"petal_width":0.2},"label":0},
],"petal_length >= X1":[
        {"features":{"sepal_length":4.8,"sepal_width":3.1,"petal_length":1.6,"petal_width":0.2},"label":0},
]}
```

In [14]:
# Partitions the rows of x by comparing feature k against threshold v.
#
# Rows whose feature value (a missing feature counts as 0.0) is strictly less
# than v are collected under "l"; every other row goes under "r". A side that
# receives no rows has no key in the returned hash.
def split_on_numeric_value x, k, v
  # BEGIN YOUR CODE
  x.group_by do |row|
    feature_value = row["features"][k].to_f
    if feature_value < v
      "l"
    else
      "r"
    end
  end
  #END YOUR CODE
end

:split_on_numeric_value

In [None]:
### TEST ###

# Checks for the number of examples when split on petal_length = 1.7
t5_iris_splits = split_on_numeric_value iris["data"], "petal_length", 1.7
# Sort the partition sizes so the assertions do not depend on the hash key order.
t5_split_sizes = t5_iris_splits.values.collect {|v| v.size}.sort
t5_num_splits = 2
assert_equal t5_num_splits, t5_split_sizes.size
assert_equal 44, t5_split_sizes[0]
assert_equal 106, t5_split_sizes[1]

In [None]:
### TEST ###

# Checks the information gain for this split
# (reuses t5_iris_splits from the previous test cell, which must have been run first)
t5_iris_entropy = entropy(class_distribution(iris["data"]))
t5_iris_information_gain = information_gain t5_iris_entropy, t5_iris_splits
assert_in_delta 0.48280104455013506, t5_iris_information_gain, 5e-2

In [None]:
### TEST ###

# Checks the information gain for a split on the spambase dataset
# (feature "char_freq_$" at threshold 0.056)
t5_spambase_splits = split_on_numeric_value spambase["data"], "char_freq_$", 0.056
t5_spambase_entropy = entropy(class_distribution(spambase["data"]))
t5_spambase_information_gain = information_gain t5_spambase_entropy, t5_spambase_splits
assert_in_delta 0.17012249631509135, t5_spambase_information_gain, 5e-2

In [5]:
### TEST ###
# Plots the information gain of a petal_length split at every distinct
# petal_length value that occurs in the iris data.
sorted_values = iris["data"].collect {|r| r["features"]["petal_length"]}.uniq.sort
# Smallest and largest petal_length values; used as the x-axis range below.
min_sl = sorted_values.first
max_sl = sorted_values.last

x = []
y = []
h0 = entropy(class_distribution(iris["data"]))
# One full re-split of the dataset per candidate threshold.
sorted_values.each do |t|
  x << t
  y << information_gain(h0, split_on_numeric_value(iris["data"], "petal_length", t))
end

# The iris data is expected to contain 43 distinct petal_length values.
assert_equal 43, sorted_values.size
Daru::DataFrame.new({x: x, y: y}).plot(type: :line, x: :x, y: :y) do |plot, diagram|
  plot.x_label "Petal Length Threshold"
  plot.y_label "Information Gain"
  plot.xrange [min_sl,max_sl]
end

NoMethodError: undefined method `class_distribution' for main:Object

## Question 6 (10 points)
Finding the best split value by calling ```split_on_numeric_value``` on each threshold is _very_ expensive. There is a better solution. The function should return an array consisting of the best threshold and the information gain of that threshold.


In order to find the best split, we need to find $t^\star$ defined as follows:

$ t^\star = \arg \max_{t} IG(x,V(t)) $

Given that a threshold is a single real value, we can simply calculate the values $IG(x,V(t))$ for every value of $t$ _in order_, which allows us to save some work by calculating the information gain incrementally. Notice that the split sets have a specialized form, which we can write as $V(t) = \left\{v_L(t), v_R(t)\right\}$ with $v_L(t) = \left\{x \mid x < t \right\}$ and $v_R(t) = \left\{x \mid x \ge t \right\}$. 

As we sweep $t$ from minimum to maximum values, we note the following relations:

$v_R(t + \Delta_t) = v_R(t) - v_\Delta$

$v_L(t + \Delta_t) = v_L(t) \cup v_\Delta$

where $v_\Delta = \left\{x \mid t \le x < t + \Delta_t \right\}$

What we add to $v_L$ is exactly what was removed from $v_R$, which means that we can easily recalculate the information gain by incrementing a class-specific counter for $v_L$ and decrementing a class-specific counter for $v_R$ as we sweep through the range of the threshold.


In [15]:
# Finds the threshold on feature fname with the highest information gain.
#
# x:     array of rows ({"features" => {...}, "label" => ...}).
# h0:    entropy of x before splitting.
# fname: name of the numeric feature to split on.
#
# A threshold t splits x into rows with value < t and rows with value >= t,
# the same rule split_on_numeric_value uses; a missing (sparse) feature
# counts as 0.0. Candidate thresholds are the distinct feature values.
#
# Returns [best_threshold, best_information_gain]. The first threshold that
# reaches the maximum wins; [0, 0] is returned when no threshold has a
# positive gain (including when x is empty).
#
# The sweep keeps one class counter per side and moves counts from right to
# left as the threshold grows, so every row is visited a constant number of
# times instead of being re-split for each threshold.
def find_split_point_numeric x, h0, fname
  # BEGIN YOUR CODE
  best = [0, 0]
  return best if x.empty?

  # Per distinct feature value: how many rows of each class carry it.
  by_value = Hash.new { |h, value| h[value] = Hash.new(0) }
  right = Hash.new(0)
  x.each do |row|
    by_value[row["features"][fname].to_f][row["label"]] += 1
    right[row["label"]] += 1
  end

  left = Hash.new(0)
  n_left = 0
  total = x.size.to_f

  by_value.keys.sort.each do |threshold|
    # At this point left holds exactly the rows with value < threshold.
    # entropy normalizes its input, so raw class counts can be passed as-is.
    gain = h0 - (n_left / total) * entropy(left) - ((total - n_left) / total) * entropy(right)
    best = [threshold, gain] if gain > best[1]

    # Rows equal to this threshold belong to the left side of every later one.
    by_value[threshold].each do |label, count|
      left[label] += count
      right[label] -= count
      n_left += count
    end
  end
  best
  #END YOUR CODE
end

:find_split_point_numeric

In [None]:
### TEST ###
# Best sepal_width threshold on iris is expected to be 3.4
t6_iris_entropy = entropy(class_distribution(iris["data"]))
t6_split_sepal_width = find_split_point_numeric iris["data"], t6_iris_entropy, "sepal_width"
assert_in_delta 3.4, t6_split_sepal_width[0], 1e-2

In [None]:
### TEST ###
# Information gain at that threshold (index 1 of the pair returned by find_split_point_numeric)
assert_in_delta 0.18570201019349364, t6_split_sepal_width[1], 1e-2

In [None]:
### TEST ###
# Best "char_freq_$" threshold on spambase is expected to be 0.056
t6_spambase_entropy = entropy(class_distribution(spambase["data"]))
t6_split_dollar = find_split_point_numeric spambase["data"], t6_spambase_entropy, "char_freq_$"
assert_in_delta 0.056, t6_split_dollar[0], 1e-2

In [None]:
### TEST ###
# Information gain at that threshold (index 1 of the pair returned by find_split_point_numeric)
assert_in_delta 0.17012249631509135, t6_split_dollar[1], 1e-2

In [None]:
### TEST: Produces same result as split_on_numeric_value ###
# Recomputes the gain by explicitly splitting at the expected thresholds and
# compares it with the gain reported by find_split_point_numeric.
t6_iris_expected_ig = information_gain(t6_iris_entropy, split_on_numeric_value(iris["data"], "sepal_width", 3.4))
assert_in_delta t6_iris_expected_ig, t6_split_sepal_width[1], 1e-2
t6_spambase_expected_ig = information_gain(t6_spambase_entropy, split_on_numeric_value(spambase["data"], "char_freq_$", 0.056))
assert_in_delta t6_spambase_expected_ig, t6_split_dollar[1], 1e-2