Permalink
Switch branches/tags
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 451 lines (343 sloc) 11.7 KB
#!/usr/bin/ruby
#
# Copyright (c) 2015 netnea, AG. (https://www.netnea.com/)
#
# Perform the binning process on a list of values.
#
# Binning is a way to group a number of more or less continuous values into
# a smaller number of "bins". For example, if you have data about a group of
# people, you might want to arrange their ages into a smaller number of age
# intervals.
#
#
# FIXME: implement decimalplaces (beginning is here, but not finished)
#
# bug: cat labor-07-example-access.log | alduration | do-binning.rb --label -n 25 --min 0 --max 2500000.0
# final additional line should be removed
#
# -----------------------------------------------------------
# INIT
# -----------------------------------------------------------
require "optparse"
require "getoptlong"
require 'pp'
# Global option store, pre-populated with the defaults that the command
# line options may override later on.
$params = {
  verbose: false,
  debug: false,
  num_bins: 20,
  num_bins_string: "",
  min: nil,
  max: nil,
  max_str: "",
  labels: false,
  do_boundaries: false, # run in boundaries mode. That means boundaries of bins are passed on command line
  boundaries_str: "",
  decimalplaces: 1 # number of decimal places after dot
}
# values collects the numbers to be binned
values = []
# bins collects the bins. Each bin is an array with three sub-items:
# 0: number of bin
# 1: min of bin
# 2: number of occurrences of value (0 by default)
bins = []
# -----------------------------------------------------------
# SUB-FUNCTIONS (those that are specific to this script)
# -----------------------------------------------------------
# -----------------------------------------------------------
# GENERIC SUB-FUNCTIONS (those that come with every script)
# -----------------------------------------------------------
def dump_parameters(params)
  # Purpose: Display parameters
  # Input  : Parameter Hash
  # Output : Dump parameters to stdout
  # Return : none
  # Remarks: Only the :verbose entry of the hash is listed.
  puts "Parameter overview"
  puts "------------------"
  puts "verbose : #{params[:verbose]}"
end
def vprint(text)
  # Purpose: print a message, but only in verbose mode
  # Input  : String text
  # Output : text followed by a newline on stdout when
  #          $params[:verbose] is set; nothing otherwise
  # Return : none
  # Remarks: reads the global variable $params
  return unless $params[:verbose]

  message = text + "\n"
  puts message
end
def dprint(text)
  # Purpose: print a message, but only in debug mode
  # Input  : String text
  # Output : text followed by a newline on stdout when
  #          $params[:debug] is set; nothing otherwise
  # Return : none
  # Remarks: reads the global variable $params
  puts(text + "\n") if $params[:debug]
end
def check_stdin
  # Purpose: Tell whether something is attached to STDIN
  # Input  : none
  # Output : none
  # Return : false when STDIN is a terminal (tty), true otherwise
  # Remarks: none
  !STDIN.tty?
end
def check_parameters
  # Purpose: check parameters
  # Input  : global variable params
  # Output : stderr in case there is a problem with one of the parameters
  # Return : true if there is an error with one of the parameters; or false in absence of errors
  # Remarks: No check is active at the moment, so the result is always false.
  #          Template for a check:
  # unless /^foo$/.match($params["x"])
  #   $stderr.puts "Error in parameter x ..."
  #   (and report true instead of false)
  # end
  false
end
def puts_error(msg, detail)
  # Purpose: Print error message
  # Input  : string msg and detail exception object (may be nil)
  # Output : $stderr
  # Return : None
  # Remarks: There is a ruby exception class hierarchy.
  #          See http://makandracards.com/makandra/4851-ruby-exception-class-hierarchy
  $stderr.puts msg
  if detail
    $stderr.puts "Error: #{detail.message}"
    $stderr.puts "Backtrace:"
    # backtrace is nil for an exception object that was never raised;
    # Array() turns that into an empty list instead of failing on nil.join
    $stderr.puts Array(detail.backtrace).join("\n")
  end
  $stderr.puts "--------------------------"
end
# -----------------------------------------------------------
# COMMAND LINE PARAMETER EXTRACTION
# -----------------------------------------------------------
# Parse the command line into $params.
# Numeric arguments are converted strictly: a value such as "abc" is reported
# as an invalid argument instead of being silently read as 0. Every parse
# problem (unknown option, missing or invalid argument) ends the script with
# a one-line message on stderr and exit status 1.
begin
  # Strict String -> Float conversion for option arguments.
  parse_float = lambda do |raw|
    begin
      Float(raw)
    rescue ArgumentError
      raise OptionParser::InvalidArgument, raw
    end
  end
  # Strict String -> Integer conversion (base 10, so "08" is read as 8).
  parse_integer = lambda do |raw|
    begin
      Integer(raw, 10)
    rescue ArgumentError
      raise OptionParser::InvalidArgument, raw
    end
  end

  parser = OptionParser.new do |opts|
    opts.banner = <<EOF
Perform the binning process on a list of numerical values.
Binning is a way to group a number of more or less continuous values into
a smaller number of "bins". For example, if you have data about a group of
people, you might want to arrange their ages into a smaller number of age
intervals like 0-19,20-29,30-39,...
Usage: STDIN | #{__FILE__} [options]
EOF
    # strip one leading tab per line, in case the text above is tab-indented
    opts.banner.gsub!(/^\t/, "")
    opts.separator ""
    opts.separator "Options:"
    opts.on('-d', '--debug', 'Display debugging infos') do
      $params[:debug] = true
    end
    opts.on('-D', '--decimalplaces MAN', "Number of decimal places after dot. Default is #{$params[:decimalplaces]}.") do |man|
      $params[:decimalplaces] = parse_integer.call(man)
    end
    opts.on('-v', '--verbose', 'Be verbose') do
      $params[:verbose] = true
    end
    opts.on('-b', '--boundaries MAN', 'Pass boundaries of bins on the command line, i.e. "5,10,15,20".') do |man|
      $params[:boundaries_str] = man
      $params[:do_boundaries] = true
    end
    opts.on('-m', '--min MAN', 'Minimum value. When working with boundaries, this option may not be passed.', 'The lowest boundary is automatically the min value.') do |man|
      $params[:min] = parse_float.call(man)
    end
    opts.on('-M', '--max MAN', 'Maximum value. When working with boundaries this option can be set.') do |man|
      $params[:max] = parse_float.call(man)
      # the text as typed is kept as well; it is used for the label of the last bin
      $params[:max_str] = man
    end
    opts.on('-l', '--labels', 'Print bin sizes with labels. By default, this is off.') do
      $params[:labels] = true
    end
    opts.on('-n', '--numbins MAN', 'Number of bins to be created.', 'You can not set this option if you also set boundaries option.', "Default is #{$params[:num_bins]}.") do |man|
      # the raw text records that the option was given at all
      $params[:num_bins_string] = man
      $params[:num_bins] = parse_integer.call(man)
    end
    opts.on('-h', '--help', 'Displays Help') do
      puts opts
      exit
    end
    # Usage notes (to be printed in help text after cli options)
    notes = <<EOF
Notes:
You can either pass the number of bins you want to fill, or you pass the
boundaries of the bins yourself. If you pass the number of bins with
numbins, then you can not pass boundaries.
Boundaries need not be of equal size. It is ok to call with boundaries
value of 0-19,20-29,30-39, etc. Boundaries can be integers or floating
point numbers. Negative values are OK too.
Boundaries define the bin. The first boundary is automatically the min
value that will be considered. So passing a min value on the command line
is not accepted. The final boundary defines the min value of the final bin.
It is therefore acceptable to define a separate max value or to leave the
max value open and let the final bin stretch to infinity.
EOF
    notes.gsub!(/^\t/, "")
    opts.on_tail(notes)
  end
  parser.parse!
rescue OptionParser::ParseError => detail
  # covers unknown options, missing arguments and the invalid arguments raised above
  $stderr.puts "Error in command line parameter extraction: #{detail.message}. This is fatal. Aborting."
  exit 1
end
# Sanity checks on the parsed options. Every failed check prints a message
# to stderr and ends the script with exit status 1.

# boundaries mode and an explicit number of bins exclude each other
if $params[:do_boundaries] && $params[:num_bins_string] != ""
  $stderr.puts "Boundaries and numbins passed together. Please pick one of the two. Aborting."
  exit 1
end
# in boundaries mode the lowest boundary serves as min value
if $params[:do_boundaries] && !$params[:min].nil?
  $stderr.puts "Boundaries and min value passed. Lowest boundary is meant to be min value. Please omit min value. Aborting."
  exit 1
end
# without boundaries at least one bin has to be requested; this also
# catches a numbins argument that was not a number
if !$params[:do_boundaries] && $params[:num_bins] < 1
  $stderr.puts "Number of bins passed is not an integer number >= 1. This is fatal. Aborting."
  exit 1
end
# \A and \z anchor the whole string; ^ and $ would only anchor a single
# line and let a multi-line argument slip through
if /\A[0-9,.-]*\z/.match($params[:boundaries_str]).nil?
  $stderr.puts "Boundaries passed can not be read. This is fatal. Aborting."
  exit 1
end
if /\A[0-9]\z/.match($params[:decimalplaces].to_s).nil?
  $stderr.puts "Decimal places passed is not an integer number <= 9. This is fatal. Aborting."
  exit 1
end
# the values to be binned are read from STDIN, so it must not be a terminal
unless check_stdin
  $stderr.puts "No STDIN found. Please pass STDIN to script."
  exit 1
end
# ----------------------------------
# MAIN
# ----------------------------------
def count_values_into_bins(bins, values, lower, upper)
  # Purpose: Count how many values fall into each bin
  # Input  : bins   - array of [number, min, count] items, ordered by min
  #          values - numbers sorted in ascending order
  #          lower  - values below this limit are ignored
  #          upper  - values above this limit are ignored; nil means no limit
  # Output : none
  # Return : none
  # Remarks: Raises the count (third sub-item) of the matching bin in place.
  #          A value belongs to the last bin whose min is not above it, so the
  #          top bin takes everything from its min up to the upper limit.
  #          Because the values are sorted, the bin index only ever moves up.
  top = bins.length - 1
  index = 0
  values.each do |value|
    next if value < lower
    break if !upper.nil? && value > upper # sorted input: nothing usable follows
    index += 1 while index < top && value >= bins[index + 1][1]
    bins[index][2] += 1 # raise number of occ. of this bin
  end
  nil
end

# read one value per line; text that is not a number is read as 0.0 by to_f
STDIN.each do |line|
  values << line.chomp.to_f
end
values.sort!

if $params[:do_boundaries]
  # boundaries mode: the bins are given on the command line
  boundaries_array = $params[:boundaries_str].split(",").map { |item| item.to_f }
  if boundaries_array.length <= 1
    $stderr.puts "Boundaries passed can not be interpreted. Did you pass no real boundary or only a single one? Aborting."
    exit 1
  end
  boundaries_array.sort!
else
  # numbins mode: cut the range min..max into num_bins bins of equal width
  if $params[:num_bins] < 1
    $stderr.puts "Number of bins must be at least 1. This is fatal. Aborting."
    exit 1
  end
  $params[:min] = values.min if $params[:min].nil?
  $params[:max] = values.max if $params[:max].nil?
  if $params[:min].nil? || $params[:max].nil?
    # no input values, and min / max were not both passed on the command line
    $stderr.puts "No values found on STDIN, so min and max can not be derived. This is fatal. Aborting."
    exit 1
  end
  step = ($params[:max] - $params[:min]) / $params[:num_bins]
  scale = (10 ** $params[:decimalplaces]).to_f
  # Exactly num_bins lower boundaries. The last bin ends at max, so values
  # equal to max are counted in it. Creating one more boundary at max itself
  # would add the extra output line noted as a bug in the header of this file.
  boundaries_array = (0...$params[:num_bins]).map do |i|
    (($params[:min] + i * step) * scale).round / scale
  end
end

boundaries_array.each_with_index do |boundary, i|
  bins[i] = [i, boundary, 0]
end

# check boundaries and compatibility with min / max
$params[:min] = boundaries_array[0]
if !$params[:max].nil? && boundaries_array[boundaries_array.length - 1] > $params[:max]
  $stderr.puts "Last boundary is higher than max. This is fatal. Aborting."
  exit 1
end

count_values_into_bins(bins, values, $params[:min], $params[:max])
$params[:num_bins] = bins.length
# pp bins
# Print one line per bin. Without labels a line is just the count of the
# bin. With labels it reads "<min of bin>-<min of next bin> <count>"; the
# last bin ends at the max value as typed on the command line, or at
# "infinity" when no max was passed.
$params[:num_bins].times do |idx|
  lower = bins[idx][1]
  count = bins[idx][2]
  if $params[:labels]
    if idx < $params[:num_bins] - 1
      upper = bins[idx + 1][1]
    else
      $params[:max_str] = "infinity" if $params[:max_str] == ""
      upper = $params[:max_str]
    end
    puts "#{lower}-#{upper} #{count}"
  else
    puts "#{count}"
  end
end