mlandauer / openaustralia-parser forked from bruno/openaustralia-parser

Parser component for Open Australia

This URL has Read+Write access

openaustralia-parser / regression-test / regression_test_parse_speeches.rb
100755 130 lines (106 sloc) 4.109 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env ruby
#
# Simple implementation of regression tests for xml generated by parse-speeches.rb
# N.B. Need to pre-populate reference xml files with those that have previously been generated.
# In other words, this is only useful for checking that any refactoring has not caused a regression in behaviour.
#
 
$:.unshift "#{File.dirname(__FILE__)}/../lib"
 
require 'people'
require 'hansard_parser'
require 'configuration'
 
# Range of dates to test (inclusive at both ends).
# to_date is written as "1 Jan 2008 minus one day", i.e. 31 Dec 2007.

from_date = Date.new(2007, 1, 1)
to_date = Date.new(2008, 1, 1) - 1

# Number of items to skip at the beginning of the (shuffled) list of dates.
# Also used as the starting value of the progress counter.
skip = 0

# Dates to test first before anything else
# Update this list with any dates that have shown up problems in the past
# Each entry is moved to the front of the date list in turn, so the LAST
# entry listed here ends up being tested first.

test_first = [Date.new(2007,8,8), Date.new(2007,8,14), Date.new(2007,5,8), Date.new(2007,2,14)]

# Dates to remove from the list of dates to test
skip_dates = []

# Objects shared by every test run

# Supplies xml_path, the directory the freshly parsed XML is written under
conf = Configuration.new

# First load people back in so that we can look up member ids
people = PeopleCSVReader.read_members

parser = HansardParser.new(people)
 
# Compares a freshly generated XML file with its stored reference copy.
#
# ref_path  - path of the reference XML file
# test_path - path of the newly generated XML file
# date      - date under test; only used in the failure messages
# count     - number of dates already processed; only used in the failure messages
#
# Returns normally when `diff -q` reports no difference, when neither file
# exists, or when the user chooses to overwrite the reference file after a
# mismatch. In every other case the failure is reported and the script is
# terminated with exit status 1.
def compare_xml(ref_path, test_path, date, count)
  # File.exist? rather than the File.exists? alias, which Ruby 3.2 removed
  have_ref = File.exist?(ref_path)
  have_test = File.exist?(test_path)

  # Neither file present: there is nothing to compare for this date
  return unless have_ref || have_test

  unless have_ref && have_test
    puts "ERROR: #{have_ref ? test_path : ref_path} is missing"
    puts "Regression tests FAILED on date #{date} at count #{count}!"
    exit 1
  end

  puts "diff -q #{test_path} #{ref_path}"
  # The argument-list form of system bypasses the shell, so paths containing
  # spaces or shell metacharacters are passed through untouched
  return if system("diff", "-q", test_path, ref_path)

  # Pretty-print both files so the visual diff is readable
  tidied_test = "regression_failed_text.xml"
  tidied_ref = "regression_failed_ref.xml"
  system("tidy", "-xml", "-utf8", "-o", tidied_test, test_path)
  system("tidy", "-xml", "-utf8", "-o", tidied_ref, ref_path)
  system("opendiff", tidied_test, tidied_ref)
  puts "ERROR: #{test_path} and #{ref_path} don't match"
  puts "Regression tests FAILED on date #{date} at count #{count}!"
  # Give the user the option to overwrite the reference file and continue
  puts "Press return to exit or 'o' to overwrite reference file and continue"
  exit 1 unless gets == "o\n"
  system("cp", test_path, ref_path)
end
 
# Parses one date for both houses and checks each generated XML file against
# its reference copy.
#
# date   - date to parse
# conf   - object whose xml_path is the root directory for generated XML
# parser - object responding to parse_date_house(date, path, house)
# count  - number of dates already processed, passed through to compare_xml
#
# The House of Representatives is parsed and compared first, then the Senate.
def test_date(date, conf, parser, count)
  # [output subdirectory, file name, name of the House accessor]
  [
    ["debates", "debates#{date}.xml", :representatives],
    ["lordspages", "daylord#{date}.xml", :senate]
  ].each do |subdir, xml_filename, house_name|
    new_xml_path = "#{conf.xml_path}/scrapedxml/#{subdir}/#{xml_filename}"
    ref_xml_path = "#{File.dirname(__FILE__)}/../../ref/#{xml_filename}"
    parser.parse_date_house(date, new_xml_path, House.public_send(house_name))
    compare_xml(ref_xml_path, new_xml_path, date, count)
  end
end
 
class Array
  # Returns a new array holding the receiver's elements in random order.
  #
  # Elements are drawn one at a time, without replacement, from a clone of
  # the receiver, so the receiver itself is not modified. Each draw uses
  # Kernel.rand to pick an index among the elements still remaining.
  def randomly_permute
    remaining = clone
    Array.new(size) { remaining.delete_at(Kernel.rand(remaining.size)) }
  end
end
 
# Shuffle the dates so that a broad spread of the range is exercised early on.
# The fixed seed keeps the order the same from one run to the next.
srand(42)
dates = (from_date..to_date).to_a.randomly_permute

# Pull each priority date out of the list and push it back on at the front
test_first.each do |priority_date|
  dates.delete(priority_date)
  dates.unshift(priority_date)
end

# Drop the dates that should not be tested at all
skip_dates.each do |unwanted_date|
  dates.delete(unwanted_date)
end

started_at = Time.new
dates[skip..-1].each.with_index(skip) do |date, index|
  test_date(date, conf, parser, index)
  done = index + 1
  puts "Regression test progress: Done #{done}/#{dates.size}"

  # Average time per date tested in this run, times the number of dates left
  remaining = ((Time.new - started_at) / (done - skip) * (dates.size - done)).to_i
  minutes_left, seconds_left = remaining.divmod(60)
  hours_left, minutes_left = minutes_left.divmod(60)

  if hours_left > 0
    puts "Estimated time left to completion: #{hours_left} hours #{minutes_left} mins"
  else
    puts "Estimated time left to completion: #{minutes_left} mins #{seconds_left} secs"
  end
end

puts "Regression tests all passed!"