public
Description: Scrapes the official golf world ranking and outputs it in a more usable format
Homepage: http://holtsblog.blogspot.com/2008/10/open-source-ogwr-scraper.html
Clone URL: git://github.com/holtonma/ogwr.git
ogwr / ogwr.rb
100644 105 lines (85 sloc) 3.06 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env ruby -w
# created by Mark Holton (holtonma@gmail.com)
# copy as much as you want to
# 10-29-2008
# purpose: scrape the Official World Golf Ranking and present it in a more usable form (an Array of OpenStructs)
# using Hpricot, Nokogiri, open-uri
 
require 'rubygems'
require 'hpricot'
require 'nokogiri'
require 'open-uri'
require 'ostruct'
 
module OGWR
  VERSION = '0.1.0'

  # Scrapes Official World Golf Ranking pages and converts the table rows
  # into OpenStruct records.
  class Fetcher
    # Value of the title attribute that identifies the ranking table in
    # fetch_via_noko.
    RANKING_TABLE_TITLE = 'Click on player names to be taken to their individual tournaments page'.freeze

    # Hpricot-based scraper: reads the second column of the page's ranking
    # table and numbers the players with a counter.
    #
    # ogwr_url - location of the ranking page (http/https/ftp URL or a local path)
    # page_num - 1-based page index; each page covers 50 ranks
    #            (page 1 >> 1-50, page 2 >> 51-100, page 3 >> 101-150...)
    #
    # Returns an Array of OpenStructs with fname, lname and rank (Integer).
    def fetch(ogwr_url, page_num)
      data = with_page(ogwr_url) { |io| Hpricot(io) }.search("table:nth-child(5)")

      # The column starts with two empty rows, so the counter begins two
      # below the first real rank of the page (50*(page_num-1) + 1).
      first_rank = 50 * (page_num - 1) - 1

      players = []
      (data/"td:nth-child(2)").each_with_index do |cell, offset|
        fname, lname = split_name(cell.search("a").inner_html)
        player = OpenStruct.new
        player.fname = fname
        player.lname = lname
        player.rank = first_rank + offset
        players << player
      end

      # Remove the two extra rows at the beginning and the one extra row at
      # the end; the slice is nil when fewer than three rows were found.
      players[2...-1] || []
    end

    # Nokogiri-based scraper: collects the non-empty cell texts of every row
    # of the table(s) whose title attribute equals RANKING_TABLE_TITLE.
    #
    # ogwr_url - location of the ranking page (http/https/ftp URL or a local path)
    # page_num - not used by this method; kept so the signature matches fetch
    #
    # Returns an Array with one Array of Strings per table row (rows without
    # any non-empty cell yield an empty Array).
    def fetch_via_noko(ogwr_url, page_num)
      doc = with_page(ogwr_url) { |io| Nokogiri::HTML(io) }

      player_data = []
      doc.css("table").each do |table|
        # Node#[] returns the attribute value as a String; comparing the
        # Attr object from Node#attributes with a String never matches.
        next unless table['title'] == RANKING_TABLE_TITLE

        table.css('tr').each do |row|
          texts = row.css('td').map { |cell| cell.inner_text.strip }
          player_data << texts.reject { |text| text.empty? }
        end
      end

      player_data
    end

    # Turns the row arrays produced by fetch_via_noko into OpenStructs.
    #
    # player_data - Array of rows; each row is an Array of cell Strings in
    #               the order rank, name, average points, total points,
    #               number of events, points lost, points gained.
    #
    # Rows that are nil, have fewer than two cells, or whose first cell is
    # "Rank" (the header row) are skipped. All values stay Strings exactly
    # as scraped (rank included).
    # NOTE(review): fetch_via_noko drops empty cells, so a blank cell in a
    # player row shifts the later columns to the left - confirm against the
    # live page.
    #
    # Returns an Array of OpenStructs with rank, fname, lname, avg_pts,
    # tot_pts, num_events, pts_lost_last_year and pts_gained_this_year.
    def friendly_structure(player_data)
      players = []
      player_data.each do |row|
        next if row.nil? || row.length < 2 || row[0] == "Rank"

        fname, lname = split_name(row[1])
        player = OpenStruct.new
        player.rank = row[0]
        player.fname = fname
        player.lname = lname
        player.avg_pts = row[2]
        player.tot_pts = row[3]
        player.num_events = row[4]
        player.pts_lost_last_year = row[5]
        player.pts_gained_this_year = row[6]
        players << player
      end

      players
    end

    private

    # Opens +location+, yields the IO to the block, closes it afterwards and
    # returns the block's value. http/https/ftp URLs go through open-uri's
    # URI#open; anything else is opened as a local file. Kernel#open is
    # avoided on purpose: it no longer opens URLs on Ruby 3.0+ and it runs a
    # shell command when the string starts with "|".
    def with_page(location)
      target = location.to_s
      if target =~ %r{\A(?:https?|ftp)://}i
        URI.parse(target).open { |io| yield io }
      else
        File.open(target) { |io| yield io }
      end
    end

    # Splits a full name into [first_word, remainder] so that names with more
    # than two words keep everything after the first word as the last name.
    # Returns an empty Array for a nil or blank name.
    def split_name(full_name)
      full_name.to_s.strip.split(/\s+/, 2)
    end
  end

end