In [1]:
import requests
# First page of CumInCAD search results for the query "Scanner"; passed to main() as the crawl's starting point
url = "http://papers.cumincad.org/cgi-bin/works/Search?search=Scanner&x=0&y=0"

In [2]:
def main(url, maxListings=4):
    """Crawl a CumInCAD search and scrape the detail pages of the listings found.

    Starting at `url`, follows the "next page" links until none is left,
    gathering every listing from every results page. The detail pages of the
    first `maxListings` listings are then downloaded and handed to
    scrapeListingMeta one by one.

    Args:
        url: URL of the first page of search results.
        maxListings: Maximum number of listing pages to download and scrape.
            Defaults to 4, the cap that used to be hard-coded; pass None to
            process every listing. Expected to be None or non-negative.

    Returns:
        None. Progress is printed as pages and listings are processed.
        NOTE(review): the value returned by scrapeListingMeta is currently
        discarded -- confirm how the scraped metadata should be stored.
    """
    nextURL = url
    listings = []
    pageCount = 0

    # Walk the paginated results; getNextURL hands back the "lastPage" flag
    # once a page carries no link to a following page
    while nextURL != "lastPage":
        html = getHTML(nextURL)
        listings.extend(getListings(html))
        nextURL = getNextURL(html)
        pageCount += 1
        print("Page number " + str(pageCount))

    # Reduce each listing's HTML to just the URL of its detail page
    listingURLs = [getListingURL(listing) for listing in listings]

    # Slicing with None keeps the whole list, so maxListings=None means "no cap"
    for listingCount, listingURL in enumerate(listingURLs[:maxListings], start=1):
        print("Listing " + str(listingCount))
        listingHTML = getHTML(listingURL)
        scrapeListingMeta(listingHTML)

In [3]:
def getNextURL(html):
    """Return the absolute URL of the next results page, or "lastPage" if there is none.

    The next-page link is recognised by the "forward" icon it wraps: the URL is
    taken from the last HREF that appears before the first use of that icon.
    """
    forwardIcon = '="/woda/icons/flat-noborder/forward.gif"'
    beforeIcon, iconFound, _ = html.partition(forwardIcon)

    # No forward icon on the page means there is no further page to visit
    if not iconFound:
        return "lastPage"

    # The link that wraps the icon is the last HREF preceding it; its target
    # is the text between the first pair of double quotes after "HREF"
    lastHref = beforeIcon.split("HREF")[-1]
    relativePath = lastHref.split('"')[1]

    # The page HTML-escapes ampersands in the query string; undo that
    absoluteURL = "http://papers.cumincad.org" + relativePath
    return absoluteURL.replace("&amp;", "&")

In [4]:
def getHTML(url, timeout=30):
    """Download a page with an HTTP GET and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout requests waits indefinitely, so one stalled response
            would hang the whole crawl.

    Returns:
        The response body as a string (the response's `text` attribute).

    Raises:
        requests.exceptions.Timeout: if the server does not answer within
            `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

In [5]:
def getListings(fullHTML):
    """Split a search-results page into one HTML string per listing.

    Args:
        fullHTML: HTML of a results page. It must contain a
            "<DIV CLASS=RECORDS>" container.

    Returns:
        A list of strings, each made of the two consecutive table rows that
        form one listing. The list is empty when the records container holds
        no listing rows.

    Raises:
        IndexError: if the page has no "<DIV CLASS=RECORDS>" container.
    """
    # Cut down to the relevant <div>
    div = fullHTML.split("<DIV CLASS=RECORDS>")[1].split("</DIV>")[0]

    # Re-attach the "<tr" consumed by split(); the first two pieces are the
    # non-table preamble and the header row, so they are skipped
    table = [("<tr" + row).strip() for row in div.split("<tr")[2:]]

    # Trim the markup that trails the last row. The emptiness check matters:
    # a page without listing rows would otherwise fail on table[-1]
    trailer = "</tbody></table><div><br/></div>"
    if table and table[-1].endswith(trailer):
        table[-1] = table[-1][:-len(trailer)].strip()

    # Every listing spans two consecutive rows (one row sits inside the
    # other), so join them pairwise; zip drops an unpaired trailing row
    return [first + second for first, second in zip(table[::2], table[1::2])]

In [6]:
def getListingURL(listing):
    """Extract the detail-page URL from the HTML of a single listing.

    The URL is whatever follows the listing's first "<A HREF=" up to the
    next ">" character.
    """
    anchorStart = "<A HREF="
    afterAnchor = listing.split(anchorStart)[1]
    target, _, _ = afterAnchor.partition(">")
    return target

In [7]:
def scrapeListingMeta(html):
    """Return a dictionary of all the metadata in the html of a listing's page"""
    
    # Retrieving a list of each meta point in the HTML
    metaPoints = html.split("<meta")[1:]
    metaPoints = [point.split("/>")[0] for point in metaPoints]
    
    # Looping through the points, adding the data from each string to a dictionary
    listingMeta = {}
    for point in metaPoints:
        print("POINT: " + point)
        point = point.strip()
        points = point.split('"')
        if len(points)

In [9]:
# Run the scraper on the search URL defined in the first cell; progress is printed below
main(url)

Page number 1
Page number 2
Listing 1
POINT:  name="DC.creator" content="Baraniak, David W." 
POINT:  name="DC.date" content="1987" 
POINT:  name="DC.title" content="Automatic Data Capture: Scanners Offer a Cost-effective Solution" 
POINT:  name="DC.source" content="computer Graphics World November, 1987. vol. 10: pp. 93-94, 97 : ill." 
POINT:  name="DC.description" content=" table. In order to decide whether today&#39;s scanner deliver the price and performance a particular CAAD application demand, the author lists vendors, scanner type, raster to vector conversion editing raster vector, data exchange format and compares them" 
POINT:  name="DC.subject" content="hardware, CAD, scanning, business" 
POINT:  name="DC.type" content="Conference Paper" 
POINT:  name="DC.type" content="PeerReviewed" 
POINT:  name="DC.format" content="application/pdf" 
POINT:  name="DC.identifier" content="http://papers.cumincad.org/data/works/att/" 
POINT:  name="DC.relation" content="http://papers.cumincad.

POINT:  name="DC.creator" content="Celani, Gabriela; Laura Cancherini" 
POINT:  name="DC.date" content="2009" 
POINT:  name="DC.title" content="Digitalização tridimensional de objetos: um estudo de caso [Scanning Three-dimensional Objects: A Case Study]" 
POINT:  name="DC.source" content="SIGraDi 2009 - Proceedings of the 13th Congress of the Iberoamerican Society of Digital Graphics, Sao Paulo, Brazil, November 16-18, 2009" 
POINT:  name="DC.description" content="The present research is an exploratory study about medium-range 3D-scanning technologies for architectural applications. Its purpose was to gather information that will subside the future acquisition of a 3D-scanning equipment for the Laboratory for Automation and Prototyping for Architecture and Construction, LAPAC, at the University of Campinas (UNICAMP). In order to test some of these technologies, some experiments were carried out. Museum sculptures were digitized and the results were 3D-printed. Preliminary results show 