In [1]:
import requests
import json
# First page of the CumInCAD search results to crawl. The search term is empty;
# `max=1000` presumably sets the number of results per page -- TODO confirm.
url = "http://papers.cumincad.org/cgi-bin/works/Search?search=&x=47&y=8&max=1000"

In [2]:
def main(url, fileName):
    """Scrape a CumInCAD search into a JSON file of per-paper metadata.

    Walks every page of search results starting at ``url``, extracts the
    detail-page URL of each listing, downloads each detail page and scrapes
    its ``<meta>`` tags. The result is written to ``fileName + ".json"`` as a
    JSON list holding one dictionary per listing.

    If the per-listing loop is cut short (for example by a
    ``KeyboardInterrupt`` or a network error), the records gathered so far
    are still written to the file before the exception propagates, so a long
    crawl is not thrown away. Nothing is written if the loop fails before a
    single record was collected.

    Args:
        url: URL of the first page of search results.
        fileName: Output path without the ".json" extension.

    Raises:
        Any exception raised by the helper functions or by the file write is
        propagated unchanged.
    """

    # Walk the result pages, gathering the raw HTML of every listing row
    listings = []
    nextURL = url
    pageCount = 0
    while nextURL != "lastPage":
        html = getHTML(nextURL)
        listings.extend(getListings(html))
        # getNextURL returns the "lastPage" flag once there is no forward link
        nextURL = getNextURL(html)
        pageCount += 1
        print("Page number " + str(pageCount))

    # Reduce each listing row to the URL of its detail page
    listingURLs = [getListingURL(listing) for listing in listings]

    # Visit every detail page and scrape its metadata
    listingsData = []
    finished = False
    try:
        for listingCount, listingURL in enumerate(listingURLs, start=1):
            print("Listing " + str(listingCount))
            listingsData.append(scrapeListingMeta(getHTML(listingURL)))
        finished = True
    finally:
        # Always save on success (even an empty list); on failure save only
        # when there is partial data worth keeping.
        if finished or listingsData:
            jsonDataString = json.dumps(listingsData)
            with open(fileName + ".json", 'w') as jsonFile:
                jsonFile.write(jsonDataString)

In [3]:
def getNextURL(html):
    """Return the absolute URL of the next results page, or "lastPage".

    The next-page link is located through the "forward" icon it wraps: the
    last HREF that appears before the first occurrence of that icon is taken
    as the link target.
    """

    # Attribute value that identifies the next-page icon
    forwardIcon = '="/woda/icons/flat-noborder/forward.gif"'
    beforeIcon, found, _ = html.partition(forwardIcon)

    # No icon on the page means there is no further page
    if not found:
        return "lastPage"

    # The first quoted string after the last HREF is the relative link
    relativeLink = beforeIcon.rpartition("HREF")[2].split('"')[1]

    # Make the link absolute and turn &amp; back into &
    return ("http://papers.cumincad.org" + relativeLink).replace("&amp;", "&")

In [4]:
def getHTML(url, timeout=30):
    """Get the HTML at a given URL.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the crawl indefinitely.

    Returns:
        The response body as text (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection problems,
            including ``requests.exceptions.Timeout`` when the server does
            not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

In [5]:
def getListings(fullHTML):
    """Get the listings from the HTML of a search results page.

    Args:
        fullHTML: Full HTML text of one results page.

    Returns:
        A list of strings, one per listing, each made by joining two
        consecutive ``<tr`` chunks (every listing row contains a nested row).
        An empty list is returned when the records table has no listing rows.

    Raises:
        IndexError: If the page has no ``<DIV CLASS=RECORDS>`` marker.
    """

    # Cut down to the relevant <div>
    div = fullHTML.split("<DIV CLASS=RECORDS>")[1].split("</DIV>")[0]

    # Get all the table rows, excluding the bit of non-table stuff at the beginning and the first header row
    table = [("<tr" + row).strip() for row in div.split("<tr")[2:]]

    # Get rid of a little extraneous bit at the end of the last row.
    # The `table and` guard keeps a page without listing rows from raising
    # IndexError on table[-1].
    trailer = "</tbody></table><div><br/></div>"
    if table and table[-1].endswith(trailer):
        table[-1] = table[-1][:-len(trailer)].strip()

    # Joining rows that shouldn't be separated (there's one row inside each row).
    # zip drops an unpaired trailing chunk, if any.
    return [outer + inner for outer, inner in zip(table[::2], table[1::2])]

In [6]:
def getListingURL(listing):
    """Get the URL that corresponds to the detailed page of a listing.

    Takes the text between the first ``<A HREF=`` and the next ``>`` in the
    listing's HTML. Surrounding whitespace and quotes are removed so that
    both ``HREF=url`` and ``HREF="url"`` give a usable address, and ``&amp;``
    is decoded to ``&`` in the same way getNextURL does.

    Args:
        listing: HTML of a single listing row.

    Returns:
        The link target as a string.

    Raises:
        IndexError: If the listing contains no ``<A HREF=``.
    """
    rawHref = listing.split("<A HREF=")[1].split(">")[0]
    return rawHref.strip().strip("\"'").replace("&amp;", "&")

In [7]:
def scrapeListingMeta(html):
    """Return a dictionary of all the metadata in the html of a listing's page.

    Every ``<meta ... />`` tag is split on double quotes; the first quoted
    string becomes the key and the second quoted string the value. Tags that
    do not contain two quoted strings (for example ``<meta charset="utf-8"/>``)
    are skipped instead of raising IndexError.

    Note: when several tags share the same first quoted string, the later
    one overwrites the earlier one in the returned dictionary.

    Args:
        html: Full HTML text of a listing's detail page.

    Returns:
        A dict mapping each tag's first quoted string to its second one.
    """

    listingMeta = {}
    for chunk in html.split("<meta")[1:]:
        # Keep only the tag's own text, up to its self-closing "/>"
        point = chunk.split("/>")[0].strip()
        pointParts = point.split('"')

        # Two quoted strings need at least four quote-separated parts
        if len(pointParts) < 4:
            continue

        listingMeta[pointParts[1]] = pointParts[3]

    return listingMeta

In [8]:
# Run the full crawl from the search URL above; main writes the scraped
# metadata to "allSearchData.json".
main(url, "allSearchData")

Page number 1
Page number 2
Page number 3
Page number 4
Page number 5
Page number 6
Page number 7
Page number 8
Page number 9
Page number 10
Page number 11
Page number 12
Page number 13
Page number 14
Page number 15

Listing 1

Listing 2

Listing 3

Listing 4

Listing 5

Listing 6

Listing 7

Listing 8

Listing 9

Listing 10

Listing 11

Listing 12

Listing 13

Listing 14

Listing 15

Listing 16

Listing 17

Listing 18

Listing 19

Listing 20

Listing 21

Listing 22

Listing 23

Listing 24

Listing 25

Listing 26

Listing 27

Listing 28

Listing 29

Listing 30

Listing 31

Listing 32

Listing 33

Listing 34

Listing 35

Listing 36

Listing 37

Listing 38

Listing 39

Listing 40

Listing 41

Listing 42

Listing 43

Listing 44

Listing 45

Listing 46

Listing 47

Listing 48

Listing 49

Listing 50

Listing 51

Listing 52

Listing 53

Listing 54

Listing 55

Listing 56

Listing 57

Listing 58

Listing 59

Listing 60

Listing 61

Listing 62

Listing 63

Listing 64

Listing 65

Listing 66



KeyboardInterrupt: 