# CAO Points Analysis

http://www.cao.ie/index.php?page=points&p=2021

---

In [1]:
# CSV writing
import csv

# Dates and times
import datetime as dt

# regular expressions
import re

# Convenient for HTTP requests
import requests as rq

In [2]:
# Fetch the CAO points page.
# A timeout is passed because requests applies none by default: without it
# the call can block forever if the server accepts the connection but never
# answers. With it, a stalled request raises an exception instead.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
# have a quick peek at the response (shows the HTTP status)
resp

<Response [200]>

<br>

## Save original data set

In [3]:
# Current local date and time, taken once so every file name built from it
# carries the same stamp.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS text for use in file names.
nowstr = format(now, '%Y%m%d_%H%M%S')

In [4]:
# Timestamped file path under data/ for the original HTML download
path = f'data/CAO2021_{nowstr}.html'

## Error on server
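The server uses the wrong character encoding for this page, so the response encoding is switched to cp1252 before the text is saved.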

In [5]:
# The server uses the wrong encoding. Keep a note of the encoding the
# response currently carries, then switch the response over to cp1252.
# (The right-hand side is evaluated first, so the old value is captured
# before it is overwritten.)
original_encoding, resp.encoding = resp.encoding, 'cp1252'

In [6]:
# Save the response text to the HTML file.
# The output encoding is stated explicitly: without it open() falls back to
# the platform's locale encoding, so the saved file would differ between
# machines and the write could fail on characters that locale cannot encode.
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

<br>

## Use regular expressions to select lines we want
___

In [7]:
# Compiled pattern describing one course line of the points listing.
# The adjacent raw-string pieces are joined by Python into a single pattern;
# each piece is annotated with the part of the line it matches.
re_course = re.compile(
    r'([A-Z]{2}[0-9]{3})'  # group 1: course code, e.g. CW078
    r'  '                  # exactly two spaces after the code
    r'(.*)'                # group 2: any text (. = wildcard, * = zero or more)
    r'([0-9]{3})'          # group 3: three-digit number (points)
    r'(\*?)'               # group 4: literal asterisk (? = 0 or 1 of)
    r' *'                  # any amount of trailing spaces
)

### Loop through the lines of the response 
___

In [8]:
# Pick the course lines out of the response and write them to a CSV file.

# path for csv file
path = 'data/CAO2021_csv_' + nowstr + '.csv'

# keep track of how many course lines were written
no_lines = 0

# newline='' is what the csv module asks of the file it writes to, and the
# encoding is explicit so the output does not depend on the platform locale.
with open(path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that itself contains a comma, so a comma
    # inside a course title cannot shift the remaining columns.
    writer = csv.writer(f, lineterminator='\n')
    for line in resp.iter_lines():
        # iter_lines() yields bytes; decode with the encoding used for the page
        dline = line.decode('cp1252')
        # match only the lines we want - ones representing courses
        if not re_course.fullmatch(dline):
            continue
        # add to line counter
        no_lines += 1
        # fields are separated by runs of two or more spaces
        writer.writerow(re.split('  +', dline))

print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.


# End