In [185]:
#  CAO Points Analysis

# http://www2.cao.ie/points/l8.php

In [186]:
# CSV file writing
import csv

# Dates and times
import datetime as dt

# Regular expressions
import re

# Convenient HTTP requests
import requests as rq

In [187]:
# Fetch the CAO points URL. The timeout stops this cell from hanging forever
# if the server never answers (requests applies no timeout by default).
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Fail loudly on an HTTP error status (4xx/5xx) instead of carrying on and
# saving/parsing an error page.
resp.raise_for_status()

# Display the response: '<Response [200]>' means OK.
# (Inspect resp.text to look at the raw HTML.)
resp

<Response [200]>

<br>

Save the original data set
***

In [188]:
# Timestamp taken once, so every file saved in this run shares the same suffix.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS.
nowstr = f'{now:%Y%m%d_%H%M%S}'

print(nowstr)

20211025_181824


In [189]:
# Timestamped location for the untouched HTML download.
path = f'data/cao2021_{nowstr}.html'

<br>

Error on server
****
Technically, the server says we should decode as per:

    Content-Type: text/html; charset=iso-8859-1

However, one line uses \x96 which isn't defined in iso-8859-1.

Therefore we decode with cp1252 instead, a closely related encoding that does define \x96.

In [190]:
# The charset declared by the server does not decode this page correctly, so
# it is overridden below. Remember the declared value first so it is not lost
# (this must happen before the assignment that replaces it).
original_encoding = resp.encoding

# Change to cp1252 so that later reads of resp.text are decoded with it.
resp.encoding = 'cp1252'

In [191]:
# Save the original html file. The encoding is given explicitly so the saved
# file does not depend on the platform's default text encoding, which may be
# unable to represent every character in the page.
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

<br>

# Use regular expressions to select lines required
***

In [192]:
# Regular expression that recognises the lines describing a course.
# Documentation for the re module: https://docs.python.org/3/library/re.html
#
# Pattern layout: a course code (two capital letters + three digits), exactly
# two spaces, free text, a three-digit number, an optional '*', and optional
# trailing spaces.
#
# A copy of the original cao.csv shows circa 949 lines:
#   - the first attempt, '([A-Z]{2}[0-9]{3}).*', matched 949 lines;
#   - the pattern used below matches 922 lines.
# NOTE(review): the 27-line gap suggests some course lines are not matched by
# this pattern - confirm against the source page whether that is intended.
course_pattern = r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *'
re_course = re.compile(course_pattern)

<br>

Loop through the lines of the response content from CAO

A copy of the original cao.csv shows circa 949 lines

***

In [193]:
# The file path for the csv file.
path = 'data/cao2021_csv_' + nowstr + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing. newline='' leaves line endings to the csv
# writer, and the explicit encoding keeps the output independent of the
# platform's default text encoding.
with open(path, 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes any field that itself contains a comma (e.g. a course
    # title), so such a field is not split into extra columns when the file is
    # read back. Joining the fields with ',' by hand does not protect against that.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through lines of the response (raw bytes).
    for line in resp.iter_lines():
        # Decode as cp1252 rather than the charset the server declares.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # Split the line on two or more spaces and write the pieces as one row.
            writer.writerow(re.split('  +', dline))

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.
