# CAO Points Analysis
***

In [1]:
# import required libraries
import requests as rq             # Convenient HTTP requests.
import re                         # Regular expressions.
import datetime as dt              

In [2]:
# Make a GET request to the 2021 CAO points URL.
# A timeout is passed explicitly: without one, requests waits indefinitely
# if the server accepts the connection but never answers.
response = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Display the response; <Response [200]> means the request succeeded.
response

<Response [200]>

In [3]:
# Timestamp of this run as YYYYMMDD_HHMMSS, used to build unique file names
nowstr = f"{dt.datetime.now():%Y%m%d_%H%M%S}"

<br>

**Error on server** <br>
The server's `Content-Type` header (`text/html; charset=iso-8859-1`) indicates that the page should be decoded as iso-8859-1. However, one line of the HTML uses the byte<br>
\x96, which isn't defined in iso-8859-1. Therefore the closely related encoding cp1252 will be used instead. It is almost identical to iso-8859-1 but<br>
does define \x96 (an en dash).

In [5]:
# Override the encoding requests uses for this response: cp1252 instead of
# the iso-8859-1 declared by the server, because the page contains the byte \x96.
# NOTE(review): this presumably only affects response.text; code that decodes
# raw bytes itself (e.g. line.decode('cp1252')) is unaffected — confirm.
response.encoding = 'cp1252'

In [5]:

# Compile the regular expression used to recognise course lines.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)(#?)(?:([AQ]{3})|([V]{1})|([0-9]{3}))(\*?) *')


# Regular expression breakdown:          What each part matches:

# ([A-Z]{2}[0-9]{3})                     Two capital letters (A-Z) followed by three digits (0-9).
# "  " (two literal spaces)              The code above must be followed by two spaces.
# (.*)                                   Any character except newline, zero or more times (greedy).
# (#?)                                   An optional "#". Because (.*) is greedy it will normally consume
#                                        a "#" itself, leaving this group empty.
# (?:([AQ]{3})|([V]{1})|([0-9]{3}))      Exactly one of: three characters from A/Q, a single V, or three digits.
#                                        (?: ... ) groups the alternatives without capturing the group itself.
# (\*?)                                  An optional literal "*"; the backslash stops it being read as a quantifier.
# " *" (space followed by *)             Any number of trailing spaces.


In [6]:

def fetch_2021_CAO_data(path):
    """Write the course lines of the downloaded 2021 CAO page to a CSV file.

    Reads the module-level ``response`` (the result of the GET request made
    earlier in the notebook) line by line, keeps only the lines that fully
    match ``re_course``, turns each kept line into comma-separated fields and
    writes it to ``path``. The number of lines written is printed at the end.

    Parameters
    ----------
    path : str
        Destination of the CSV file. An existing file is overwritten.

    Returns
    -------
    None
    """
    no_lines = 0                                        # Number of course lines written so far
    with open(path, 'w') as f:                          # Open the csv file for writing.
        # The request is bound to `response`; the previous name `resp` is not
        # defined anywhere in the notebook and raised NameError on a fresh kernel.
        for line in response.iter_lines():
            # Lines arrive as raw bytes; cp1252 is used because the page
            # contains \x96, which iso-8859-1 does not define.
            dline = line.decode('cp1252')
            if not re_course.fullmatch(dline):          # Skip anything that is not a course line.
                continue
            no_lines += 1
            # A course code is followed by only two spaces. Doubling the first
            # space gives three, so the split below also separates the code
            # from the rest of the line.
            dline_adj = dline.replace(" ", "  ", 1)
            linesplit = re.split('   +', dline_adj)     # Split on three or more spaces.
            f.write(','.join(linesplit) + '\n')         # One comma-separated row per course.

    print(f"Total number of lines is {no_lines}.")     # Print the total number of processed lines
    

# Timestamped destination file, so each run writes its own CSV
write_path = f'data/cao2021_csv_{nowstr}.csv'
fetch_2021_CAO_data(write_path)

Total number of lines is 923.


In [10]:
# Get the data into the correct format