# CAO Points Analysis
## Acquiring the data
***

### 2021

In [1]:
# import required libraries
import csv                        # CSV writing with correct quoting.
import datetime as dt    
import re                         # Regular expressions.

import numpy as np
import pandas as pd
import requests as rq             # Convenient HTTP requests.

In [2]:
# Make a GET request to the 2021 CAO points URL.
# The timeout stops the cell from hanging indefinitely if the server never answers.
response = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Fail loudly on a 4xx/5xx status instead of silently parsing an error page later on.
response.raise_for_status()

# Display the response: <Response [200]> confirms that the request succeeded.
response

<Response [200]>

In [3]:
# Timestamp of "now" rendered as YYYYMMDD_HHMMSS (used to build unique file names)
nowstr = format(dt.datetime.now(), '%Y%m%d_%H%M%S')

<br>

**Error on server** <br>
The server's `Content-Type` header (`text/html; charset=iso-8859-1`) says the page should be decoded as ISO-8859-1. However, one line of the HTML contains the byte<br> 
`\x96`, which is not a printable character in ISO-8859-1. The closely related encoding cp1252 is therefore used instead: it is almost identical but<br>
maps `\x96` to a printable character (an en dash).

In [4]:
# Specify encoding method as cp1252 (see the note above about the \x96 byte).
# NOTE(review): fetch_2021_CAO_data decodes every raw line with cp1252 itself,
# so this attribute presumably only matters where requests does the decoding
# (e.g. response.text) - confirm before relying on it.
response.encoding = 'cp1252'

In [5]:

# Compile the regular expression used to recognise course lines.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)(#?)(?:([AQ]{3})|([V]{1})|([0-9]{3}))(\*?) *')


# Regular expression breakdown:          What each part matches:

# ([A-Z]{2}[0-9]{3})                     two capital letters followed by three digits; the pattern then requires two literal spaces
# (.*)                                   any characters except newline, zero or more times (greedy)
# (#?)                                   an optional '#'; the greedy (.*) before it already absorbs any '#', so this group captures nothing
# (?:([AQ]{3})|([V]{1})|([0-9]{3}))      exactly one of: three characters from {A, Q}, a single 'V', or three digits
# (\*?)                                  an optional literal '*' (escaped so it is not read as a quantifier)
# ' *'                                   any number of trailing spaces


In [6]:

def fetch_2021_CAO_data(path, source=None, pattern=None, encoding=None):
    """Write every course line of the 2021 points page to a CSV file.

    Parameters
    ----------
    path : str
        Destination CSV file (overwritten if it exists).
    source : optional
        Object whose ``iter_lines()`` yields the page as cp1252-encoded bytes.
        Defaults to the notebook-level ``response``.
    pattern : optional
        Object whose ``fullmatch(str)`` is truthy for course lines.
        Defaults to the notebook-level ``re_course``.
    encoding : str, optional
        Encoding of the output file. ``None`` keeps the platform default, which
        is what the function has always used; pass e.g. ``'utf-8'`` to make the
        file identical across machines (and read it back with the same value).

    Returns
    -------
    None. The number of exported lines is printed.
    """
    source = response if source is None else source
    pattern = re_course if pattern is None else pattern

    no_lines = 0                                        # Courses written so far
    # newline='' hands line-ending control to the csv writer.
    with open(path, 'w', encoding=encoding, newline='') as f:
        # csv.writer quotes fields that contain commas, so titles with a comma
        # in them no longer split into extra columns.
        writer = csv.writer(f, lineterminator='\n')
        for line in source.iter_lines():
            dline = line.decode('cp1252')                   # Raw bytes -> text
            if not pattern.fullmatch(dline):                # Skip everything that is not a course line
                continue
            no_lines += 1
            # Widen the first space so the code/title gap is also 3+ spaces wide ...
            dline_adj = dline.replace(" ", "  ", 1)
            # ... then every run of three or more spaces marks a column boundary.
            writer.writerow(re.split('   +', dline_adj))

    print(f"Total number of lines is {no_lines}.")     # Print the total number of processed lines
    

# Timestamped destination for the 2021 export, then run the export.
write_path = f'data/cao2021_csv_{nowstr}.csv'
fetch_2021_CAO_data(write_path)

Total number of lines is 923.


### 2020

In [7]:
def data_handler_2020(read_url='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
                      backup_path='data/cao_2020_data.csv', raw_df=None):
    """Load the 2020 points spreadsheet and return the cleaned level 8 rows.

    Parameters
    ----------
    read_url : str
        Location of the Excel workbook; its first 10 rows are skipped.
    backup_path : str or None
        Where a CSV copy of the result is written. ``None`` skips the backup.
    raw_df : pandas.DataFrame, optional
        Already-loaded sheet. When given, nothing is downloaded and the frame
        passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Rows with ``LEVEL == 8``, a constant ``YEAR`` column, and the columns
        reordered and renamed. ``'#+matric'`` entries in the points columns
        become real missing values (``np.nan``) rather than the text ``'NaN'``.
    """
    if raw_df is None:
        raw_df = pd.read_excel(read_url, skiprows=10)

    # Final column order (source names; renamed just below).
    new_col_order = ['YEAR', 'CATEGORY (i.e.ISCED description)', 'HEI', 'COURSE TITLE',
                     'COURSE CODE2', 'R1 POINTS', 'R2 POINTS', 'EOS Mid-point', 'EOS',
                     'R1 Random *', 'R2 Random*', 'EOS Random *', 'Test/Interview #']

    # Source name -> output name.
    new_col_names = {'CATEGORY (i.e.ISCED description)': 'CATEGORY',
                     'COURSE CODE2': 'COURSE CODE',
                     'EOS': 'FINAL SEASON POINTS',
                     'EOS Mid-point': 'MID SEASON POINTS',
                     'EOS Random *': 'FINAL SEASON POINTS RANDOM*'}

    # Every step returns a new frame, so neither the caller's frame nor a
    # filtered slice is mutated in place (no SettingWithCopyWarning).
    df = (
        raw_df.loc[raw_df['LEVEL'] == 8]                 # keep level 8 courses only
        .drop(columns=['avp', 'v', 'LEVEL'])             # not present in the 2019/2021 data
        .iloc[:, :-8]                                    # drop the 8 trailing empty columns
        .assign(YEAR=2020)
        .reindex(columns=new_col_order)
        .rename(columns=new_col_names)
    )

    # '#+matric' is a textual marker, not a points value: store it as missing.
    col_str_replace = ['R1 POINTS', 'R2 POINTS', 'MID SEASON POINTS', 'FINAL SEASON POINTS']
    df[col_str_replace] = np.where(df[col_str_replace] == '#+matric', np.nan, df[col_str_replace])

    # Back-up file.
    if backup_path is not None:
        df.to_csv(backup_path)
    return df



# Build the cleaned 2020 frame (the call also writes the backup CSV) and display it.
data_2020 = data_handler_2020()
data_2020

Unnamed: 0,YEAR,CATEGORY,HEI,COURSE TITLE,COURSE CODE,R1 POINTS,R2 POINTS,MID SEASON POINTS,FINAL SEASON POINTS,R1 Random *,R2 Random*,FINAL SEASON POINTS RANDOM*,Test/Interview #
0,2020,Business and administration,American College,International Business,AC120,209,,280,209,,,,
1,2020,Humanities (except languages),American College,Liberal Arts,AC137,252,,270,252,,,,
2,2020,Arts,National College of Art and Design,"First Year Art & Design (Common Entry,portfolio)",AD101,,,,,,,,#
3,2020,Arts,National College of Art and Design,Graphic Design and Moving Image Design (portfo...,AD102,,,,,,,,#
4,2020,Arts,National College of Art and Design,Textile & Surface Design and Jewellery & Objec...,AD103,,,,,,,,#
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,2020,Arts,Waterford Institute of Technology,Arts (options),WD200,AQA,AQA,336,AQA,,,,
1460,2020,Information and Communication Technologies (ICTs),Waterford Institute of Technology,Software Systems Development,WD210,279,,337,279,,,,
1461,2020,Information and Communication Technologies (ICTs),Waterford Institute of Technology,Creative Computing,WD211,271,,318,271,,,,
1462,2020,Personal services,Waterford Institute of Technology,Recreation and Sport Management,WD212,270,,349,270,,,,


### 2019

In [8]:
import urllib3
import pdfplumber
import io


def scrape_pdf_from_website(url):
    """Download the PDF at ``url`` and return the text of all its pages.

    Parameters
    ----------
    url : str
        Address of the PDF document.

    Returns
    -------
    str
        Text of every page, concatenated in page order.

    Raises
    ------
    urllib3.exceptions.HTTPError
        If the server answers with a status other than 200.
    """
    http = urllib3.PoolManager()                                   # pool manager used to send the request
    pdf_response = http.request("GET", url)
    if pdf_response.status != 200:
        # Report the real cause instead of an obscure PDF parsing error on an error page.
        raise urllib3.exceptions.HTTPError(
            f"GET {url} returned HTTP status {pdf_response.status}")

    # Wrap the downloaded bytes in a file-like object positioned at the start.
    with pdfplumber.open(io.BytesIO(pdf_response.data)) as pdf:
        # extract_text() can return None for a page without text in some
        # pdfplumber versions; treat such pages as empty instead of crashing.
        return ''.join(page.extract_text() or '' for page in pdf.pages)

# URL of the raw 2019 points PDF; download it and keep the extracted text for parsing.
url  = 'http://www2.cao.ie/points/lvl8_19.pdf'
pdf_text = scrape_pdf_from_website(url)

In [9]:
def data_handler_2019(regex, line_splitter, text):
    """Turn every ``regex`` match found in ``text`` into one DataFrame row.

    The first six characters of a match form the first cell; the remainder of
    the match is split with ``line_splitter`` to fill the following cells.
    Columns are labelled 0..n-1 and shorter rows are padded by pandas.
    """
    rows = [
        [found.group()[:6], *re.split(line_splitter, found.group()[6:])]
        for found in re.finditer(regex, text)
    ]
    return pd.DataFrame(rows)


# Matches from a course-code-like token (two capitals + three digits) to the end of that line.
regex = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')
# Raw string: \d, \* and \+ are regex escapes, not Python string escapes. Without the
# r prefix Python warns about invalid escape sequences (the pattern text is unchanged).
# The outer capture group makes re.split keep the matched separators as cells.
line_split_conditions = r'(#\d{3}\* |#\d{2}\* |\d{3}\* |\d{2}\* |\d{3} |\d{2} |#\d{3} |#\d{2} |# \+matric|#)'

# Parse the extracted PDF text into a frame and save a copy to disk.
df_2019 = data_handler_2019(regex, line_split_conditions, pdf_text)
df_2019.to_csv('data/cao_2019_final.csv')

# Report how many course rows were parsed.
print('NB: it was verified as of 26/11/2021 that there were {} courses exactly in the CAO 2019 points list.'
      .format(len(df_2019.index)))


NB: it was verified as of 26/11/2021 that there were 930 courses exactly in the CAO 2019 points list.


In [10]:
# Spot check an arbitrary row (positional index 50).
print(df_2019.iloc[50])

# Spot check the last row.
print(df_2019.iloc[-1])

0        CW708 
1    Law - LLB 
2          298 
3           328
Name: 50, dtype: object
0                                       WD230 
1    Mechanical and Manufacturing Engineering 
2                                         273 
3                                          348
Name: 929, dtype: object
