# CAO Points Analysis
***

In [1]:
# import required libraries
import csv                        # CSV writing with correct quoting.
import datetime as dt             # Dates and times.
import os                         # Directory creation.
import re                         # Regular expressions.

import requests as rq             # Convenient HTTP requests.

# Timestamp of the moment this cell runs, rendered as YYYYMMDD_HHMMSS
nowstr = format(dt.datetime.now(), '%Y%m%d_%H%M%S')

<br>

**Error on server (see the function below)** <br>
The server's response header says the page should be decoded as `Content-Type: text/html; charset=iso-8859-1`. However, one line of the HTML contains the byte<br>
`\x96`, which is not a printable character in iso-8859-1. The closely related encoding cp1252 is therefore used instead: it is almost identical to iso-8859-1 but<br>
maps `\x96` to a printable character (an en dash).

In [2]:

def parse_and_write_2021_data(url, re_course):
    
    response = rq.get(url)                                  # Make a get request to 2021 CAO points URL
    response.encoding = 'cp1252'                            # Specify encoding method as cp1252 despir what server recommended
    
    no_lines = 0                                            # Keep track of courses counted in loop
    with open('data/cao2021.csv', 'w') as f:                              # Open the csv file for writing         
        for line in response.iter_lines():                          # Loop through lines of the response 
            dline = line.decode('cp1252')                           # decode the line 
            if re_course.fullmatch(dline):                          # Match only the lines representing courses.       
                no_lines = no_lines + 1                                 # Add one to the lines counter if match occured      
                dline_adj = re.sub(" ", "  ", dline, count = 1)         # Substitute first space character with double space     
                linesplit = re.split('   +', dline_adj)                 # Split the line on three or more spaces.  
                f.write(','.join(linesplit) + '\n')                     # Rejoin the substrings with commas in between.         
    print(f"Total number of lines is {no_lines}.")         # Print the total number of processed lines
    


In [6]:

# Regular expression breakdown (parse_and_write_2021_data applies it with fullmatch,
# so the pattern has to cover the whole line):
#
# Pattern piece                                What it matches
#**************************************        ***************************************************************************************************
# ([A-Z]{2}[0-9]{3})                           Two capital letters (A-Z) followed by three digits (0-9)
# two literal spaces                           Separator that must follow the five characters above
# (.*)                                         Any run of characters except newline (greedy, may be empty)
# (#?)                                         An optional '#' character
# (?:([AQ]{3})|([V]{1})|([0-9]{3}))            Exactly one of: three characters drawn from A/Q, a single V, or three digits
# (\*?)                                        An optional literal '*' (escaped so it is not treated as a quantifier)
#  *                                           Any number of trailing spaces
#
# NOTE(review): because (.*) is greedy, a '#' in the line appears to be captured by (.*)
# rather than by (#?) - confirm whether that group is still needed.

response_url = 'http://www2.cao.ie/points/l8.php'                                                   # URL of the 2021 CAO points page
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)(#?)(?:([AQ]{3})|([V]{1})|([0-9]{3}))(\*?) *')     # compile the course-line pattern
parse_and_write_2021_data(response_url, re_course)                                                  # pass parameters into the parse & write function


Total number of lines is 923.


In [4]:
# get data in  correct format 
import pandas as pd
import numpy as np


def clean_2021_data(read_path = 'data/cao2021.csv'):
    """Load the 2021 points CSV and separate the '*' marker from the points values.

    Parameters
    ----------
    read_path : str, optional
        Location of the header-less CSV file to read. Defaults to
        'data/cao2021.csv', the path this function has always read.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'COURSE CODE', 'COURSE TITLE', 'R1 POINTS',
        'R2 POINTS', 'R1 Random*' and 'R2 Random*'. The two 'Random*' columns
        hold '*' when the matching points value contained a '*' and ''
        otherwise (including when the value is missing). The points columns
        are strings with every '*' and '#' removed; missing values stay missing.
    """
    header_list_2021 = ['COURSE CODE', 'COURSE TITLE', 'R1 POINTS', 'R2 POINTS']       # declare column headers
    points_columns = ['R1 POINTS', 'R2 POINTS']
    flag_columns = ['R1 Random*', 'R2 Random*']

    # Read the points as text: if pandas inferred a numeric dtype (for example
    # for a column that is entirely empty) the .str accessor below would raise.
    df = pd.read_csv(read_path, names = header_list_2021,
                     dtype = {column: str for column in points_columns})

    df.insert(0, 'Year', 2021)                                                         # insert year column

    for points_column, flag_column in zip(points_columns, flag_columns):
        # regex=False searches for a literal '*'; na=False makes missing values
        # count as "no star" instead of yielding NaN, which np.where treats as true.
        has_star = df[points_column].str.contains('*', regex = False, na = False)
        df[flag_column] = np.where(has_star, '*', '')

        # Assign the cleaned column back instead of calling replace(inplace=True)
        # on a column selection, which does not update df under Copy-on-Write.
        df[points_column] = df[points_column].str.replace(r'[*#]', '', regex = True)   # remove '*' and '#'
    return df
    
    
# Build the cleaned 2021 frame and preview its first five rows.
df_final_2021 = clean_2021_data()
df_final_2021.head(5)

Unnamed: 0,Year,COURSE CODE,COURSE TITLE,R1 POINTS,R2 POINTS,R1 Random*,R2 Random*
0,2021,AL801,Software Design for Virtual Reality and Gaming,300,,,
1,2021,AL802,Software Design in Artificial Intelligence for...,313,,,
2,2021,AL803,Software Design for Mobile Apps and Connected ...,350,,,
3,2021,AL805,Computer Engineering for Network Infrastructure,321,,,
4,2021,AL810,Quantity Surveying,328,,,


# 2020 data

In [5]:

def read_2020_data():
    """Fetch the 2020 CAO points spreadsheet and return its level 8 rows.

    The workbook is read with its first 10 rows skipped, the last 8 columns
    are discarded, and only the rows whose 'LEVEL' column equals 8 are kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, still carrying their original index labels.
    """
    source_url = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'
    sheet = pd.read_excel(source_url, skiprows = 10)        # skip the 10 rows above the header row
    trimmed = sheet.iloc[:, :-8]                            # discard the 8 right-most columns
    is_level_8 = trimmed['LEVEL'] == 8                      # boolean mask selecting level 8 courses
    return trimmed.loc[is_level_8]

# Load the 2020 data through read_2020_data and preview its first rows.
data_2020 = read_2020_data()
data_2020.head()
 

    
   


Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,LEVEL,HEI,Test/Interview #,avp,v
0,Business and administration,International Business,AC120,209,,,,209,,280,8,American College,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,8,American College,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,8,National College of Art and Design,#,,
