# An analysis of CAO points from 2019, 2020 and 2021

---

In [1]:
# Standard library

# For writing correctly quoted csv files
import csv

# Dates and Times
import datetime as dt

# Regular expressions
import re

# For saving urls
import urllib.request as urlrq

# For only printing top few lines of large datasets
from itertools import islice

# Third-party

# Pandas
import pandas as pd

# For HTTP requests
import requests as rq

In [2]:
# Timestamp of this run (local time)
now = dt.datetime.now()

# Compact YYYYMMDD_HHMMSS form, used below to build the names of saved files
nowstr = f'{now:%Y%m%d_%H%M%S}'

***

# 2021 CAO Points

https://www.cao.ie/index.php?page=points&p=2021

***

## Note on HTML format for 2021 CAO Points

When I started this project, the CAO points for 2021 were only available on an HTML webpage, reached through a further link on the page linked above. That link now points to an .xlsx file instead. Although the web address of the old HTML page of 2021 points still works, it can no longer be reached from the main 2021 page linked above. While the .xlsx file will presumably be updated with any new points, the same cannot be said of the HTML page. I have left in the code that accesses the HTML webpage, but for the analysis of the data I will use the data from the .xlsx file.

### Level 8 Courses (HTML)

In [3]:
# Fetch the 2021 Level 8 points page (HTML).
# The timeout (in seconds) stops the notebook from hanging indefinitely
# if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Check if working
# Response [200] if working
resp

<Response [200]>

In [4]:
# Pattern used to pick out the course lines of the page.
#
# An earlier, stricter pattern had some issues and did not get all of the
# lines needed (a simpler one might still be possible):
# re_course = re.compile('([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')
#
# The pattern below only requires two capital letters followed by three
# digits, and then accepts whatever follows on the line.
re_course = re.compile(r'([A-Z]{2}[0-9]{3}.*)')



In [59]:
# Count the lines of the response content that match the course pattern.
# Each raw line is decoded as cp1252 before it is tested; only lines that
# match the pattern in full (courses / points) are counted.
no_lines = sum(
    1
    for line in resp.iter_lines()
    if re_course.fullmatch(line.decode('cp1252'))
)
print(f"----------------------------------------------------------------------------- \n Total number of lines is {no_lines}.")

----------------------------------------------------------------------------- 
 Total number of lines is 416.


In [6]:
# Create file path for saving csv file
path = 'data/cao2021_l8' + nowstr + '.csv'

# Keep track of number of lines
no_lines = 0

# newline='' lets the csv module control the line endings, and an explicit
# encoding keeps the saved file independent of the platform default.
with open(path, 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes any field that contains a comma, so a comma inside
    # a course title cannot shift the remaining values into extra columns.
    writer = csv.writer(f)

    for line in resp.iter_lines():
        # Decode the raw bytes before matching
        dline = line.decode('cp1252')
        if re_course.fullmatch(dline):
            no_lines += 1
            # Fields are taken to be separated by runs of two or more spaces
            linesplit = re.split('  +', dline)
            writer.writerow(linesplit)

print(f"Total number of lines is {no_lines}.")

Total number of lines is 949.


### Level 6/7 Courses (HTML)

In [7]:
# Fetch the 2021 Level 6/7 points page (HTML).
# The timeout (in seconds) stops the notebook from hanging indefinitely
# if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l76.php', timeout=30)

# Check if working
# Response [200] if working
resp

<Response [200]>

In [8]:
# Pattern used to pick out the course lines of the page.
#
# An earlier, stricter pattern had some issues and did not get all of the
# lines needed (a simpler one might still be possible):
# re_course = re.compile('([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')
#
# The pattern below only requires two capital letters followed by three
# digits, and then accepts whatever follows on the line.
re_course = re.compile(r'([A-Z]{2}[0-9]{3}.*)')



In [30]:
# Count the lines of the response content that match the course pattern.
# Each raw line is decoded as cp1252 before it is tested; only lines that
# match the pattern in full (courses / points) are counted.
no_lines = sum(
    1
    for line in resp.iter_lines()
    if re_course.fullmatch(line.decode('cp1252'))
)
print(f"----------------------------------------------------------------------------- \n Total number of lines is {no_lines}.")

----------------------------------------------------------------------------- 
 Total number of lines is 416.


In [10]:
# Create file path for saving csv file
path = 'data/cao2021_l76' + nowstr + '.csv'

# Keep track of number of lines
no_lines = 0

# newline='' lets the csv module control the line endings, and an explicit
# encoding keeps the saved file independent of the platform default.
with open(path, 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes any field that contains a comma, so a comma inside
    # a course title cannot shift the remaining values into extra columns.
    writer = csv.writer(f)

    for line in resp.iter_lines():
        # Decode the raw bytes before matching
        dline = line.decode('cp1252')
        if re_course.fullmatch(dline):
            no_lines += 1
            # Fields are taken to be separated by runs of two or more spaces
            linesplit = re.split('  +', dline)
            writer.writerow(linesplit)

print(f"Total number of lines is {no_lines}.")

Total number of lines is 416.


### Level 6/7/8 Courses (xlsx)

In [11]:
# Timestamped file name for the local copy of the 2021 spreadsheet
path = f'data/cao2021_{nowstr}.xlsx'

In [12]:
# Download the 2021 spreadsheet and keep a copy at `path`
urlrq.urlretrieve(
    url='http://www2.cao.ie/points/CAOPointsCharts2021.xlsx',
    filename=path,
)

('data/cao2021_20211228_191548.xlsx',
 <http.client.HTTPMessage at 0x1fb10a74fa0>)

In [13]:
# Read the 2021 spreadsheet straight from the CAO site into a dataframe.
# The first 11 rows of the sheet are skipped.
df2021 = pd.read_excel(
    io='http://www2.cao.ie/points/CAOPointsCharts2021.xlsx',
    skiprows=11,
)

In [14]:
# Timestamped csv file name for the 2021 dataframe
path = f'data/cao2021_{nowstr}.csv'

# Write the dataframe out as csv
df2021.to_csv(path_or_buf=path)

In [57]:
# Keep only the columns needed for the analysis
df2021 = df2021[
    [
        "Course Code",
        "Course Title",
        "EOS Points",
        "EOS Midpoints",
        "Course Level",
    ]
]

In [60]:
 columns = ["Code", "Title", "Points", "MidPoints", "Level"]

In [61]:
# Replace the spreadsheet's column headers with the shared short names
df2021.columns = columns

In [62]:
# Preview the first rows of the 2021 dataframe
df2021.head()

Unnamed: 0,Code,Title,Points,MidPoints,Level
0,AL605,Music and Instrument Technology,211,319,6
1,AL630,Pharmacy Technician,308,409,6
2,AL631,Dental Nursing,311,400,6
3,AL632,Applied Science,297,454,6
4,AL650,Business,AQA,351,6


***

# 2020 CAO Points

https://www.cao.ie/index.php?page=points&p=2020

***

In [15]:
# Timestamped file name for the local copy of the 2020 spreadsheet
path = f'data/cao2020_{nowstr}.xlsx'

In [16]:
# Download the 2020 spreadsheet and keep a copy at `path`
urlrq.urlretrieve(
    url='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    filename=path,
)

('data/cao2020_20211228_191548.xlsx',
 <http.client.HTTPMessage at 0x1fb10b5c0d0>)

In [53]:
# Read the 2020 spreadsheet straight from the CAO site into a dataframe.
# The first 10 rows of the sheet are skipped.
df2020 = pd.read_excel(
    io='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    skiprows=10,
)

In [51]:
# Timestamped csv file name for the 2020 dataframe
path = f'data/cao2020_{nowstr}.csv'

# Write the dataframe out as csv
df2020.to_csv(path_or_buf=path)

In [55]:
# Keep only the columns needed for the analysis
df2020 = df2020[
    [
        "COURSE CODE2",
        "COURSE TITLE",
        "EOS",
        "EOS Mid-point",
        "LEVEL",
    ]
]

In [63]:
# Replace the spreadsheet's column headers with the shared short names
df2020.columns = columns

In [64]:
# Preview the first rows of the 2020 dataframe
df2020.head()

Unnamed: 0,Code,Title,Points,MidPoints,Level
0,AC120,International Business,209,280,8
1,AC137,Liberal Arts,252,270,8
2,AD101,"First Year Art & Design (Common Entry,portfolio)",#+matric,#+matric,8
3,AD102,Graphic Design and Moving Image Design (portfo...,#+matric,#+matric,8
4,AD103,Textile & Surface Design and Jewellery & Objec...,#+matric,#+matric,8


***

# 2019 CAO Points

https://www.cao.ie/index.php?page=points&p=2019

***

### Steps to reproduce

1. Download original pdf file
2. Open original pdf file in Microsoft Word
3. Save Microsoft Word's converted pdf in docx format
4. Re-save Word document for editing
5. Delete headers and footers
6. Delete preamble on page 1
7. Select all and copy
8. Paste into Excel
9. Manually delete blank lines
10. Save as xlsx document

### Level 8 Courses

In [35]:
# Timestamped file name for saving the 2019 Level 8 data
path = f'data/cao2019_l8{nowstr}.tsv'

In [36]:
# Load the manually prepared 2019 Level 8 spreadsheet into a dataframe
df2019L8 = pd.read_excel(io='data/cao2019_20211102_200630.xlsx')

In [72]:
# Save as tab-separated values (the path ends in .tsv)
df2019L8.to_csv(path_or_buf=path, sep='\t')

In [65]:
# Rename columns to match other dfs.
# Every shared name except the last one ("Level") is used here; the Level
# column is added to this dataframe in a separate step.
df2019L8.columns = columns[:-1]

In [77]:
# Mark every row of this dataframe as Level 8 (stored as the string "8")
df2019L8["Level"] = "8"

In [76]:
# Preview the first rows of the 2019 Level 8 dataframe
df2019L8.head()

Unnamed: 0,Code,Title,Points,MidPoints,Level
0,AL801,Software Design with Virtual Reality and Gaming,304,328.0,8
1,AL802,Software Design with Cloud Computing,301,306.0,8
2,AL803,Software Design with Mobile Apps and Connected...,309,337.0,8
3,AL805,Network Management and Cloud Infrastructure,329,442.0,8
4,AL810,Quantity Surveying,307,349.0,8


### Level 6/7 Courses

In [73]:
# Timestamped file name for saving the 2019 Level 6/7 data
path = f'data/cao2019_l67{nowstr}.tsv'

In [74]:
# Load the manually prepared 2019 Level 6/7 spreadsheet into a dataframe
df2019L67 = pd.read_excel(io='data/cao2019_20211202_193530_l67.xlsx')

In [69]:
# Save as tab-separated values (the path ends in .tsv)
df2019L67.to_csv(path_or_buf=path, sep='\t')

In [70]:
# Rename columns to match other dfs.
# Every shared name except the last one ("Level") is used here; the Level
# column is added to this dataframe in a separate step.
df2019L67.columns = columns[:-1]

In [78]:
# Mark every row of this dataframe as Level 6/7 (stored as the string "6/7")
df2019L67["Level"] = "6/7"

In [79]:
# Preview the first rows of the 2019 Level 6/7 dataframe
df2019L67.head()

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid,Level
0,AL600,Software Design,205,306.0,6/7
1,AL601,Computer Engineering,196,272.0,6/7
2,AL602,Mechanical Engineering,258,424.0,6/7
3,AL604,Civil Engineering,252,360.0,6/7
4,AL630,Pharmacy Technician,306,366.0,6/7


---
# End