#  CAO Points Analysis


https://www.independent.ie/life/family/learning/understanding-your-cao-course-guide-26505318.html


The mid point is the points score of the applicant in the middle of the list of offerees when that list is sorted by points score.


Where the letters AQA appear instead of points, it means that all qualified applicants for a course were offered a place. AQA sometimes suggests that the points of the last qualified applicant being offered a place were less than 100.


# Imports

In [None]:
# Regular expressions 
import re
# Convenient HTTP requests
import requests as rq
# Dates and times
import datetime as dt
# Import numpy module
import numpy as np
# For downloading.
import urllib.request as urlrq
import urllib.parse as urlpar
# Import tabula to read table in pdf
import tabula as tb
# Import matplotlib
import matplotlib.pyplot as plt
# Import time
import time
# import pandas
import pandas as pd 
# import seaborn
import seaborn as sns
# import warnings
import warnings
warnings.filterwarnings("ignore")

# Error Checks

In [None]:
########################################################################
# Run timestamp, shared by the cells that build file names
########################################################################
# Capture the moment this cell is executed
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS so it can be embedded in a file name
nowstr = f"{now:%Y%m%d_%H%M%S}"

print(nowstr)

In [None]:
########################################################################
# Function to test URLs
# https://pytutorial.com/check-url-is-reachable
########################################################################
def url_checker(url):
    """Report whether *url* can be fetched with an HTTP GET.

    Prints ``"<url>: is reachable"`` when the server answers with a
    non-error status, and ``"<url>: is not reachable"`` for any 4xx/5xx
    status.

    Args:
        url: Address to request.

    Raises:
        SystemExit: If the request itself fails (connection error,
            timeout, invalid URL, ...). The exit message carries the URL
            and the underlying error.
    """
    try:
        # A timeout stops an unresponsive server from hanging the run.
        resp = rq.get(url, timeout=30)
    except rq.exceptions.RequestException as e:
        # Hand the message to SystemExit directly: wrapping it in print()
        # passes None instead, which is reported as a successful exit.
        raise SystemExit(f"{url}: is Not reachable \nErr: {e}") from e

    # Every client/server error makes the resource unusable, not only 404.
    if resp.status_code >= 400:
        print(f"{url}: is not reachable")
    else:
        print(f"{url}: is reachable")

In [None]:
########################################################################
# Function to save CAO Webpages 
########################################################################

def htmlcopy(url):
    """Download a web page and archive it under ``data/``.

    The copy is written to
    ``data/<nowstr>_CAO_Webpage_<last four characters of url>.html``.
    Nothing is written unless the server answers with status 200.

    Args:
        url: Address of the page to fetch.
    """
    import os

    # A timeout stops an unresponsive server from hanging the run.
    resp = rq.get(url, timeout=30)

    # Guard clause: only a plain 200 answer is archived.
    if resp.status_code != 200:
        print(f"{url}: is not reachable")
        return

    print(f"{url}: is reachable")

    path = 'data/' + nowstr + '_CAO_Webpage_' + url[-4:] + '.html'
    print(str(path))

    # Make sure the target folder exists before writing into it.
    os.makedirs('data', exist_ok=True)

    # Save the bytes exactly as the server sent them. Writing resp.text in
    # text mode would re-encode with the platform default encoding and
    # translate newlines, so the copy could differ from the original or
    # fail with UnicodeEncodeError.
    with open(path, 'wb') as f:
        f.write(resp.content)
    

In [None]:
########################################################################
# Function to save files
########################################################################

def caosavefile(url):
    """Download the file at *url* into ``data/``.

    The local name is ``data/<nowstr>_CAO_file_<last path segment of url>``.

    Args:
        url: Address of the file to download.
    """
    import os

    # Last segment of the URL path, e.g. "lvl8_19.pdf".
    filename = urlpar.urlsplit(url).path.split("/")[-1]
    path = 'data/' + nowstr + '_CAO_file_' + filename
    print(str(path))

    # urlretrieve does not create missing folders, so do it up front.
    os.makedirs('data', exist_ok=True)
    urlrq.urlretrieve(url, path)

In [None]:
########################################################################
# CAO points web pages, one per year from 2018 to 2021
########################################################################

caopointshtml = [
    'https://www.cao.ie/index.php?page=points&p=2018',
    'https://www.cao.ie/index.php?page=points&p=2019',
    'https://www.cao.ie/index.php?page=points&p=2020',
    'https://www.cao.ie/index.php?page=points&p=2021',
]

# Keep one name per year as well, for direct reference
html2018, html2019, html2020, html2021 = caopointshtml

# Archive each page in turn
for url in caopointshtml:
    htmlcopy(url)

In [None]:
########################################################################
# CAO points files
# 2021 and 2020 are Excel workbooks; 2019 and 2018 are PDFs, with one
# file for lvl8 and one for lvl76 per year.
########################################################################
caopointslist = [
    'http://www2.cao.ie/points/CAOPointsCharts2021.xlsx',
    'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    'http://www2.cao.ie/points/lvl8_19.pdf',
    'http://www2.cao.ie/points/lvl76_19.pdf',
    'http://www2.cao.ie/points/lvl8_18.pdf',
    'http://www2.cao.ie/points/lvl76_18.pdf',
]

# Keep one name per file as well, for direct reference
CAO2021, CAO2020, CAO2019_8, CAO2019_76, CAO2018_8, CAO2018_76 = caopointslist

########################################################################
# For every file: report whether it is available, then save a copy
########################################################################
for url in caopointslist:
    url_checker(url)
    caosavefile(url)
    

# 2021 Points
# Load the spreadsheet using pandas

https://www.cao.ie/index.php?page=points&p=2021

In [35]:
# Download the 2021 workbook and parse it with pandas.
# skiprows=11 drops the first 11 rows of the sheet before the header is read.
# Column renaming approach:
# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
df = pd.read_excel(CAO2021, skiprows=11)

# Workbook column -> short column name used in this notebook.
# The 'R2 Points ' key is looked up with its trailing space.
columns_2021 = {
    'Course Code': 'Code',
    'Course Title': 'Title',
    'R1 Points': 'R1 Points',
    'R2 Points ': 'R2 Points',
    'EOS Points': 'EOS',
    'EOS Midpoints': 'Mid',
    'CATEGORY (ISCED Description)': 'Category',
    'HEI': 'College',
}

# Keep only the mapped columns, in mapping order, under their short names
DF2021 = df[list(columns_2021)].rename(columns=columns_2021)

DF2021

Unnamed: 0,Code,Title,R1 Points,R2 Points,EOS,Mid,Category,College
0,AL605,Music and Instrument Technology,211,,211,319,Engineering and engineering trades,Athlone Institute of Technology
1,AL630,Pharmacy Technician,308,,308,409,Health,Athlone Institute of Technology
2,AL631,Dental Nursing,311,,311,400,Health,Athlone Institute of Technology
3,AL632,Applied Science,297,,297,454,Biological and related sciences,Athlone Institute of Technology
4,AL650,Business,AQA,AQA,AQA,351,Business and administration,Athlone Institute of Technology
...,...,...,...,...,...,...,...,...
1446,WD211,Creative Computing,270,,270,392,Information and Communication Technologies (ICTs),Waterford Institute of Technology
1447,WD212,Recreation and Sport Management,262,,262,304,Personal services,Waterford Institute of Technology
1448,WD230,Mechanical and Manufacturing Engineering,230,230,230,361,Engineering and engineering trades,Waterford Institute of Technology
1449,WD231,Early Childhood Care and Education,266,,266,366,Welfare,Waterford Institute of Technology


# 2020 Points
# Load the spreadsheet using pandas

https://www.cao.ie/index.php?page=points&p=2020


In [34]:
# Download the 2020 workbook and parse it with pandas.
# skiprows=10 drops the first 10 rows of the sheet before the header is read.
df = pd.read_excel(CAO2020, skiprows=10)

# Workbook column -> short column name used in this notebook.
# 'EOS' already has its final name and maps to itself.
columns_2020 = {
    'COURSE CODE2': 'Code',
    'COURSE TITLE': 'Title',
    'R1 POINTS': 'R1 Points',
    'R2 POINTS': 'R2 Points',
    'EOS': 'EOS',
    'EOS Mid-point': 'Mid',
    'CATEGORY (i.e.ISCED description)': 'Category',
    'HEI': 'College',
}

# Keep only the mapped columns, in mapping order, under their short names
DF2020 = df[list(columns_2020)].rename(columns=columns_2020)

DF2020

Unnamed: 0,Code,Title,R1 Points,R2 Points,EOS,Mid,Category,College
0,AC120,International Business,209,,209,280,Business and administration,American College
1,AC137,Liberal Arts,252,,252,270,Humanities (except languages),American College
2,AD101,"First Year Art & Design (Common Entry,portfolio)",#+matric,,#+matric,#+matric,Arts,National College of Art and Design
3,AD102,Graphic Design and Moving Image Design (portfo...,#+matric,,#+matric,#+matric,Arts,National College of Art and Design
4,AD103,Textile & Surface Design and Jewellery & Objec...,#+matric,,#+matric,#+matric,Arts,National College of Art and Design
...,...,...,...,...,...,...,...,...
1459,WD208,Manufacturing Engineering,188,,188,339,Manufacturing and processing,Waterford Institute of Technology
1460,WD210,Software Systems Development,279,,279,337,Information and Communication Technologies (ICTs),Waterford Institute of Technology
1461,WD211,Creative Computing,271,,271,318,Information and Communication Technologies (ICTs),Waterford Institute of Technology
1462,WD212,Recreation and Sport Management,270,,270,349,Personal services,Waterford Institute of Technology


# 2019 Points
# Load the points PDF

https://www.cao.ie/index.php?page=points&p=2019


CAO2019_8 = 'http://www2.cao.ie/points/lvl8_19.pdf'
CAO2019_76 = 'http://www2.cao.ie/points/lvl76_19.pdf'
CAO2018_8 = 'http://www2.cao.ie/points/lvl8_18.pdf'
CAO2018_76 = 'http://www2.cao.ie/points/lvl76_18.pdf'


In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook —
# presumably meant to collect the CSV paths produced by pdfpoints; confirm.
csvfiles = []


def pdfpoints(url):
    """Convert the tables of the PDF at *url* into a CSV file under ``data/``.

    The CSV is named ``data/<nowstr>_CAO_file_<pdf name without extension>.csv``.

    Args:
        url: Location of the points PDF.

    Returns:
        The path of the CSV file that tabula was asked to write.
    """
    # Last segment of the URL path, e.g. "lvl8_19.pdf".
    pdfname = urlpar.urlsplit(url).path.split("/")[-1]

    # Replace the extension: keeping ".pdf" would give the CSV the same
    # name that caosavefile uses for the downloaded PDF.
    stem = pdfname.rsplit(".", 1)[0]
    csvpath = 'data/' + nowstr + '_CAO_file_' + stem + '.csv'

    # Convert the PDF named by the argument. The previous code passed
    # `path`, which is not defined inside this function.
    # NOTE(review): relies on convert_into accepting a URL the same way
    # read_pdf is used with one elsewhere in this notebook — confirm.
    tb.convert_into(url, csvpath, output_format="csv", pages='all')
    return csvpath

In [38]:
# Local file that will hold an untouched copy of the lvl8_19 PDF
path = f'data/cao2019_{nowstr}.pdf'

# Fetch the PDF from the CAO server into that file
urlrq.urlretrieve('http://www2.cao.ie/points/lvl8_19.pdf', path)


#df2019 = pd.read_csv('data/cao2019_20211029_113930_edited.csv', sep='\t')

('data/cao2019_20211220_152529.pdf',
 <http.client.HTTPMessage at 0x2436ba3e9a0>)

In [55]:
# Parse the 2019 PDF straight from the CAO site with tabula-py
# Documentation: https://pypi.org/project/tabula-py/
#
# NOTE(review): this call reads page '15' while the JSON call below reads
# page 16 — confirm the difference is intended.
df = tb.read_pdf('http://www2.cao.ie/points/lvl8_19.pdf', pages='15', guess=True)


# Ask for JSON output to get the position and size of the detected table
tables = tb.read_pdf('http://www2.cao.ie/points/lvl8_19.pdf', output_format="json", pages=16, silent=True)
first_table = tables[0]
top, left = first_table["top"], first_table["left"]
# Far edges are the origin plus the reported extent
bottom = top + first_table["height"]
right = left + first_table["width"]
print(f"{top=}\n{bottom=}\n{left=}\n{right=}")

# Values recorded from an earlier run:
# top=76.331535
# bottom=760.7189251367188
# left=57.589046
# right=526.9946246132813


# tabula's area option is ordered [top, left, bottom, right]
# Example from page 2 json output: area = [30.0, 59.0, 761.0, 491.0]
# These values can be nudged slightly to take in a wider data area.


df

top=76.33153
bottom=762.8789543164063
left=57.589046
right=526.9946246132813


[    LC362       Biotechnology and Biopharmaceutical Sciences   379    402
 0   LC371             Creative Broadcast and Film Production   298  339.0
 1   LC372                    Music Technology and Production   321  369.0
 2   LC374        Renewable and Electrical Energy Engineering   317  368.0
 3   LC375            Industrial Automation & Robotic Systems   321  368.0
 4   LC376                             Electronic Engineering   308  346.0
 5   LC380    Automotive Engineering and Transport Management   282  329.0
 6   LC392                          Beauty and Spa Management   217  318.0
 7   LC393                   Early Childhood Education & Care   260  309.0
 8   LC401         Early Childhood Education & Care (Thurles)   298  387.0
 9   LC402                         Social Care Work (Thurles)   209  290.0
 10  LC408  Environmental and Geographical Sciences (Thurles)   262  346.0
 11  LC411                 Marketing and Management (Thurles)   226  300.0
 12  LC418  Computing - G

In [40]:
# Destination for the CSV version of the downloaded PDF
csvpath = f'data/cao2019_{nowstr}.csv'

# Have tabula-py write the tables from all pages of the PDF at `path` to that CSV
tb.convert_into(path, csvpath, pages='all', output_format="csv")

In [None]:
# Load the converted CSV into a pandas DataFrame (comma is the default separator)
df2019 = pd.read_csv(csvpath)

print(df2019)

In [None]:
# Per the original note, rows that hold a college name have NaN in the
# "Course Code" column, so selecting the NaN rows lists them.
# https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/

# Boolean mask: True where "Course Code" is missing
bool_series = df2019["Course Code"].isna()

# Show only the rows where the mask is True
df2019[bool_series]


# End