# Loading CAO points information into a pandas data frame, using the CAO website

http://www.cao.ie/index.php?page=points&p=2021

***

## INTRODUCTION

## WHAT IS CAO

### LOADING THE DATA

In [1]:
# CSV writing
import csv
# Dates and time
import datetime as dt
# Regular expressions
import re
# For downloading
import urllib.request as urlrq

# Data frames
import pandas as pd
# HTTP request
import requests as rq

## 2021 POINTS

In [2]:
# Fetch the 2021 points page.
# The timeout stops the notebook from hanging forever if the server never answers.
resp = rq.get("http://www2.cao.ie/points/l8.php", timeout=30)
# Fail here, loudly, on a 4xx/5xx reply rather than saving an error page as data.
resp.raise_for_status()
print (resp) #200 means ok

<Response [200]>


## SAVE THE ORIGINAL DATA

In [3]:
# Take one snapshot of the local date and time; the file names built
# further down all reuse this single stamp.
now = dt.datetime.now()

# Render the snapshot as YYYYMMDD_HHMMSS (e.g. 20211104_133503)
nowstr = f"{now:%Y%m%d_%H%M%S}"

In [4]:
# Timestamped location under data/ for the raw 2021 HTML page
path = f"data/cao2021_{nowstr}.html"

In [5]:
# The encoding reported for the response is wrong for this page:
# remember what was reported, then override it with cp1252 so that
# resp.text is decoded with that codec instead.
original_encoding, resp.encoding = resp.encoding, "cp1252"

In [6]:
# Save the original html file.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, so the same notebook would write different
# bytes on different machines and could raise UnicodeEncodeError on any
# character the locale codec cannot represent.
with open(path, "w", encoding="utf-8") as f:
    f.write(resp.text)

## REGULAR EXPRESSIONS

In [7]:
# Pattern for one course line, assembled from adjacent raw-string pieces
# (Python concatenates them into a single pattern at compile time).
re_course = re.compile(
    r"([A-Z]{2}[0-9]{3})"  # group 1: two capital letters followed by three digits
    r"(.*)"                # group 2: everything between group 1 and the final number
    r"([0-9]{3})"          # group 3: a three-digit number
    r"(\*?)"               # group 4: an optional asterisk right after the number
    r" *"                  # any trailing spaces
)

## LOOP THROUGH THE LINES

In [8]:
# Write every course line of the 2021 page to a CSV file and count them.
path = ("data/cao2021_csv_" +  nowstr + ".csv")

no_lines = 0

# newline="" hands line-ending control to the csv module, as its documentation
# requires; the explicit encoding keeps the file identical across platforms.
with open (path, "w", newline="", encoding="utf-8") as f:
    # csv.writer quotes any field that itself contains a comma. Joining the
    # fields with "," by hand would silently split such a field into extra columns.
    writer = csv.writer(f)
    # Loop
    for line in resp.iter_lines():
        # iter_lines() yields bytes; decode them with the same codec that
        # was set on the response.
        dline = (line.decode("cp1252"))
        # Match only lines with courses
        if re_course.fullmatch(dline):
            # Add one to the lines counter
            no_lines += 1
            # Split the line into fields on runs of two or more spaces
            linesplit = re.split ("  +", dline)
            writer.writerow(linesplit)

print ("Total number of lines:", no_lines)

Total number of lines: 922


## 2020 POINTS
http://www.cao.ie/index.php?page=points&p=2020

In [9]:
# Timestamped location under data/ for the raw 2020 spreadsheet
path = f"data/cao2020_{nowstr}.xlsx"

In [10]:
# Download the 2020 spreadsheet to the path built above; the call is left as
# the cell's last expression so its return value is displayed.
urlrq.urlretrieve(
    url='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    filename=path,
)

('data/cao2020_20211104_133503.xlsx',
 <http.client.HTTPMessage at 0x28380b77790>)

In [11]:
# Parse the spreadsheet that was just saved to disk rather than downloading it
# a second time, so the data frame is built from exactly the archived file.
# `path` still holds the .xlsx location used for the download at this point.
# NOTE(review): skiprows = 10 skips the first ten rows of the sheet,
# presumably preamble above the header row; confirm against the file.
df = pd.read_excel(path, skiprows = 10)

In [12]:
# Display the parsed 2020 data frame as the cell's output
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [13]:
# Spot-check one arbitrarily chosen row (position 753) of the parsed frame
df.iloc[753]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Road Transport Technology and Management
COURSE CODE2                                                           LC286
R1 POINTS                                                                264
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      264
EOS Random *                                                             NaN
EOS Mid-point                                                            360
LEVEL                                                                      7
HEI                                         Limerick Institute of Technology
Test/Interview #                                                         NaN

In [14]:
# Spot-check the last row of the parsed frame (position -1 counts from the end)
df.iloc[-1]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [15]:
# Timestamped location under data/ for the CSV copy of the 2020 data frame
path = f"data/cao2020_{nowstr}.csv"

In [16]:
# Write the 2020 data frame out as CSV at the path built above
df.to_csv(path_or_buf=path)

#### Regular expressions

### CAO DATA 2019, 2020, 2021

### DATA COMPARISON

# CONCLUSION

***

## REFERENCES

## LINKS TO CHECK