# Loading CAO points information into a pandas data frame, using the CAO website

http://www.cao.ie/index.php?page=points&p=2021

***

## INTRODUCTION

## WHAT IS CAO

### LOADING THE DATA

In [1]:
# Standard library
# CSV writing
import csv
# Dates and time
import datetime as dt
# Creating folders
import os
# Regular expressions
import re
# For downloading
import urllib.request as urlrq

# Third-party
# PDF
import camelot
# Data frames
import pandas as pd
# HTTP request
import requests as rq

## LEVEL 8 POINTS 2019, 2020, 2021
### ROUND 1 AND ROUND 2

In [2]:
# Current date and time
now = dt.datetime.now()
# Format as a string
nowstr = (now.strftime("%Y%m%d_%H%M%S"))

In [3]:
# Regular expression
re_course = re.compile(r"([A-Z]{2}[0-9]{3})(.*)")

In [4]:
# Defining a dictionary
    # Using website archive to access the wanted data (2019, 2020)
years_dict = {
    "2019": [("data/cao2019_" +  nowstr), "https://web.archive.org/web/20191019135815/http://www2.cao.ie:80/points/l8.php"],
    "2020": [("data/cao2020_" +  nowstr), "https://web.archive.org/web/20201108133105/http://www2.cao.ie/points/l8.php"],
    "2021": [("data/cao2021_" +  nowstr), "http://www2.cao.ie/points/l8.php"]
}

In [5]:
# Loop through the (dict) years
for year, content in years_dict.items():
    # Fetch the CAO points URL
    rq.get (content[1])
    resp = rq.get (content[1])
    # The server uses the wrong encoding
        # Change to "cp1252"
    resp.encoding = "cp1252"
    # Check if OK:
        #Response [200] means OK
    print (resp)
    # Save the original html file
    with open(content[0] + ".html", "w") as f:
        f.write(resp.text)
    # Keep track of how many courses we process
    no_lines = 0
    # Iterating through the lines
    resp.iter_lines()
    resps = resp.iter_lines()
    # Open the csv file for writing.
    with open(content[0] + ".csv", "w") as f:
        # Write a header row.
        f.write(','.join(["CODE", "TITLE", "R1_POINTS", "R2_POINTS"]) + "\n")
        # Loop through lines of the response.
        for line in resps:
            # Decode the line, using the wrong encoding
            dline = line.decode("cp1252")
            # Match only the lines representing courses
            if re_course.fullmatch(dline):
                # Add one to the lines counter
                no_lines = no_lines + 1
                # The course code
                course_code = dline[:5]
                # The course title
                course_title = dline[7:57]
                # Round one points
                course_points = re.split(' +', dline[60:])
                if len(course_points) != 2:
                    course_points = course_points[:2]
                # Join the fields using a comma
                linesplit = [course_code, course_title, course_points[0], course_points[1]]
                # Rejoin the substrings with commas in between
                f.write(",".join(linesplit) + "\n")   
    # Print the total number of processed lines
    print(f"Total number of lines is {no_lines}.")

<Response [200]>
Total number of lines is 930.
<Response [200]>
Total number of lines is 961.
<Response [200]>
Total number of lines is 949.


# READ AND JOIN ALL TABLES - TESTING

In [6]:
df2019 = pd.read_csv(("data/cao2019_" +  nowstr + ".csv"), encoding='cp1252')
df2019

Unnamed: 0,CODE,TITLE,R1_POINTS,R2_POINTS
0,AL801,Software Design with Virtual Reality and Gamin...,304,
1,AL802,Software Design with Cloud Computing ...,301,
2,AL803,Software Design with Mobile Apps and Connected...,309,
3,AL805,Network Management and Cloud Infrastructure ...,329,
4,AL810,Quantity Surveying ...,307,
...,...,...,...,...
925,WD200,Arts (options) ...,AQA,AQA
926,WD210,Software Systems Development ...,271,
927,WD211,Creative Computing ...,275,
928,WD212,Recreation and Sport Management ...,274,


In [7]:
df2020 = pd.read_csv(("data/cao2020_" +  nowstr + ".csv"), encoding='cp1252')
df2020

Unnamed: 0,CODE,TITLE,R1_POINTS,R2_POINTS
0,AL801,Software Design with Virtual Reality and Gamin...,303,
1,AL802,Software Design with Artificial Intelligence f...,332,
2,AL803,Software Design with Mobile Apps and Connected...,337,
3,AL805,Computer Engineering with Network Infrastructu...,333,
4,AL810,Quantity Surveying ...,319,
...,...,...,...,...
956,WD200,Arts (options) ...,AQA,AQA
957,WD210,Software Systems Development ...,279,
958,WD211,Creative Computing ...,271,
959,WD212,Recreation and Sport Management ...,270,


In [8]:
df2021 = pd.read_csv(("data/cao2021_" +  nowstr + ".csv"), encoding='cp1252')
df2021

Unnamed: 0,CODE,TITLE,R1_POINTS,R2_POINTS
0,AL801,Software Design for Virtual Reality and Gaming...,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructur...,321,
4,AL810,Quantity Surveying ...,328,
...,...,...,...,...
944,WD211,Creative Computing ...,270,
945,WD212,Recreation and Sport Management ...,262,
946,WD230,Mechanical and Manufacturing Engineering ...,230,230
947,WD231,Early Childhood Care and Education ...,266,


In [9]:
courses2019 = df2019[["CODE", "TITLE"]]
courses2019

Unnamed: 0,CODE,TITLE
0,AL801,Software Design with Virtual Reality and Gamin...
1,AL802,Software Design with Cloud Computing ...
2,AL803,Software Design with Mobile Apps and Connected...
3,AL805,Network Management and Cloud Infrastructure ...
4,AL810,Quantity Surveying ...
...,...,...
925,WD200,Arts (options) ...
926,WD210,Software Systems Development ...
927,WD211,Creative Computing ...
928,WD212,Recreation and Sport Management ...


In [10]:
courses2020 = df2020[["CODE", "TITLE"]]
courses2020

Unnamed: 0,CODE,TITLE
0,AL801,Software Design with Virtual Reality and Gamin...
1,AL802,Software Design with Artificial Intelligence f...
2,AL803,Software Design with Mobile Apps and Connected...
3,AL805,Computer Engineering with Network Infrastructu...
4,AL810,Quantity Surveying ...
...,...,...
956,WD200,Arts (options) ...
957,WD210,Software Systems Development ...
958,WD211,Creative Computing ...
959,WD212,Recreation and Sport Management ...


In [11]:
courses2021 = df2021[["CODE", "TITLE"]]
courses2021

Unnamed: 0,CODE,TITLE
0,AL801,Software Design for Virtual Reality and Gaming...
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructur...
4,AL810,Quantity Surveying ...
...,...,...
944,WD211,Creative Computing ...
945,WD212,Recreation and Sport Management ...
946,WD230,Mechanical and Manufacturing Engineering ...
947,WD231,Early Childhood Care and Education ...


In [12]:
allcourses = pd.concat([courses2019, courses2020, courses2021])
allcourses

Unnamed: 0,CODE,TITLE
0,AL801,Software Design with Virtual Reality and Gamin...
1,AL802,Software Design with Cloud Computing ...
2,AL803,Software Design with Mobile Apps and Connected...
3,AL805,Network Management and Cloud Infrastructure ...
4,AL810,Quantity Surveying ...
...,...,...
944,WD211,Creative Computing ...
945,WD212,Recreation and Sport Management ...
946,WD230,Mechanical and Manufacturing Engineering ...
947,WD231,Early Childhood Care and Education ...


In [13]:
allcourses.drop_duplicates(subset=["CODE"], inplace=True, ignore_index=True)

In [14]:
# Show the combined course list after duplicate course codes were dropped
allcourses

Unnamed: 0,CODE,TITLE
0,AL801,Software Design with Virtual Reality and Gamin...
1,AL802,Software Design with Cloud Computing ...
2,AL803,Software Design with Mobile Apps and Connected...
3,AL805,Network Management and Cloud Infrastructure ...
4,AL810,Quantity Surveying ...
...,...,...
1141,SG349,Electronics and Self Driving Technologies ...
1142,SG350,Robotics and Automation ...
1143,TL874,Inclusive Sport and Physical Activity ...
1144,WD231,Early Childhood Care and Education ...


In [15]:
#allcourses.to_csv ("all.csv")

In [16]:
# Suffix the points columns with the year and index the frame by course code.
# Renaming by column name and guarding on the index makes the cell safe to
# re-run; positional assignment of four names fails once CODE is the index.
if df2019.index.name != "CODE":
    df2019 = df2019.rename(
        columns={"R1_POINTS": "R1_POINTS_2019", "R2_POINTS": "R2_POINTS_2019"}
    ).set_index("CODE")
df2019

Unnamed: 0_level_0,TITLE,R1_POINTS_2019,R2_POINTS_2019
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL801,Software Design with Virtual Reality and Gamin...,304,
AL802,Software Design with Cloud Computing ...,301,
AL803,Software Design with Mobile Apps and Connected...,309,
AL805,Network Management and Cloud Infrastructure ...,329,
AL810,Quantity Surveying ...,307,
...,...,...,...
WD200,Arts (options) ...,AQA,AQA
WD210,Software Systems Development ...,271,
WD211,Creative Computing ...,275,
WD212,Recreation and Sport Management ...,274,


In [17]:
# Suffix the points columns with the year and index the frame by course code.
# Renaming by column name and guarding on the index makes the cell safe to
# re-run; positional assignment of four names fails once CODE is the index.
if df2020.index.name != "CODE":
    df2020 = df2020.rename(
        columns={"R1_POINTS": "R1_POINTS_2020", "R2_POINTS": "R2_POINTS_2020"}
    ).set_index("CODE")
df2020

Unnamed: 0_level_0,TITLE,R1_POINTS_2020,R2_POINTS_2020
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL801,Software Design with Virtual Reality and Gamin...,303,
AL802,Software Design with Artificial Intelligence f...,332,
AL803,Software Design with Mobile Apps and Connected...,337,
AL805,Computer Engineering with Network Infrastructu...,333,
AL810,Quantity Surveying ...,319,
...,...,...,...
WD200,Arts (options) ...,AQA,AQA
WD210,Software Systems Development ...,279,
WD211,Creative Computing ...,271,
WD212,Recreation and Sport Management ...,270,


In [18]:
# Suffix the points columns with the year and index the frame by course code.
# Renaming by column name and guarding on the index makes the cell safe to
# re-run; positional assignment of four names fails once CODE is the index.
if df2021.index.name != "CODE":
    df2021 = df2021.rename(
        columns={"R1_POINTS": "R1_POINTS_2021", "R2_POINTS": "R2_POINTS_2021"}
    ).set_index("CODE")
df2021

Unnamed: 0_level_0,TITLE,R1_POINTS_2021,R2_POINTS_2021
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL801,Software Design for Virtual Reality and Gaming...,300,
AL802,Software Design in Artificial Intelligence for...,313,
AL803,Software Design for Mobile Apps and Connected ...,350,
AL805,Computer Engineering for Network Infrastructur...,321,
AL810,Quantity Surveying ...,328,
...,...,...,...
WD211,Creative Computing ...,270,
WD212,Recreation and Sport Management ...,262,
WD230,Mechanical and Manufacturing Engineering ...,230,230
WD231,Early Childhood Care and Education ...,266,


In [19]:
# Index the course list by code, then attach the 2019 round-one points.
# Both steps are written so the cell can be re-run: the index is only set
# once, and a previously joined column is dropped before joining again
# (a second join would otherwise fail on the overlapping column name).
if allcourses.index.name != "CODE":
    allcourses = allcourses.set_index("CODE")
allcourses = (
    allcourses
    .drop(columns=["R1_POINTS_2019"], errors="ignore")
    .join(df2019[["R1_POINTS_2019"]])
)
allcourses

Unnamed: 0_level_0,TITLE,R1_POINTS_2019
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1
AL801,Software Design with Virtual Reality and Gamin...,304
AL802,Software Design with Cloud Computing ...,301
AL803,Software Design with Mobile Apps and Connected...,309
AL805,Network Management and Cloud Infrastructure ...,329
AL810,Quantity Surveying ...,307
...,...,...
SG349,Electronics and Self Driving Technologies ...,
SG350,Robotics and Automation ...,
TL874,Inclusive Sport and Physical Activity ...,
WD231,Early Childhood Care and Education ...,


In [20]:
# Attach the 2020 round-one points by course code.
# Any column left by an earlier run of this cell is dropped first, because
# joining a column name that already exists raises an error.
allcourses = (
    allcourses
    .drop(columns=["R1_POINTS_2020"], errors="ignore")
    .join(df2020[["R1_POINTS_2020"]])
)
allcourses

Unnamed: 0_level_0,TITLE,R1_POINTS_2019,R1_POINTS_2020
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AL801,Software Design with Virtual Reality and Gamin...,304,303
AL802,Software Design with Cloud Computing ...,301,332
AL803,Software Design with Mobile Apps and Connected...,309,337
AL805,Network Management and Cloud Infrastructure ...,329,333
AL810,Quantity Surveying ...,307,319
...,...,...,...
SG349,Electronics and Self Driving Technologies ...,,
SG350,Robotics and Automation ...,,
TL874,Inclusive Sport and Physical Activity ...,,
WD231,Early Childhood Care and Education ...,,


In [21]:
# Attach the 2021 round-one points by course code.
# Any column left by an earlier run of this cell is dropped first, because
# joining a column name that already exists raises an error.
allcourses = (
    allcourses
    .drop(columns=["R1_POINTS_2021"], errors="ignore")
    .join(df2021[["R1_POINTS_2021"]])
)
allcourses

Unnamed: 0_level_0,TITLE,R1_POINTS_2019,R1_POINTS_2020,R1_POINTS_2021
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AL801,Software Design with Virtual Reality and Gamin...,304,303,300
AL802,Software Design with Cloud Computing ...,301,332,313
AL803,Software Design with Mobile Apps and Connected...,309,337,350
AL805,Network Management and Cloud Infrastructure ...,329,333,321
AL810,Quantity Surveying ...,307,319,328
...,...,...,...,...
SG349,Electronics and Self Driving Technologies ...,,,
SG350,Robotics and Automation ...,,,384
TL874,Inclusive Sport and Physical Activity ...,,,341
WD231,Early Childhood Care and Education ...,,,266


In [47]:
# Order the combined table by course code (returns a new frame, no in-place edit)
allcourses = allcourses.sort_values("CODE")
# Save the combined table
allcourses.to_csv("all.csv")
# Display last: a bare expression only renders when it ends the cell
allcourses

# END TEST

## LEVEL 8 POINTS 2020, 2019
### EOS AND MED
http://www.cao.ie/index.php?page=points&p=2020

### 2020 .xlsx

In [23]:
# Create a file path for the original data
path = ("data/cao2020_eos" +  nowstr + ".xlsx")

In [24]:
urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', path)

('data/cao2020_eos20211115_092445.xlsx',
 <http.client.HTTPMessage at 0x1f5895e3bb0>)

In [25]:
# Parse the spreadsheet that was saved to disk, skipping the first 10 rows.
# Reading the saved copy avoids a second download and guarantees the data
# frame matches the archived original. The path is rebuilt here rather than
# taken from `path`, because that name is reassigned by later cells.
df = pd.read_excel("data/cao2020_eos" + nowstr + ".xlsx", skiprows=10)
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [26]:
# Spot-check one row part-way through the table (position 753)
df.iloc[753]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Road Transport Technology and Management
COURSE CODE2                                                           LC286
R1 POINTS                                                                264
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      264
EOS Random *                                                             NaN
EOS Mid-point                                                            360
LEVEL                                                                      7
HEI                                         Limerick Institute of Technology
Test/Interview #                                                         NaN

In [27]:
# Spot-check the last row of the table
df.iloc[-1]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [28]:
# Create a file path for the pandas data
path = ("data/cao2020_eos" +  nowstr + ".csv")

In [29]:
# Save pandas data frame to disk
df.to_csv(path)

***

### 2019 .pdf

### GET THE ORIGINAL

In [30]:
path = 'data/cao2019_eos' + nowstr + '.pdf'

In [31]:
# Fetch the 2019 Level 8 points PDF. A timeout stops the cell hanging
# indefinitely, and raise_for_status() stops an error page being saved
# as a .pdf by the next cell.
resp_pdf = rq.get("http://www2.cao.ie/points/lvl8_19.pdf", timeout=60)
resp_pdf.raise_for_status()
resp_pdf

<Response [200]>

In [32]:
with open(path, 'wb') as f:
    f.write(resp_pdf.content)

### READ THE TABLE FROM PDF

In [33]:
file = path
tables = camelot.read_pdf(file, pages = "all", flavor = "lattice")

In [34]:
print ("Tables:", tables.n)

Tables: 18


Reference for concatenating data frames collected in a loop: https://stackoverflow.com/questions/52383287/concatenate-dataframes-in-a-for-loop

In [35]:
# Collect the data frame of every table camelot extracted.
# tables.n replaces the hard-coded 18 so the cell follows the actual table
# count, and the comprehension avoids overwriting the notebook-level `df`.
table_total = [tables[i].df for i in range(tables.n)]

# Stack the tables into one frame with a fresh row index and save it
table = pd.concat(table_total, ignore_index=True)
table.to_csv("data/cao2019_eos" +  nowstr + ".csv", index = False)

### DATA COMPARISON

# CONCLUSION

***

## REFERENCES

## LINKS TO CHECK