# CAO Points Notebook
***

#### Required Modules

In [1]:
# Convenient HTTP requests
import requests as rq # Requests

# Regular Expressions
import re # Regular Expressions

# Dates and times
import datetime as dt

# Pandas for dataframes
import pandas as pd

# For downloading files
import urllib.request as urlrq

<br>

## 2021 Level 8 Points
[https://www.cao.ie/index.php?page=points&p=2021&bb=points](https://www.cao.ie/index.php?page=points&p=2021&bb=points)
***

In [2]:
# Fetch the CAO URL
resp = rq.get('http://www2.cao.ie/points/l8.php')
# Testing this request - we want to get 200 back
resp

<Response [200]>

In [3]:
# Get the current date and time
now = dt.datetime.now()

# Save the date and time as a string so it can be reused
now_str = now.strftime('%Y%m%d_%H%M%S')

# Create a file path for the original data using the date and time
path21 = 'cao2021_' + now_str + '.html'

In [4]:
# Server uses the wrong encoding, we need to change it from iso-8859-1 to cp1252
original_encoding = resp.encoding
original_encoding
resp.encoding = 'cp1252'

In [5]:
# Save the file
with open(path21, 'w') as f:
    f.write(resp.text)

In [6]:
# Compile the regular expression for matching lines with course info
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.{1,53})   +(.{0,5})  +(.{0,5})')

In [7]:
# Loop through the lines of the response content
no_lines = 0
for line in resp.iter_lines():
    if re_course.match(line.decode('iso-8859-1')):
        #print(line)
        no_lines = no_lines +1
      
# Check how many lines are being captured - manual checks showed this should be 949
print (no_lines)

949


In [8]:
csv_path = 'cao2021_re_' + now_str + '.csv'
with open(csv_path,'w') as f:
    
    # Now loop through and print out the lines matching the RE
    for line in resp.iter_lines():
        if re_course.match(line.decode('cp1252')):
            csv_version = re_course.sub(r'\1,\2,\3,\4', line.decode('cp1252'))
            #print(csv_version)
            #print(line)
            f.write(csv_version + '\n')
            
            # Using the line split method - leaving for now in case I want to use later
            # Split the lines on two or more spaces
            #linesplit = re.split('  +',line.decode('cp1252'))
            #print(','.join(linesplit))
            #f.write(','.join(linesplit) + '\n') 

In [9]:
# Create a df for the 2021 data with column names
df21 = pd.read_csv(csv_path, header=None, names=["Course_Code", "Course_Title", "Points_R1", "Points_R2"])
df21

Unnamed: 0,Course_Code,Course_Title,Points_R1,Points_R2
0,AL801,Software Design for Virtual Reality and Gaming...,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructur...,321,
4,AL810,Quantity Surveying ...,328,
...,...,...,...,...
944,WD211,Creative Computing ...,270,
945,WD212,Recreation and Sport Management ...,262,
946,WD230,Mechanical and Manufacturing Engineering ...,230,230
947,WD231,Early Childhood Care and Education ...,266,


<br>

## 2020 Level 8 Points
[https://www.cao.ie/index.php?page=points&p=2020&bb=points](https://www.cao.ie/index.php?page=points&p=2020&bb=points)
***

In [10]:
# Create a filepath for the 2020 points file
path20 = 'cao2020_' + now_str + '.xlsx'

# Save original data file to disk
urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',path20)

('cao2020_20211108_195154.xlsx', <http.client.HTTPMessage at 0x21cf9f0e850>)

In [11]:
# Download and parse the excel spreadsheet
df20 = pd.read_excel('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',skiprows=10)
df20

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [12]:
# Create a filepath for the pandas dataframe
dfpath20 = 'cao2020_df' + now_str + '.csv'

# Save the dataframe
df20.to_csv(dfpath20)

<br>

## 2019 Level 8 Points
[http://www.cao.ie/index.php?page=points&p=2019](http://www.cao.ie/index.php?page=points&p=2019)
***

##### Steps to Reproduce Data
1. Download pdf from the CAO website (see link above)  
2. Open pdf in Microsoft Word  
3. Save file in .docx format  
4. Save another copy of the word doc for editing  
5. Delete headers and footers  
6. Delete preamble on first page  
7. Select all and copy  
8. Paste into Notepad++  
9. Cut HEI names and paste onto beginning of each applicable course line, followed by a tab  
10. Delete blank lines
11. Delete blank lines  
12. Replace double tabs with a single tab  
13. Change backticks to apostrophes  

In [13]:
df19 = pd.read_csv("cao2019_03112021.csv", sep='\t')

In [14]:
df19

Unnamed: 0,HEI,Course Code,INSTITUTION and COURSE,EOS,Mid
0,Athlone Institute of Technology,AL801,Software Design with Virtual Reality and Gaming,304,328.0
1,Athlone Institute of Technology,AL802,Software Design with Cloud Computing,301,306.0
2,Athlone Institute of Technology,AL803,Software Design with Mobile Apps and Connected...,309,337.0
3,Athlone Institute of Technology,AL805,Network Management and Cloud Infrastructure,329,442.0
4,Athlone Institute of Technology,AL810,Quantity Surveying,307,349.0
...,...,...,...,...,...
925,Waterford Institute of Technology,WD200,Arts (options),221,296.0
926,Waterford Institute of Technology,WD210,Software Systems Development,271,329.0
927,Waterford Institute of Technology,WD211,Creative Computing,275,322.0
928,Waterford Institute of Technology,WD212,Recreation and Sport Management,274,311.0


<br>

#### Concat & Join the Dataframes
***

In [15]:
courses21 = df21[["Course_Code", "Course_Title"]]

In [16]:
courses21

Unnamed: 0,Course_Code,Course_Title
0,AL801,Software Design for Virtual Reality and Gaming...
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructur...
4,AL810,Quantity Surveying ...
...,...,...
944,WD211,Creative Computing ...
945,WD212,Recreation and Sport Management ...
946,WD230,Mechanical and Manufacturing Engineering ...
947,WD231,Early Childhood Care and Education ...


In [21]:
courses20 = df20[["COURSE CODE2", "COURSE TITLE"]]
# Change the colum names to match the 2021 df
courses20.columns = ["Course_Code", "Course_Title"]
courses20

Unnamed: 0,Course_Code,Course_Title
0,AC120,International Business
1,AC137,Liberal Arts
2,AD101,"First Year Art & Design (Common Entry,portfolio)"
3,AD102,Graphic Design and Moving Image Design (portfo...
4,AD103,Textile & Surface Design and Jewellery & Objec...
...,...,...
1459,WD208,Manufacturing Engineering
1460,WD210,Software Systems Development
1461,WD211,Creative Computing
1462,WD212,Recreation and Sport Management


In [20]:
courses19 = df19[["Course Code", "HEI"]]
# Change the colum names to match the 2021 df
courses19.columns = ["Course_Code", "Course_Title"]
courses19

Unnamed: 0,Course_Code,Course_Title
0,AL801,Athlone Institute of Technology
1,AL802,Athlone Institute of Technology
2,AL803,Athlone Institute of Technology
3,AL805,Athlone Institute of Technology
4,AL810,Athlone Institute of Technology
...,...,...
925,WD200,Waterford Institute of Technology
926,WD210,Waterford Institute of Technology
927,WD211,Waterford Institute of Technology
928,WD212,Waterford Institute of Technology


In [23]:
# Combine the 3 dfs into 1 with concat
all_courses = pd.concat([courses21, courses20, courses19])
all_courses

Unnamed: 0,Course_Code,Course_Title
0,AL801,Software Design for Virtual Reality and Gaming...
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructur...
4,AL810,Quantity Surveying ...
...,...,...
925,WD200,Waterford Institute of Technology
926,WD210,Waterford Institute of Technology
927,WD211,Waterford Institute of Technology
928,WD212,Waterford Institute of Technology


In [25]:
# Show us which courses are duplicated
all_courses[all_courses.duplicated()]

Unnamed: 0,Course_Code,Course_Title
3,AD102,Graphic Design and Moving Image Design (portfo...
196,CR220,Fine Art at CIT Crawford College of Art and De...
246,CW068,Applied Social Studies in Professional Social ...
813,LM076,Product Design and Technology (portfolio requi...
1100,TR034,Management Science and Information Systems Stu...
1104,TR040,Middle Eastern and European Languages and Cult...


In [27]:
# Returns a copy of the df with duplicates removed
all_courses.drop_duplicates()

Unnamed: 0,Course_Code,Course_Title
0,AL801,Software Design for Virtual Reality and Gaming...
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructur...
4,AL810,Quantity Surveying ...
...,...,...
925,WD200,Waterford Institute of Technology
926,WD210,Waterford Institute of Technology
927,WD211,Waterford Institute of Technology
928,WD212,Waterford Institute of Technology


In [30]:
# Returns all courses where course code was duplicated (i.e. course was available each year)
all_courses[all_courses.duplicated(subset=["Course_Code"])]

Unnamed: 0,Course_Code,Course_Title
0,AC120,International Business
1,AC137,Liberal Arts
2,AD101,"First Year Art & Design (Common Entry,portfolio)"
3,AD102,Graphic Design and Moving Image Design (portfo...
4,AD103,Textile & Surface Design and Jewellery & Objec...
...,...,...
925,WD200,Waterford Institute of Technology
926,WD210,Waterford Institute of Technology
927,WD211,Waterford Institute of Technology
928,WD212,Waterford Institute of Technology
