# CAO Points Notebook
***

In [19]:
# Standard library
# CSV writing with proper quoting
import csv
# Dates and times
import datetime as dt
# Regular Expressions
import re # Regular Expressions
# For downloading files
import urllib.request as urlrq

# Third-party
# Pandas for dataframe
import pandas as pd
# Convenient HTTP requests
import requests as rq # Requests

<br>

## 2021 Points
[https://www.cao.ie/index.php?page=points&p=2021&bb=points](https://www.cao.ie/index.php?page=points&p=2021&bb=points)
***

In [2]:
# Fetch the CAO URL
# A timeout stops this cell hanging forever if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
# Raise straight away on a 4xx/5xx reply instead of parsing an error page later.
resp.raise_for_status()
# Display the response object as a quick visual check
resp

<Response [200]>

In [3]:
# Timestamp this run once; every file saved by the notebook reuses it
now = dt.datetime.now()
# Compact sortable form: YYYYMMDD_HHMMSS
now_str = f'{now:%Y%m%d_%H%M%S}'
# File name for the untouched 2021 HTML page, tagged with the timestamp
path = f'cao2021_{now_str}.html'

In [4]:
# Server uses the wrong encoding, we need to change it from iso-8859-1 to cp1252
# Remember what requests reported before overriding it.
# NOTE: re-running this cell would capture 'cp1252' here instead.
original_encoding = resp.encoding
# From here on resp.text is decoded as cp1252
resp.encoding = 'cp1252'
# Last expression in the cell, so the notebook actually displays it
original_encoding

In [5]:
# Save the file
# encoding='utf-8' makes the saved copy identical on every platform and can
# represent any character in resp.text; the platform default might not.
# newline='' writes the page's own line endings without translating them.
with open(path, 'w', encoding='utf-8', newline='') as f:
    f.write(resp.text)

In [6]:
# Regular expression recognising a line that holds course information:
#   group 1 - course code: two capital letters followed by three digits
#   group 2 - text after the code, up to a run of spaces
#   groups 3 and 4 - the remaining two columns, separated by runs of spaces
course_pattern = r'([A-Z]{2}[0-9]{3})  (.*)   +(.*)  +(.*)'
re_course = re.compile(course_pattern)

In [7]:
# Count the lines of the response that match the course pattern.
# Lines are decoded as cp1252, the same encoding used when the rows are
# exported to CSV, so this count describes exactly the text that gets written.
no_lines = sum(
    1
    for line in resp.iter_lines()
    if re_course.match(line.decode('cp1252'))
)

# Check how many lines are being captured
print(no_lines)

949


In [8]:
# Export every course row of the 2021 page to a timestamped CSV file.
csv_path = 'cao2021_re_' + now_str + '.csv'

# encoding='utf-8' keeps the output independent of the platform default;
# newline='' hands control of line endings to the csv module.
with open(csv_path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that itself contains a comma, so such a
    # field no longer shifts the columns to its right.
    writer = csv.writer(f, lineterminator='\n')

    for line in resp.iter_lines():
        # Decode and match each line once; the match object carries the groups.
        match = re_course.match(line.decode('cp1252'))
        if match:
            # One CSV row per course line: the four captured groups, in order
            writer.writerow(match.groups())

<br>

## 2020 Points
[https://www.cao.ie/index.php?page=points&p=2020&bb=points](https://www.cao.ie/index.php?page=points&p=2020&bb=points)
***

In [21]:
# Timestamped local file name for the 2020 points spreadsheet
path = f'cao2020_{now_str}.xlsx'

# Keep an untouched copy of the source workbook on disk
urlrq.urlretrieve(
    url='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    filename=path,
)

('cao2020_20211101_191342.xlsx', <http.client.HTTPMessage at 0x2709741a850>)

In [15]:
# Parse the 2020 spreadsheet from the copy already saved to disk rather than
# downloading it a second time, so the archived file and the dataframe are
# guaranteed to hold the same data.
# The file name is rebuilt from now_str because `path` is reused for other
# files elsewhere in the notebook.
xlsx_path = 'cao2020_' + now_str + '.xlsx'
# Skip the first 10 rows of the sheet (presumably preamble above the header row; TODO confirm)
df = pd.read_excel(xlsx_path, skiprows=10)
df

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [22]:
# Timestamped file name for the CSV export of the 2020 dataframe
path = f'cao2020_df{now_str}.csv'

# Write the dataframe out as CSV
df.to_csv(path_or_buf=path)