# CAO Points Analysis Using Pandas 

## Fundamentals of Data Analysis Assignment

### Andrew Walker - G00398788@gmit.ie

![https://pandas.pydata.org](https://pandas.pydata.org/docs/_static/pandas.svg)

In [1]:
# Convenient HTTP requests.
#import requests as rq

# Regular expressions.
#import re

# Dates and times.
#import datetime as dt

# Data frames.
#import pandas as pd

# For downloading.
#import urllib.request as urlrq

# Ian's example up to Excel Files and PDFs week

In [2]:
# --- Standard library ---
# CSV writing with correct quoting.
import csv

# Dates and times
import datetime as dt

# Regular expressions.
import re

# For downloading
import urllib.request as urlrq

# --- Third-party ---
# Data frames
import pandas as pd

# Convenient HTTP requests.
import requests as rq

<br>

## 2021 Points

http://www2.cao.ie/points/l8.php

***


In [3]:
# Fetch the CAO points URL.
# The timeout stops this cell from hanging indefinitely if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
# Fail loudly on an HTTP error status instead of carrying an error page
# forward as if it were the points data.
resp.raise_for_status()
# Have a look
resp

<Response [200]>

### Save original data set

In [4]:
# Capture the current local date and time once for this run.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS; the string is embedded in the output file names.
nowstr = f'{now:%Y%m%d_%H%M%S}'

In [5]:
# Timestamped location for the downloaded HTML page.
path = f'data/cao2021_LvL8_csv_{nowstr}.html'

<br>

## Error on server 

Technically, the server says we should decode as per:
            Content-Type: text/html; charset=iso-8859-1
However, one line uses the byte \x96, which isn't defined in iso-8859-1.
We therefore decode with the similar standard cp1252, which does define \x96.

In [6]:
# The server uses the wrong encoding, fix it.
# Keep a record of the encoding the response object had before the override.
original_encoding = resp.encoding
# Change to cp1252 so that resp.text is decoded with that codec from here on.
resp.encoding = 'cp1252'

In [7]:
# Save the original html file.
# The encoding is stated explicitly: without it open() falls back to the
# platform default, so the bytes written (and whether every character can be
# written at all) would differ from machine to machine.
# newline='' writes the page's own line endings through unchanged.
with open(path, 'w', encoding='utf-8', newline='') as f:
    f.write(resp.text)

In [8]:
# Compile the regular expression for matching lines.
#   group 1 - two capital letters followed by three digits
#   (one space)
#   group 2 - any run of characters
#   group 3 - three digits
#   group 4 - an optional asterisk
#   then any number of trailing spaces.
re_course = re.compile(r'([A-Z]{2}[0-9]{3}) (.*)([0-9]{3})(\*?) *')

In [9]:
# File path for the csv file
path = 'data/cao2021_LvL8_csv_' + nowstr + '.csv'

# Keep track of how many courses we process
no_lines = 0

# Open the csv file for writing.
# - encoding is explicit so the output does not depend on the platform default;
# - newline='' leaves line endings entirely to the csv writer.
with open(path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that itself contains a comma, so such a
    # field stays in a single column instead of spilling into extra ones.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through lines of the response
    for line in resp.iter_lines():
        # iter_lines() yields raw bytes here; decode them as cp1252 rather
        # than with the charset the server declares.
        dline = line.decode('cp1252')
        # Match only the lines we want - the ones representing courses
        if re_course.fullmatch(dline):
            # Add 1 to the lines counter
            no_lines += 1
            # Split the line on 2 or more spaces and write the pieces as one row
            writer.writerow(re.split('  +', dline))


# Print the total number of processed lines
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.


<br>

## 2020 Points

***

https://www.cao.ie/index.php?page=points&p=2020&bb=points

<br>

#### Save Original File

In [10]:
# Timestamped location for the downloaded 2020 spreadsheet.
path = f'data/cao2020_{nowstr}.xlsx'

In [11]:
# Download the 2020 spreadsheet and store it at `path`.
urlrq.urlretrieve(
    url='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    filename=path,
)

('data/cao2020_20211125_081926.xlsx',
 <http.client.HTTPMessage at 0x1594eb3b430>)

<br>

#### Load Spreadsheet using Pandas

In [18]:
# Read the 2020 spreadsheet straight from the CAO site into a data frame,
# skipping the first 10 rows of the sheet.
df2020 = pd.read_excel(
    io='http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    skiprows=10,
)

In [19]:
# Display the 2020 data frame.
df2020


Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,Manufacturing and processing,Manufacturing Engineering,WD208,188,,,,188,,339,...,,,,,,,,,,
1460,Information and Communication Technologies (ICTs),Software Systems Development,WD210,279,,,,279,,337,...,,,,,,,,,,
1461,Information and Communication Technologies (ICTs),Creative Computing,WD211,271,,,,271,,318,...,,,,,,,,,,
1462,Personal services,Recreation and Sport Management,WD212,270,,,,270,,349,...,,,,,,,,,,


In [14]:
# Spot check an arbitrarily chosen row (positional index 753).
df2020.iloc[753]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Road Transport Technology and Management
COURSE CODE2                                                           LC286
R1 POINTS                                                                264
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      264
EOS Random *                                                             NaN
EOS Mid-point                                                            360
LEVEL                                                                      7
HEI                                         Limerick Institute of Technology
Test/Interview #                                                         NaN

In [15]:
# Spot check the last but one row (negative positions count back from the end).
df2020.iloc[-2]

CATEGORY (i.e.ISCED description)                    Personal services
COURSE TITLE                          Recreation and Sport Management
COURSE CODE2                                                    WD212
R1 POINTS                                                         270
R1 Random *                                                       NaN
R2 POINTS                                                         NaN
R2 Random*                                                        NaN
EOS                                                               270
EOS Random *                                                      NaN
EOS Mid-point                                                     349
LEVEL                                                               8
HEI                                 Waterford Institute of Technology
Test/Interview #                                                  NaN
avp                                                               NaN
v                   

In [16]:
# Timestamped location for the csv copy of the 2020 data frame.
path = f'data/cao2020_{nowstr}.csv'

In [17]:
# Write the 2020 data frame out as csv at `path`.
df2020.to_csv(path_or_buf=path)

<br>

## 2019 Points

http://www2.cao.ie/points/lvl8_19.pdf

https://www.cao.ie/index.php?page=points&p=2019&bb=points

***


#### Steps to Reproduce

1. Download original pdf file
2. Open original pdf file in MS Word
3. Save MS Word's converted pdf in docx format
4. Re-save MS Word document for editing
5. Delete headers and footers
6. Delete preamble on page 1
7. Select all and copy
8. Paste into Notepad++
9. Remove HEI name headings and paste onto each course line
10. Delete blank lines
11. Replace double tabs with single tabs
12. Change backticks to apostrophes
13. Remove any extra tabs at end of line


In [68]:
# Load the hand-edited 2019 file; its fields are separated by tabs.
df2019 = pd.read_csv(
    filepath_or_buffer='data/cao2019_20211124_080300_edited.csv',
    sep='\t',
)

In [69]:
# Display the 2019 data frame.
df2019

Unnamed: 0,Course Code,INSTITUTION and COURSE,EOS,Mid
Athlone Institute of Technology,AL801,Software Design with Virtual Reality and Gaming,304,328.0
Athlone Institute of Technology,AL802,Software Design with Cloud Computing,301,306.0
Athlone Institute of Technology,AL803,Software Design with Mobile Apps and Connected...,309,337.0
Athlone Institute of Technology,AL805,Network Management and Cloud Infrastructure,329,442.0
Athlone Institute of Technology,AL810,Quantity Surveying,307,349.0
...,...,...,...,...
Waterford Institute of Technology,WD200,Arts (options),221,296.0
Waterford Institute of Technology,WD210,Software Systems Development,271,329.0
Waterford Institute of Technology,WD211,Creative Computing,275,322.0
Waterford Institute of Technology,WD212,Recreation and Sport Management,274,311.0


# My attempt Level 6/7 2021 data

# Fetch the CAO points URL.
# The timeout stops this cell from hanging indefinitely if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l76.php', timeout=30)
# Fail loudly on an HTTP error status instead of carrying an error page
# forward as if it were the points data.
resp.raise_for_status()
# Have a look
resp

# Capture the current local date and time once for this part of the run.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS; the string is embedded in the output file names.
nowstr = f'{now:%Y%m%d_%H%M%S}'

# Create a file path for the original data
path = 'data/cao2021_LvL7_LvL6_' + nowstr + '.html'

# Decode the page as cp1252, exactly as is done for the Level 8 page and as the
# line-by-line processing of this same response does; without this override
# resp.text would be decoded with whatever encoding the response arrived with.
resp.encoding = 'cp1252'

# Save the original html file.
# The encoding is stated explicitly so the result does not depend on the
# platform default; newline='' writes the page's own line endings unchanged.
with open(path, 'w', encoding='utf-8', newline='') as f:
    f.write(resp.text)

# Compile the regular expression for matching lines.
# Earlier attempts are kept below, commented out, for reference.
#re_course = re.compile(r'([A-Z]{2}[0-9]{3}) (.*)([0-9]{3})(\*?) *')
#re_course = re.compile(r'([A-Z]{2}[0-9]{3}) (.*)([\w]{3})(\*?) *') 
#re_course = re.compile(r'([A-Z]{2}[0-9]{3}) (.*)([\w]{3})(\w*?) *') - this does not include rows with zero entries in the 20/21 columns, and some entries are split into 5 columns 

# This has changed from Ian's so that there are 3 spaces, so that course titles with more than 2 spaces are kept in a single cell,
# but courses with no CAO points number are still not included.
# NOTE(review): the pattern on the next line contains two spaces after the first group, not three - confirm which was meant.
#re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([\w]{3})(\w*?) *')  

# This gets more lines: 416 compared to 410.
#re_course = re.compile(r'([A-Z]{2}[0-9,\s]{3})  (.*)([\w]{3})*')  

# Pattern currently in use:
#   group 1 - two capital letters, then three characters that may each be a digit, a comma or whitespace
#             NOTE(review): the comma inside [0-9,\s] is matched literally - confirm it is intended
#   (two spaces)
#   group 2 - any run of characters
#   group 3 - three word characters
#   group 4 - an optional non-digit character, repeated any number of times
#             NOTE(review): (\D?)* applies * to an already optional group - confirm this is the intended form
re_course = re.compile(r'([A-Z]{2}[0-9,\s]{3})  (.*)([\w]{3})(\D?)*')  


 

# File path for the csv file
path = 'data/cao2021_LvL7_LvL6_' + nowstr + '.csv'

# Keep track of how many courses we process
no_lines = 0

# Open the csv file for writing.
# - encoding is explicit so the output does not depend on the platform default;
# - newline='' leaves line endings entirely to the csv writer.
with open(path, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field that itself contains a comma, so such a
    # field stays in a single column instead of spilling into extra ones.
    writer = csv.writer(f, lineterminator='\n')
    # Loop through lines of the response
    for line in resp.iter_lines():
        # iter_lines() yields raw bytes here; decode them as cp1252 rather
        # than with the charset the server declares.
        dline = line.decode('cp1252')
        # Match only the lines we want - the ones representing courses
        if re_course.fullmatch(dline):
            # Add 1 to the lines counter
            no_lines += 1
            # Ian's version split on 2 or more spaces:
            #linesplit = re.split('  +', dline)
            # This version splits wherever the character 'E' or '*' occurs.
            # NOTE(review): that also cuts through any text containing a
            # capital E; an earlier comment here spoke of splitting on 3 or
            # more spaces - confirm which separator is actually wanted.
            linesplit = re.split('[E*]', dline)
            # Write the substrings as one comma-separated row
            writer.writerow(linesplit)


# Print the total number of processed lines
print(f"Total number of lines is {no_lines}.")

# Small experiment: how re.split behaves with a character class.
# (re is already imported at the top of the notebook.)
s_nums = 'one1two22three333four'

# '[o]' matches every single letter 'o'; each one is removed and the string is
# cut at that point. A leading 'o' therefore produces an empty first element.
parts = re.split('[o]', s_nums)
print(parts)
# ['', 'ne1tw', '22three333f', 'ur']
