# CAO Points Analysis
## Author: Ross Downey

http://www2.cao.ie/points/l8.php
***


In [12]:
# HTTP Requests Python Module
import requests as rq

# Regular expressions module
import re

# Dates and time module
import datetime as dt

# Pandas
import pandas as pd

# Downloading
import urllib.request as urlrq

<br>

## 2021 Points

***

In [2]:
# Retrieving the 2021 CAO from this URL using the requests module
resp = rq.get('http://www2.cao.ie/points/l8.php')

# Veryifing Python has downloaded the data from the URL, [200] means it's ok
resp

<Response [200]>

<br>  

## Ensuring original dataset is saved
***

In [3]:
# Saving current date and time as a function
now = dt.datetime.now()

# Convert date and time to a string
nowstr = now.strftime('%Y%m%d_%H%M%S')

In [4]:
# Filepath to be created in data folder to save original data
filepath = 'data/cao2021_' + nowstr + '.html'

<br>

## Error on Server
***
Issue with the character set defined on the CAO server.  
The server says we should decode as per:  
    Content-Type: text/html; charset=iso-8859-1  
On decoding as per 'iso-8859-1' it was noted that one line is using \x96,  
which is not defined in iso-8859-1.  
The similar decoding standard 'cp1252' will be used, which has a codepoint for \x96.

In [5]:
# Amending the original encoding
original_encoding = resp.encoding

# Changing to 'cp1252'
resp.encoding = 'cp1252'

In [6]:
# Saving the original file downloaded from server
with open(filepath, 'w') as f:
    f.write(resp.text)

In [7]:
# Compiling the regular expression for the matching lines.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)([0-9]{3})(\*?) *')
# This gives us only the lines in the original HTML code that have CAO points

<br>

## Explanation of Regular Expression
***
re.compile 
r = converting html flags to raw strings
[A-Z]{2} = Everything with 2 letters
[0-9]{3} = Followed by 3 numbers, this is each course code e.g. AL801

Two spaces followed by (.*) is space then wildcard, this is the course title string

[0-9]{3} This is the number of points required

(\*?) is a wildcard search, but some of these are actual asterisks in the html so using question mark

and asterisk at the end also as a wildcard



In [8]:
# Creating filepath to save the new csv file
filepath = 'data/cao2021_csv_' + nowstr + '.csv'


# Adding a counter to total the number of courses processed
no_lines = 0

# Opening filepath to save new csv file
with open(filepath, 'w') as f:
    
# Looping through the lines of the data 
    for line in resp.iter_lines():
        
        # Decode the line but using the 'wrong' encoding
        dline = line.decode('cp1252')
    
        # Using regular expression to match lines with courses and points only
        if re_course.fullmatch(dline):
        
            # Adding 1 for each line matched to give an overall total using the counter
            no_lines = no_lines + 1
        
            # Split the line based on number of spaces (i.e. two spaces or more) using '  +'
            linesplit = re.split('  +', dline)
        
            # Rejoin the substrings after splitting but with commas in between
            # This removes all of the spaces in the original data
            # Rejoined and using '\n' to add a new line for each course
            f.write(','.join(linesplit) + '\n')
        
# Displaying the total number of lines processed
print(f"Total number of lines is {no_lines}. ")

Total number of lines is 922. 


<br>

## Cleaning up csv file to remove special characters  
Need to remove any special characters ( '#', '@' etc.) from the data to peform analysis on the 

In [9]:
pwd

'C:\\Users\\downe\\FODA2021\\FODA2021'

In [10]:
# https://stackoverflow.com/questions/18039057/python-pandas-error-tokenizing-data
Column_Names = ['Course Code', 'Course Title', 
                'Round 1 Points', 'Round 2 Points', 'Round 3 Points']
x = pd.read_csv(r"C:/Users/downe/FODA2021/FODA2021/data/cao2021_csv_20211030_112255.csv",
                sep='delimiter', header=None, engine ='python')
df = [x, columns = ['Code', 'Title','Rd 1 Points', 'Rd 2 Points', 'Rd 3 Points' ]]

SyntaxError: invalid syntax (<ipython-input-10-b411253cfa81>, line 6)

<br>

## 2020 Points
***

https://www.cao.ie/index.php?page=points&p=2020&bb=points

<br>

### Saving Original File from CAO website

***

In [14]:
# Creating filepath for original 2020 data from CAO site
# File extension set as xlsx as original is in excel format
filepath2020 = 'data/cao2020_' + nowstr + '.xlsx'

In [15]:
urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
                  filepath2020)

('data/cao2020_20211102_192348.xlsx',
 <http.client.HTTPMessage at 0x2716fd1acd0>)

<br>

### Loading 2020 Spreadsheet into Notebook using Pandas

***

In [16]:
# Downloading and Parsing the spreadsheet
# Skipping initial 10 rows as no relevant data contained within
df2020 = pd.read_excel('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', skiprows=10)

In [21]:
# Checking first five rows for formatting
df2020.head()

Unnamed: 0,CATEGORY (i.e.ISCED description),COURSE TITLE,COURSE CODE2,R1 POINTS,R1 Random *,R2 POINTS,R2 Random*,EOS,EOS Random *,EOS Mid-point,...,avp,v,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
0,Business and administration,International Business,AC120,209,,,,209,,280,...,,,,,,,,,,
1,Humanities (except languages),Liberal Arts,AC137,252,,,,252,,270,...,,,,,,,,,,
2,Arts,"First Year Art & Design (Common Entry,portfolio)",AD101,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
3,Arts,Graphic Design and Moving Image Design (portfo...,AD102,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,
4,Arts,Textile & Surface Design and Jewellery & Objec...,AD103,#+matric,,,,#+matric,,#+matric,...,,,,,,,,,,


In [19]:
# Spot checking random row
# Index 333 is the same as row 345 in original excel file
# Removed 10 rows, 1 row for header and 1 row for pandas index starting at 0
# Spot check aligns with original excel file
df2020.iloc[333]

CATEGORY (i.e.ISCED description)                       Arts
COURSE TITLE                        Film and Creative Media
COURSE CODE2                                          DB576
R1 POINTS                                               AQA
R1 Random *                                             NaN
R2 POINTS                                               AQA
R2 Random*                                              NaN
EOS                                                     AQA
EOS Random *                                            NaN
EOS Mid-point                                           338
LEVEL                                                     7
HEI                                  Dublin Business School
Test/Interview #                                        NaN
avp                                                     avp
v                                                       NaN
Column1                                                 NaN
Column2                                 

In [20]:
# Spot Check last row
# Aligns with original excel file
df2020.iloc[-1]

CATEGORY (i.e.ISCED description)          Engineering and engineering trades
COURSE TITLE                        Mechanical and Manufacturing Engineering
COURSE CODE2                                                           WD230
R1 POINTS                                                                253
R1 Random *                                                              NaN
R2 POINTS                                                                NaN
R2 Random*                                                               NaN
EOS                                                                      253
EOS Random *                                                             NaN
EOS Mid-point                                                            369
LEVEL                                                                      8
HEI                                        Waterford Institute of Technology
Test/Interview #                                                         NaN

In [22]:
# Creating file path for 2020 data in csv format (Note file extension)
filepath2020 = 'data/cao2020_' + nowstr + '.csv'

In [24]:
# Saving dataframe
df2020.to_csv(filepath2020)

<br>

## 2019 Points
***

https://www.cao.ie/index.php?page=points&p=2019&bb=points

Steps taken to convert original pdf file to pandas dataframe
<br>
1. Original PDF file downloaded from link above
2. PDF file converted to Microsoft Word file
3. Microsoft Word file saved in docx format
4. Second copy of Word file saved as "_edited"
5. Headers, Footers and initial introduction deleted
6. Selected all and pasted into Notepad++
7. Removed all HEI names as had their own rows
8. Removed all special characters (#* etc.) using "Find and Replace"
9. Changed backticks (`) to apostrophes (') using "Find and Replace"
10. Changed column header from "Course and Institution" to "Course"
11. Extra "NaN" columns noted, removed double tab from course code AL870 (l.28)


In [26]:
df2019 = pd.read_csv('data/cao2019_20211102_194800_edited.csv', sep='\t')

In [27]:
df2019

Unnamed: 0,Course Code,COURSE,EOS,Mid,Unnamed: 4,Unnamed: 5
0,AL801,Software Design with Virtual Reality and Gaming,304,328.0,,
1,AL802,Software Design with Cloud Computing,301,306.0,,
2,AL803,Software Design with Mobile Apps and Connected...,309,337.0,,
3,AL805,Network Management and Cloud Infrastructure,329,442.0,,
4,AL810,Quantity Surveying,307,349.0,,
...,...,...,...,...,...,...
925,WD200,Arts (options),221,296.0,,
926,WD210,Software Systems Development,271,329.0,,
927,WD211,Creative Computing,275,322.0,,
928,WD212,Recreation and Sport Management,274,311.0,,


***
## End