# CAO Points Analysis
## Acquiring the data
***

### 2021 [link](http://www.cao.ie/index.php?page=points&p=2021)

In [1]:
# import required libraries
import requests as rq             # Convenient HTTP requests.
import re                         # Regular expressions.
import datetime as dt    
import pandas as pd
import numpy as np

In [2]:
# Make a get request to 2021 CAO points URL
resp = rq.get('http://www2.cao.ie/points/l8.php')

# Check that value = 200 (in which case a connection is made).
resp

<Response [200]>

<br>

**Encoding note:<br>Error on server** <br>
The Server indicated one should decode as per: Content-Type: text/html; charset=iso-8859-1. However, one line of the html uses<br> 
\x96 which isn't defined in iso-8859-1. Therefore the similar decoding standard cp1252 will be  used. This standard is very similar but<br>
includes \x96.

In [3]:
# Specify encoding method 
resp.encoding = 'cp1252'

# Create a file path for the original data.
html_path = 'data/cao_2021_raw_data.html'

# Save original html file
with open(html_path, 'w') as f:
    f.write(resp.text)

In [4]:

def scrape_html_from_website(path, regex):
    
    num_lines = 0                                       # Keep track of courses counted in loop
    with open(path, 'w') as f:                          # Open the csv file for writing.          
        for line in resp.iter_lines():                      # Loop through lines of the response 
            dline = line.decode('cp1252')                       # decode the line 
            if regex.fullmatch(dline):                          # Match only the lines representing courses.       
                num_lines = num_lines + 1                           # Add one to the lines counter if match occured      
                course_code = dline[:5]                             # isolate course code 
                course_title = dline[7:57].strip()                  # isolate course title         
                course_points = re.split(' +', dline[60:])          # split string when on one or more spaces               
                
                if len(course_points) != 2:                   # if list length not equal to 2         
                    course_points = course_points[:2]         # then retain first two elements of list
                   
                linesplit = [course_code, course_title, course_points[0], course_points[1]]
                f.write(','.join(linesplit) + '\n')
    
    print('Total number of lines is lines is {}.\nThis was manually verified against\
"cao_2021_raw_data.html.'.format(num_lines))  
    
# Compile the regular expression for matching lines
    # ([A-Z]{2}[0-9]{3}) matches A-Z x2 characters, 0-9  x3 characters
    # (.*) Matches any single character except newline zero or more times 
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')  

# write parsed data to csv file
write_path = 'data/cao_2021_final_data.csv'         
scrape_html_from_website(write_path, re_course)

Total number of lines is lines is 949.
This was manually verified against"cao_2021_raw_data.html.


### 2020 [link](http://www2.cao.ie/points/CAOPointsCharts2020.xlsx)
***

In [5]:
url2020 = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

# request url and write data to disc
resp = rq.get(url2020)                  

# open file for writing in binary mode
output = open('data/cao_2020_raw_data.xlsx', 'wb')    
output.write(resp.content)
output.close()

In [6]:
def data_handler_2020(url):       
    df = pd.read_excel(url, skiprows = 10)               # read excel & skip first 10 rows (irrelevant)
    df = df.iloc[: , :-8]                                # drop last 8 columns              
    return df

df_2020 = data_handler_2020(url2020)

# Save dataframe to disk.
write_path = 'data/cao_2020_final_data.csv'
df_2020.to_csv(write_path, encoding = "utf-8")

In [7]:
# Spot check a random row.
print(df_2020.iloc[189])

# Spot check last row.
print(df_2020.iloc[-1])

CATEGORY (i.e.ISCED description)                                                Arts
COURSE TITLE                        Popular Music: Drums at CIT Cork School of Music
COURSE CODE2                                                                   CR126
R1 POINTS                                                                        801
R1 Random *                                                                      NaN
R2 POINTS                                                                        NaN
R2 Random*                                                                       NaN
EOS                                                                              801
EOS Random *                                                                     NaN
EOS Mid-point                                                                    940
LEVEL                                                                              8
HEI                                                     Cork Inst

### 2019

In [8]:
import urllib3
import pdfplumber
import io


def scrape_pdf_from_website(url):
    http = urllib3.PoolManager()                                   # instansiate pool manager object for sending requests
    temp_binary = io.BytesIO()                                     # create instance for storing binary data
    temp_binary.write(http.request("GET", url).data)               # request url data and write to temp_binary storage
    
    data_final = ''  
    with pdfplumber.open(temp_binary) as pdf:                      # open storage object      
        for page in pdf.pages:                                         # iterate pages of pdf object
            data = page.extract_text()                                 # extract text of page
            data_final += data                                         # concatenates text to data_final variable
    return data_final

url  = 'http://www2.cao.ie/points/lvl8_19.pdf'
pdf_text = scrape_pdf_from_website(url)

In [9]:
def data_handler_2019(regex, line_splitter, text):
    
    df_data = []                                                   # placeholder to add list of rows to
    #print(text)
    #for char in text:                                         # iterate file chars  
    for match in re.finditer(regex, text):                    # iterate matches each line
        course_code = match.group()[0:6]                           # isolate course code
        rest_of_line =  match.group()[6:]                          # isolate all of line after course code 
        rest_of_line = re.split(line_splitter ,rest_of_line)           # split lines using passed criteria
        df_data.append([course_code] + rest_of_line)
                        
    df = pd.DataFrame(df_data)                                # convert array to dataframe
    return df


# regex statement (will isolate all lines starting with course code)
regex = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')
line_split_conditions = '(#\d{3}\* |#\d{2}\* |\d{3}\* |\d{2}\* |\d{3} |\d{2} |#\d{3} |#\d{2} |# \+matric|#)'

df_2019 = data_handler_2019(regex, line_split_conditions, pdf_text)
df_2019.to_csv('data/cao_2019_final.csv')

print('NB: it was verified as of 26/11/2021 that there were {} courses exactly in the CAO 2019 points list.'
      .format(len(df_2019.index)))


NB: it was verified as of 26/11/2021 that there were 930 courses exactly in the CAO 2019 points list.


In [10]:
# Spot check a random row.
print(df_2019.iloc[50])

# Spot chcek last row.
print(df_2019.iloc[-1])

0        CW708 
1    Law - LLB 
2          298 
3           328
Name: 50, dtype: object
0                                       WD230 
1    Mechanical and Manufacturing Engineering 
2                                         273 
3                                          348
Name: 929, dtype: object


<br>

## Joining the data
***

### Get unique course codes

In [11]:
# read in 2021 data from csv and convert to dataframe
df_2021 =  pd.read_csv('data/cao_2021_final_data.csv', encoding ='cp1252', header = None)

# check dataframe column names
print('2019 df columns are: {}'.format(df_2019.columns))
print('2020 df columns are: {}'.format(df_2020.columns))
print('2021 df columns are: {}'.format(df_2021.columns))

2019 df columns are: RangeIndex(start=0, stop=4, step=1)
2020 df columns are: Index(['CATEGORY (i.e.ISCED description)', 'COURSE TITLE', 'COURSE CODE2',
       'R1 POINTS', 'R1 Random *', 'R2 POINTS', 'R2 Random*', 'EOS',
       'EOS Random *', 'EOS Mid-point', 'LEVEL', 'HEI', 'Test/Interview #',
       'avp', 'v'],
      dtype='object')
2021 df columns are: Int64Index([0, 1, 2, 3], dtype='int64')


In [12]:
# isolate code and title cols (first two cols) accross all datframes
courses_2019 = df_2019[[0, 1]]
courses_2020 = df_2020[['COURSE CODE2','COURSE TITLE']]               
courses_2021 = df_2021[[0, 1]]

In [13]:
# define col_names for dataframes
courses_2019.columns = ['code', 'course_title']
courses_2020.columns = ['code', 'course_title']
courses_2021.columns = ['code', 'course_title']

In [14]:
# append the three dataframes and sort code alphabetically
all_courses = pd.concat([courses_2021, courses_2020, courses_2019], ignore_index = True)
all_courses.sort_values('code')

Unnamed: 0,code,course_title
175,AC120,International Business
949,AC120,International Business
2581,AC120,International Business
950,AC137,Liberal Arts
176,AC137,Liberal Arts
...,...,...
2412,WD230,Mechanical and Manufacturing Engineering
946,WD230,Mechanical and Manufacturing Engineering
3342,WD230,Mechanical and Manufacturing Engineering
947,WD231,Early Childhood Care and Education


In [15]:
# return all rows that are duplicates
all_courses[all_courses.duplicated()]

Unnamed: 0,code,course_title
949,AC120,International Business
950,AC137,Liberal Arts
952,AD102,Graphic Design and Moving Image Design (portfo...
955,AD204,Fine Art (portfolio)
956,AD211,Fashion Design (portfolio)
...,...,...
2404,WD200,Arts (options)
2409,WD210,Software Systems Development
2410,WD211,Creative Computing
2411,WD212,Recreation and Sport Management


In [16]:
# return a dataframe whereby duplicate rows are removed leaving a unique Code list
# i.e row count = concat (3343) - duplicated (739) = unique (2604)
all_courses.drop_duplicates()

Unnamed: 0,code,course_title
0,AL801,Software Design for Virtual Reality and Gaming
1,AL802,Software Design in Artificial Intelligence for...
2,AL803,Software Design for Mobile Apps and Connected ...
3,AL805,Computer Engineering for Network Infrastructure
4,AL810,Quantity Surveying
...,...,...
3338,WD200,Arts (options)
3339,WD210,Software Systems Development
3340,WD211,Creative Computing
3341,WD212,Recreation and Sport Management
