# CAO Points Analysis
## Acquiring the data
***

### 2021 [link](http://www.cao.ie/index.php?page=points&p=2021)

In [1]:
# import required libraries
import requests as rq             # Convenient HTTP requests.
import re                         # Regular expressions.
import datetime as dt    
import pandas as pd
import numpy as np

In [2]:
# Request the 2021 CAO points page.
# The timeout stops the notebook from hanging indefinitely if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Fail loudly on a 4xx/5xx answer instead of carrying an error page into the parsing steps.
resp.raise_for_status()

# Display the response: <Response [200]> confirms that the request succeeded.
resp

<Response [200]>

<br>

**Encoding note:<br>Error on server** <br>
The server's `Content-Type` header (`text/html; charset=iso-8859-1`) indicates that the page should be decoded as iso-8859-1. However, one line of the HTML uses<br> 
the byte \x96, which isn't defined in iso-8859-1. Therefore the closely related encoding cp1252 will be used instead. It is very similar to iso-8859-1 but<br>
does define \x96.

In [3]:
# Decode the response body as cp1252 rather than the charset announced by the server
# (see the encoding note above).
resp.encoding = 'cp1252'

# Create a file path for the original data.
html_path = 'data/cao_2021_raw_data.html'

# Save the decoded html. The encoding is pinned so the file on disk does not depend on
# the platform's default encoding (and so no decoded character can fail to be written).
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

In [4]:

def scrape_html_from_website(path, regex, response=None):
    """Extract the course lines of the CAO points page and save them as comma-separated rows.

    Every line of the response is decoded as cp1252. Lines that fully match
    ``regex`` are counted, split into fields and written to ``path``, one
    course per row. The number of matched lines is printed at the end.

    Parameters
    ----------
    path : str
        Destination file; it is overwritten if it already exists.
    regex : compiled regular expression
        Pattern a whole decoded line must match to be treated as a course line.
    response : optional
        Object whose ``iter_lines()`` yields the page as ``bytes`` lines
        (e.g. a ``requests.Response``). When omitted, the notebook-level
        ``resp`` variable is used, which keeps existing two-argument calls working.

    Returns
    -------
    None
    """
    if response is None:
        # Backward-compatible fallback to the notebook global. Passing the response
        # explicitly is safer, because ``resp`` is rebound by later cells.
        response = resp

    num_lines = 0                                       # Number of course lines found.
    # Explicit encoding: the file content must not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        for line in response.iter_lines():
            dline = line.decode('cp1252')
            if regex.fullmatch(dline):                  # Keep only the lines representing courses.
                num_lines += 1
                # Widen the first gap by one space; presumably the course code is followed
                # by only two spaces in the page, which the 3+ space split below would miss.
                dline_adj = re.sub(" ", "  ", dline, count=1)
                linesplit = re.split('   +', dline_adj)     # Fields are separated by 3+ spaces.
                f.write(','.join(linesplit) + '\n')

    print('Total number of lines is {}.\n'
          'This was manually verified against "cao_2021_raw_data.html".'.format(num_lines))
    
# Destination of the parsed 2021 data.
write_path = 'data/cao_2021_final_data.csv'

# Pattern identifying a course line:
#   ([A-Z]{2}[0-9]{3})  two capital letters followed by three digits
#   (.*)                whatever follows on the line (any characters except newline)
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')

# Parse the response and write the matching lines to the csv file.
scrape_html_from_website(write_path, re_course)

Total number of lines is lines is 949.
This was manually verified against"cao_2021_raw_data.html.


### 2020 [link](http://www2.cao.ie/points/CAOPointsCharts2020.xlsx)
***

In [5]:
url2020 = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx'

# Download the 2020 spreadsheet. NOTE: this rebinds `resp`, replacing the 2021 response.
# The timeout prevents an indefinite hang; raise_for_status stops an error page from
# being saved as if it were the spreadsheet.
resp = rq.get(url2020, timeout=30)
resp.raise_for_status()

# Write the raw bytes to disk; the context manager closes the file even if the write fails.
with open('data/cao_2020_raw_data.xlsx', 'wb') as output:
    output.write(resp.content)

In [6]:
def data_handler_2020(url):
    """Load the 2020 CAO points spreadsheet into a dataframe.

    The first 10 rows of the sheet are skipped (irrelevant to the data) and
    the last 8 columns are dropped.
    """
    columns_to_drop = 8
    sheet = pd.read_excel(url, skiprows=10)
    return sheet.iloc[:, :-columns_to_drop]

# Output location for the 2020 data.
write_path = 'data/cao_2020_final_data.csv'

# Build the 2020 dataframe from the CAO url, then persist it as csv.
df_2020 = data_handler_2020(url2020)
df_2020.to_csv(write_path)

In [7]:
# Spot checks: an arbitrary row (position 189) followed by the last row.
print(df_2020.iloc[189], df_2020.iloc[-1], sep='\n')

CATEGORY (i.e.ISCED description)                                                Arts
COURSE TITLE                        Popular Music: Drums at CIT Cork School of Music
COURSE CODE2                                                                   CR126
R1 POINTS                                                                        801
R1 Random *                                                                      NaN
R2 POINTS                                                                        NaN
R2 Random*                                                                       NaN
EOS                                                                              801
EOS Random *                                                                     NaN
EOS Mid-point                                                                    940
LEVEL                                                                              8
HEI                                                     Cork Inst

### 2019

In [8]:
import urllib3
import pdfplumber
import io


def scrape_pdf_from_website(url):
    """Download a PDF and return the text of all its pages as a single string.

    Parameters
    ----------
    url : str
        Address of the PDF to download.

    Returns
    -------
    str
        Text extracted from every page, in page order. Pages are separated by
        a newline so that the last line of one page never runs into the first
        line of the next.

    Raises
    ------
    RuntimeError
        If the server answers with an HTTP error status (400 or above).
    """
    http = urllib3.PoolManager()                       # pool manager used to send the request
    response = http.request("GET", url)
    if response.status >= 400:
        # urllib3 does not raise on HTTP error statuses; without this check an error
        # page would be handed to the PDF parser.
        raise RuntimeError(
            'Request for {} failed with HTTP status {}.'.format(url, response.status)
        )

    # In-memory binary buffer holding the PDF, positioned at its start.
    pdf_bytes = io.BytesIO(response.data)

    page_texts = []
    with pdfplumber.open(pdf_bytes) as pdf:
        for page in pdf.pages:
            # `or ''` guards against a page yielding no text (some pdfplumber versions
            # appear to return None in that case), which would break the join below.
            page_texts.append(page.extract_text() or '')
    return '\n'.join(page_texts)

# Address of the 2019 points PDF; pull the text of the whole document into memory.
url = 'http://www2.cao.ie/points/lvl8_19.pdf'
pdf_text = scrape_pdf_from_website(url)

In [9]:
def data_handler_2019(regex, line_splitter, text):
    """Turn the extracted PDF text into a dataframe with one row per regex match.

    Parameters
    ----------
    regex : pattern
        Pattern locating the course lines inside ``text``.
    line_splitter : pattern
        Pattern used to split the part of a match that follows its first six
        characters. Capturing groups in it are kept as fields by ``re.split``.
    text : str
        Text to search.

    Returns
    -------
    pandas.DataFrame
        First column: the first six characters of each match (the course code
        slice). Remaining columns: the pieces of the rest of the match.
    """
    code_width = 6                                     # length of the leading course code slice

    def _to_row(matched_line):
        # Leading slice first, followed by the split remainder of the line.
        head, tail = matched_line[:code_width], matched_line[code_width:]
        return [head, *re.split(line_splitter, tail)]

    rows = [_to_row(hit.group()) for hit in re.finditer(regex, text)]
    return pd.DataFrame(rows)


# Matches from a course code (two capital letters, three digits) to the end of its line.
regex = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')

# Delimiters used to split the remainder of a course line into fields.
# Raw string: `\d`, `\*` and `\+` are regex escapes, not Python string escapes, and a
# non-raw literal triggers invalid-escape-sequence warnings. The pattern is unchanged.
line_split_conditions = r'(#\d{3}\* |#\d{2}\* |\d{3}\* |\d{2}\* |\d{3} |\d{2} |#\d{3} |#\d{2} |# \+matric|#)'

# Parse the PDF text and save the result.
df_2019 = data_handler_2019(regex, line_split_conditions, pdf_text)
df_2019.to_csv('data/cao_2019_final.csv')

print('NB: it was verified as of 26/11/2021 that there were {} courses exactly in the CAO 2019 points list.'
      .format(len(df_2019.index)))


NB: it was verified as of 26/11/2021 that there were 930 courses exactly in the CAO 2019 points list.


In [10]:
# Spot check a random row.
print(df_2019.iloc[50])

# Spot chcek last row.
print(df_2019.iloc[-1])

0        CW708 
1    Law - LLB 
2          298 
3           328
Name: 50, dtype: object
0                                       WD230 
1    Mechanical and Manufacturing Engineering 
2                                         273 
3                                          348
Name: 929, dtype: object
