In [None]:
#  CAO Points Analysis

# Imports

In [12]:
# --- Standard library ---
# CSV writing
import csv
# Dates and times
import datetime as dt
# Filesystem helpers (creating the data directory)
import os
# Regular expressions
import re
# Import time
import time
# For downloading.
import urllib.parse as urlpar
import urllib.request as urlrq
# import warnings
import warnings

# --- Third-party ---
# Import matplotlib
import matplotlib.pyplot as plt
# Import numpy module
import numpy as np
# Import pandas for data frames
import pandas as pd
# Convenient HTTP requests
import requests as rq
# import seaborn
import seaborn as sns
# Import tabula to read table in pdf
import tabula as tb

warnings.filterwarnings("ignore")

# Error Checks

In [5]:
########################################################################
# Timestamp used in the file names built by later cells
########################################################################
# Capture the current local date and time once.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS so it can be embedded in a file name.
nowstr = f"{now:%Y%m%d_%H%M%S}"

print(nowstr)

20211220_144859


In [6]:
########################################################################
# Function to test URLs
# https://pytutorial.com/check-url-is-reachable
########################################################################
def url_checker(url):
    """Print whether ``url`` answers an HTTP GET with a non-error status.

    Args:
        url: Address to test.

    Prints "<url>: is reachable" when the response status is below 400 and
    "<url>: is not reachable" for any 4xx/5xx status (not only 404).

    Raises:
        SystemExit: With exit status 1 when the request itself fails
            (connection error, timeout, invalid URL, ...). The failure
            message is printed before exiting.
    """
    try:
        # A timeout stops an unresponsive server from hanging the notebook.
        resp = rq.get(url, timeout=30)
    except rq.exceptions.RequestException as e:
        # Print the URL together with the error, then stop with a non-zero
        # status. (Passing the result of print() to SystemExit would pass
        # None, which is reported as a successful exit.)
        print(f"{url}: is Not reachable \nErr: {e}")
        raise SystemExit(1) from e

    # resp.ok is False for every 4xx/5xx status code.
    if resp.ok:
        print(f"{url}: is reachable")
    else:
        print(f"{url}: is not reachable")

In [8]:
########################################################################
# Function to save CAO Webpages 
########################################################################

def htmlcopy(url):
    """Download a CAO points web page and save an unmodified copy to disk.

    Args:
        url: Page address. Its last four characters are used in the file
            name (the CAO points URLs used in this notebook end in the year).

    The copy is written to ``data/<nowstr>_CAO_Webpage_<last 4 chars>.html``.
    Nothing is written unless the response status is exactly 200.
    """
    # A timeout stops an unresponsive server from hanging the notebook.
    resp = rq.get(url, timeout=30)

    # Anything other than '<Response [200]>' is treated as a failure.
    if resp.status_code != 200:
        print(f"{url}: is not reachable")
        return

    print(f"{url}: is reachable")

    path = 'data/' + nowstr + '_CAO_Webpage_' + url[-4:] + '.html'
    print(str(path))

    # Make sure the target directory exists on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Save the raw bytes so the file is the original page: no re-encoding
    # with the platform default codec (which can raise UnicodeEncodeError).
    with open(path, 'wb') as f:
        f.write(resp.content)
    

In [13]:
########################################################################
# Function to save files
########################################################################

def caosavefile(url):
    """Download the file at ``url`` into the ``data/`` directory.

    Args:
        url: Address of the file. The last segment of its path becomes the
            tail of the local file name.

    The file is saved as ``data/<nowstr>_CAO_file_<last path segment>``.
    """
    split = urlpar.urlsplit(url)
    path = 'data/' + nowstr + '_CAO_file_' + split.path.split("/")[-1]
    print(str(path))

    # Make sure the target directory exists; urlretrieve does not create it.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    urlrq.urlretrieve(url, path)



In [14]:
########################################################################
# CAO web pages
# https://www.cao.ie/index.php?page=points&p=2018
# https://www.cao.ie/index.php?page=points&p=2019
# https://www.cao.ie/index.php?page=points&p=2020
# https://www.cao.ie/index.php?page=points&p=2021
########################################################################

# CAO points index pages, oldest year first.
caopointshtml = [
    'https://www.cao.ie/index.php?page=points&p=2018',
    'https://www.cao.ie/index.php?page=points&p=2019',
    'https://www.cao.ie/index.php?page=points&p=2020',
    'https://www.cao.ie/index.php?page=points&p=2021',
]

# Keep the individual per-year names available as well.
html2018, html2019, html2020, html2021 = caopointshtml

# Save a local copy of every page.
for url in caopointshtml:
    htmlcopy(url)

https://www.cao.ie/index.php?page=points&p=2018: is reachable
data/20211220_144859_CAO_Webpage_2018.html
data/20211220_144859_CAO_file_index.php
https://www.cao.ie/index.php?page=points&p=2019: is reachable
data/20211220_144859_CAO_Webpage_2019.html
data/20211220_144859_CAO_file_index.php
https://www.cao.ie/index.php?page=points&p=2020: is reachable
data/20211220_144859_CAO_Webpage_2020.html
data/20211220_144859_CAO_file_index.php
https://www.cao.ie/index.php?page=points&p=2021: is reachable
data/20211220_144859_CAO_Webpage_2021.html
data/20211220_144859_CAO_file_index.php


In [15]:
########################################################################
# CAO points files
# http://www2.cao.ie/points/lvl8_18.pdf
# http://www2.cao.ie/points/lvl76_18.pdf
# http://www2.cao.ie/points/lvl8_19.pdf
# http://www2.cao.ie/points/lvl76_19.pdf
# http://www2.cao.ie/points/CAOPointsCharts2020.xlsx
# http://www2.cao.ie/points/CAOPointsCharts2021.xlsx
########################################################################
########################################################################
# List of URLs, newest year first
########################################################################
caopointslist = [
    'http://www2.cao.ie/points/CAOPointsCharts2021.xlsx',
    'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx',
    'http://www2.cao.ie/points/lvl8_19.pdf',
    'http://www2.cao.ie/points/lvl76_19.pdf',
    'http://www2.cao.ie/points/lvl8_18.pdf',
    'http://www2.cao.ie/points/lvl76_18.pdf',
]

# Keep the individual per-file names available as well.
CAO2021, CAO2020, CAO2019_8, CAO2019_76, CAO2018_8, CAO2018_76 = caopointslist

########################################################################
# Test that each URL is available, then download it
########################################################################
for url in caopointslist:
    url_checker(url)
    caosavefile(url)
    

http://www2.cao.ie/points/CAOPointsCharts2021.xlsx: is reachable
data/20211220_144859_CAO_file_CAOPointsCharts2021.xlsx
http://www2.cao.ie/points/CAOPointsCharts2020.xlsx: is reachable
data/20211220_144859_CAO_file_CAOPointsCharts2020.xlsx
http://www2.cao.ie/points/lvl8_19.pdf: is reachable
data/20211220_144859_CAO_file_lvl8_19.pdf
http://www2.cao.ie/points/lvl76_19.pdf: is reachable
data/20211220_144859_CAO_file_lvl76_19.pdf
http://www2.cao.ie/points/lvl8_18.pdf: is reachable
data/20211220_144859_CAO_file_lvl8_18.pdf
http://www2.cao.ie/points/lvl76_18.pdf: is reachable
data/20211220_144859_CAO_file_lvl76_18.pdf


# 2021 CAO Points

http://www2.cao.ie/points/CAOPointsCharts2021.xlsx

<br>

# Use regular expressions to select lines required
***

# Online Web tools used

Test regular expressions at
https://pythex.org/

Compare the output of files using the Notepad++ Compare plugin
http://www.technicaloverload.com/compare-two-files-using-notepad/

In [None]:
# Compile the regular expression for matching lines with courses.
# Documentation for re (regular expression):
# https://docs.python.org/3/library/re.html

# The original copy of cao.csv shows circa 949 lines; the counts noted
# below are the number of lines each candidate pattern matched.

# Test of original re = 949
#re_course  = re.compile('([A-Z]{2}[0-9]{3})  .*([0-9]{3})')
#re_course  = re.compile('(\w{2}\d{3})\s{2}.*([0-9]{3})')
# Active pattern: group 1 is two capital letters followed by three digits,
# group 2 is everything that follows on the line.
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')

# Test of new re = 922
# re_course = re.compile(r'([A-Z]{2}[0-9]{3})  .*(\d{3}|[AQA]) *')

# Test of re = 949
# re_course  = re.compile('([A-Z]{2}[0-9]{3}).*(\#?|[0-9]{3}|[AQA]|\*?)')

# Test of re = 836
#re_course  = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)(\#?|[0-9]{3}|[AQA]\*?) *')

# Other test regexes
# re_course  = re.compile(r'([A-Z]{2}[0-9]{3})  (.*)\s+(\#?([0-9]{3})\*?).*')
# re_course  = re.compile(r'([A-Z]{2}[0-9]{3})\s+(.*)\s+(\#?[0-9]{3}\*?)\s+(\#?[0-9]{3}\*?)')

<br>

Loop through the lines of the response content from CAO

The original copy of cao.csv shows circa 949 lines

***

In [None]:
# The file path for the csv file.
path = 'data/cao2021_csv_' + nowstr + '.csv'

# NOTE(review): `resp` is not assigned at notebook level in any visible cell
# (it only exists as a local variable inside url_checker/htmlcopy), so on a
# fresh kernel this cell raises NameError. A preceding
# `resp = rq.get(<2021 points page>)` is needed -- confirm the intended URL.

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing.
with open(path, 'w') as f:
    # csv.writer quotes any field that contains a comma, so a course title
    # with a comma stays in a single column. '\n' keeps the row ending the
    # same as a plain f.write(... + '\n').
    csv_writer = csv.writer(f, lineterminator='\n')
    # Loop through lines of the response.
    for line in resp.iter_lines():
        # Decode the line as cp1252 (flagged by the original author as
        # "the wrong encoding" -- presumably it differs from the encoding
        # the server declares; TODO confirm).
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines = no_lines + 1
            # The course code: first five characters.
            course_code = dline[:5]
            # The course title: fixed-width slice of the line.
            course_title = dline[7:57]
            # The points columns: keep at most the first two values.
            course_points = re.split(' +', dline[60:])[:2]
            # A short line yields fewer than two values; pad with empty
            # strings so indexing below cannot raise IndexError.
            course_points += [''] * (2 - len(course_points))
            # Collect the four fields of the row.
            linesplit = [course_code, course_title, course_points[0], course_points[1]]
            # Write the row as comma-separated values.
            csv_writer.writerow(linesplit)

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

# 2020 Points

https://www.cao.ie/index.php?page=points&p=2020

In [None]:
# File path for the original 2020 spreadsheet, tagged with the run timestamp.
path = 'data/cao2020_' + nowstr + '.xlsx'

In [None]:
# Download the 2020 points spreadsheet from CAO and save it to `path`.
urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', path)


## Load the spreadsheet using pandas

***

In [None]:
# Download and parse the excel spreadsheet into a pandas DataFrame.
# Note: this fetches the URL again rather than reading the copy saved to `path`.
# skiprows=10 skips the first 10 rows of the sheet -- presumably preamble
# above the header row; TODO confirm against the file.
df = pd.read_excel('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', skiprows=10)

In [None]:
# Spot check on the excel data: show the row at position 800.
df.iloc[800]

In [None]:
# Spot check the last row of the dataframe.
df.iloc[-1]

In [None]:
# Show the first five rows of the dataframe (positions 0 to 4).
df.iloc[0:5]

In [None]:
# Select by position: 1st, 4th, 7th and 25th rows, and 1st, 8th and 10th columns.
df.iloc[[0,3,6,24], [0,7,9]] 

In [None]:
# Create a file path for the csv copy of the pandas data.
# Note: this rebinds `path`, which previously pointed at the .xlsx file.
path = 'data/cao2020_' + nowstr + '.csv'

In [None]:
# Save the pandas data frame to disk as csv at `path`.
df.to_csv(path)

# 2019 Points

https://www.cao.ie/index.php?page=points&p=2019

***

In [None]:
# File path for the original 2019 level 8 PDF, tagged with the run timestamp.
path = 'data/cao2019_' + nowstr + '.pdf'

# Download the PDF from CAO and save it to `path`.
urlrq.urlretrieve('http://www2.cao.ie/points/lvl8_19.pdf', path)


# Earlier approach, kept for reference: load a hand-edited, tab-separated csv.
#df2019 = pd.read_csv('data/cao2019_20211029_113930_edited.csv', sep='\t')

## Load the pdf using pandas

***

In [None]:
# Download and parse every page of the 2019 level 8 PDF with the tabula module.
#
# tabula documentation:
# https://pypi.org/project/tabula-py/
#
# Note: this rebinds `df`, which earlier held the 2020 spreadsheet data, and
# fetches the URL again rather than reading the copy saved to `path`.
df = tb.read_pdf('http://www2.cao.ie/points/lvl8_19.pdf', pages='all')

df

In [None]:
# Create the path for the csv version of the 2019 data.
csvpath = 'data/cao2019_' + nowstr + '.csv'

# Convert all pages of the pdf at `path` to csv at `csvpath`.
# `path` is expected to still be the 2019 PDF path set in the download cell;
# running cells out of order would point it at a different file.
tb.convert_into( path , csvpath, output_format="csv", pages='all')

## Read csv into Pandas Dataframe

Clean the data and remove colleges

- There are [965 rows x 4 columns] in the original dataframe.
- There are 35 colleges.
- All college rows contain NaN.
- Remove all college rows from the dataframe.

In [None]:
# Read the converted 2019 csv into a pandas dataframe.
df2019 = pd.read_csv(csvpath, sep=',')

# Plain-text dump of the frame.
print(df2019)

In [None]:
# All rows holding a college name have NaN in the "Course Code" column.
# Find all rows that have NaN as a value.
# https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/

# Boolean series: True where "Course Code" is NaN.
bool_series = pd.isnull(df2019["Course Code"]) 

# Filter the data:
# display only the rows where Course Code is NaN.
df2019[bool_series]


In [None]:
# Number of rows whose "Course Code" is missing (NaN).
# https://datatofish.com/count-nan-pandas-dataframe/
count_nan = df2019['Course Code'].isna().sum()

print(f'Count of NaN: {count_nan}')

In [None]:
# Drop the rows whose "Course Code" is NaN.
# https://www.kite.com/python/answers/how-to-drop-empty-rows-from-a-pandas-dataframe-in-python

# inplace=True modifies df2019 itself, so frames displayed by earlier cells
# no longer reflect its current contents.
df2019.dropna(subset = ['Course Code'], inplace=True)

print(df2019)

# End

In [None]:
# Parse every page of another CAO points PDF with tabula.
# NOTE(review): every other PDF referenced in this notebook is lvl8_18/lvl8_19
# or lvl76_18/lvl76_19 -- confirm `lvl8_10.pdf` is intended and not a typo.
# This also rebinds `df`.
df = tb.read_pdf('http://www2.cao.ie/points/lvl8_10.pdf', pages='all')

df