# Loading CAO points information into a pandas data frame, using CAO website

http://www.cao.ie/index.php?page=points&p=2021

***

## INTRODUCTION

## WHAT IS CAO

### LOADING THE DATA

In [1]:
# HTTP request
import requests as rq
# Regular expressions
import re
# Dates and time
import datetime as dt
# Data frames
import pandas as pd
# For downloading
import urllib.request as urlrq
# PDF
import camelot

## LEVEL 8 POINTS 2019, 2020, 2021
### ROUND 1 AND ROUND 2

In [2]:
# Current date and time
now = dt.datetime.now()
# Format as a string
nowstr = (now.strftime("%Y%m%d_%H%M%S"))

In [3]:
# Regular expression
re_course = re.compile(r"([A-Z]{2}[0-9]{3})(.*)")

In [4]:
# Defining a dictionary
    # Using website archive to access the wanted data (2019, 2020)
years_dict = {
    "2019": [("data/cao2019_" +  nowstr), "https://web.archive.org/web/20191019135815/http://www2.cao.ie:80/points/l8.php"],
    "2020": [("data/cao2020_" +  nowstr), "https://web.archive.org/web/20201108133105/http://www2.cao.ie/points/l8.php"],
    "2021": [("data/cao2021_" +  nowstr), "http://www2.cao.ie/points/l8.php"]
}

In [5]:
# Loop through the (dict) years
for year, content in years_dict.items():
    # Fetch the CAO points URL
    rq.get (content[1])
    resp = rq.get (content[1])
    # The server uses the wrong encoding
        # Change to "cp1252"
    resp.encoding = "cp1252"
    # Check if OK:
        #Response [200] means OK
    print (resp)
    # Save the original html file
    with open(content[0] + ".html", "w") as f:
        f.write(resp.text)
    # Keep track of how many courses we process
    no_lines = 0
    # Iterating through the lines
    resp.iter_lines()
    resps = resp.iter_lines()
    # Open the csv file for writing.
    with open(content[0] + ".csv", "w") as f:
        # Write a header row.
        f.write(','.join(["CODE", "TITLE", "R1_POINTS", "R2_POINTS"]) + "\n")
        # Loop through lines of the response.
        for line in resps:
            # Decode the line, using the wrong encoding
            dline = line.decode("cp1252")
            # Match only the lines representing courses
            if re_course.fullmatch(dline):
                # Add one to the lines counter
                no_lines = no_lines + 1
                # The course code
                course_code = dline[:5]
                # The course title
                course_title = dline[7:57]
                # Round one points
                course_points = re.split(' +', dline[60:])
                if len(course_points) != 2:
                    course_points = course_points[:2]
                # Join the fields using a comma
                linesplit = [course_code, course_title, course_points[0], course_points[1]]
                # Rejoin the substrings with commas in between
                f.write(",".join(linesplit) + "\n")   
    # Print the total number of processed lines
    print(f"Total number of lines is {no_lines}.")

<Response [200]>
Total number of lines is 930.
<Response [200]>
Total number of lines is 961.
<Response [200]>
Total number of lines is 949.


# READ AND JOIN ALL TABLES - TESTING

In [6]:
df2019 = pd.read_csv(("data/cao2019_" +  nowstr + ".csv"), encoding='cp1252')
#df2019

In [7]:
df2020 = pd.read_csv(("data/cao2020_" +  nowstr + ".csv"), encoding='cp1252')
#df2020

In [8]:
df2021 = pd.read_csv(("data/cao2021_" +  nowstr + ".csv"), encoding='cp1252')
#df2021

In [9]:
courses2019 = df2019[["CODE", "TITLE"]]
#courses2019

In [10]:
courses2020 = df2020[["CODE", "TITLE"]]
#courses2020

In [11]:
courses2021 = df2021[["CODE", "TITLE"]]
#courses2021

In [12]:
allcourses = pd.concat([courses2019, courses2020, courses2021])
#allcourses

In [13]:
allcourses.drop_duplicates(subset=["CODE"], inplace=True, ignore_index=True)

In [14]:
#allcourses

In [15]:
df2019.columns = ["CODE","TITLE", "R1_POINTS_2019", "R2_POINTS_2019"]
df2019.set_index("CODE", inplace=True)
#df2019

In [16]:
df2020.columns = ["CODE","TITLE", "R1_POINTS_2020", "R2_POINTS_2020"]
df2020.set_index("CODE", inplace=True)
#df2020

In [17]:
df2021.columns = ["CODE","TITLE", "R1_POINTS_2021", "R2_POINTS_2021"]
df2021.set_index("CODE", inplace=True)
#df2021

In [18]:
allcourses.set_index("CODE", inplace=True)
allcourses = allcourses.join(df2019[["R1_POINTS_2019", "R2_POINTS_2019"]])
#allcourses

In [19]:
allcourses = allcourses.join(df2020[["R1_POINTS_2020", "R2_POINTS_2020"]])
#allcourses

In [20]:
allcourses = allcourses.join(df2021[["R1_POINTS_2021", "R2_POINTS_2021"]])
#allcourses

In [21]:
allcourses.sort_values("CODE", inplace = True)
allcourses.to_csv ("all.csv")
allcourses

Unnamed: 0_level_0,TITLE,R1_POINTS_2019,R2_POINTS_2019,R1_POINTS_2020,R2_POINTS_2020,R1_POINTS_2021,R2_POINTS_2021
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AC120,International Business ...,234,234,209,,294,294
AC137,Liberal Arts ...,275,252,252,,271,270
AD101,First Year Art &amp; Design (Common Entry) ...,,#+Matric,,#+matric,#554,
AD102,Graphic Design and Moving Image Design ...,#+Matric,,#+matric,,#538,
AD103,Textile &amp; Surface Design and Jewellery &am...,cts,#+Matric,cts,(,#505,
...,...,...,...,...,...,...,...
WD211,Creative Computing ...,275,,271,,270,
WD212,Recreation and Sport Management ...,274,,270,,262,
WD230,Mechanical and Manufacturing Engineering ...,273,,253,,230,230
WD231,Early Childhood Care and Education ...,,,,,266,


# END TEST

## LEVEL 8 POINTS 2020, 2019
### EOS AND MED
http://www.cao.ie/index.php?page=points&p=2020

### 2020 .xlsx

In [22]:
# Create a file path for the original data
path = ("data/cao2020_eos" +  nowstr + ".xlsx")

In [23]:
urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', path)

('data/cao2020_eos20211115_100905.xlsx',
 <http.client.HTTPMessage at 0x212c770e130>)

In [24]:
# Download and parse the excel spreadsheet
df = pd.read_excel("http://www2.cao.ie/points/CAOPointsCharts2020.xlsx", skiprows = 10)
#df

In [25]:
# Spotcheck random row
#df.iloc[753]

In [26]:
# Spotcheck last row
#df.iloc[-1]

In [27]:
# Create a file path for the pandas data
path = ("data/cao2020_eos" +  nowstr + ".csv")

In [28]:
# Save pandas data frame to disk
df.to_csv(path)

***

### 2019 .pdf

### GET THE ORIGINAL

In [29]:
path = 'data/cao2019_eos' + nowstr + '.pdf'

In [30]:
resp_pdf = rq.get("http://www2.cao.ie/points/lvl8_19.pdf")
resp_pdf

<Response [200]>

In [31]:
with open(path, 'wb') as f:
    f.write(resp_pdf.content)

### READ THE TABLE FROM PDF

In [32]:
file = path
tables = camelot.read_pdf(file, pages = "all", flavor = "lattice")

In [33]:
print ("Tables:", tables.n)

Tables: 18


https://stackoverflow.com/questions/52383287/concatenate-dataframes-in-a-for-loop

In [34]:
table_total = []
for x in range (0,18):
    df = tables[x].df
    table_total.append(df)
    
table = pd.concat(table_total)
table.to_csv("data/cao2019_eos" +  nowstr + ".csv", index = False) 

### DATA COMPARISON

# CONCLUSION

***

## REFERENCES

## LINKS TO CHECK