# Loading CAO points information into a pandas data frame, using the CAO website

http://www.cao.ie/index.php?page=points&p=2021

***

## INTRODUCTION

## WHAT IS CAO

### LOADING THE DATA

In [1]:
# HTTP request
import requests as rq
# Regular expressions
import re
# Dates and time
import datetime as dt

# NOTE: the imports below are switched off by wrapping them in a string
# literal, so neither pd nor urlrq is defined. In this notebook they are only
# referenced from the (equally disabled) 2020 spreadsheet cell.
'''
# Data frames
import pandas as pd
# For downloading
import urllib.request as urlrq
'''

'\n# Data frames\nimport pandas as pd\n# For downloading\nimport urllib.request as urlrq\n'

## LEVEL 8 POINTS 2019, 2020, 2021

In [2]:
# Seconds to wait for a server before giving up. Without a timeout, requests
# waits indefinitely on a server that never answers, hanging the notebook.
REQUEST_TIMEOUT = 60

# Level 8 points pages: 2019 and 2020 are fetched from web.archive.org
# snapshots, the last one directly from the CAO server.
resp_2019 = rq.get(
    "https://web.archive.org/web/20191019135815/http://www2.cao.ie:80/points/l8.php",
    timeout=REQUEST_TIMEOUT,
)
resp_2020 = rq.get(
    "https://web.archive.org/web/20201108133105/http://www2.cao.ie/points/l8.php",
    timeout=REQUEST_TIMEOUT,
)
resp_2021 = rq.get(
    "http://www2.cao.ie/points/l8.php",
    timeout=REQUEST_TIMEOUT,
)

# Show the status of each response; 200 means OK.
print(resp_2019)
print(resp_2020)
print(resp_2021)

<Response [200]>
<Response [200]>
<Response [200]>


## SAVE THE ORIGINAL DATA

In [3]:
# Take one timestamp up front; it is reused as the suffix of the file names
# built later in the notebook.
now = dt.datetime.now()
# Render it as year-month-day, an underscore, then hours-minutes-seconds.
nowstr = format(now, "%Y%m%d_%H%M%S")

In [4]:
# File paths for the untouched HTML pages, one per year, each stamped with
# the time of this run.
path_2019 = f"data/cao2019_{nowstr}.html"
path_2020 = f"data/cao2020_{nowstr}.html"
path_2021 = f"data/cao2021_{nowstr}.html"

In [5]:
# The encoding reported by the server is wrong (per the original note), so
# override it: every response is decoded as cp1252 from here on.
for _resp in (resp_2019, resp_2020, resp_2021):
    _resp.encoding = "cp1252"

In [6]:
# Keep a copy of each downloaded page on disk, written from the decoded text.
for _html_path, _resp in (
    (path_2019, resp_2019),
    (path_2020, resp_2020),
    (path_2021, resp_2021),
):
    with open(_html_path, "w") as f:
        f.write(_resp.text)

## REGULAR EXPRESSIONS

In [7]:
# A course line starts with a course code: two capital letters followed by
# three digits (group 1). Whatever follows on the line is captured as group 2.
re_course = re.compile(
    r"([A-Z]{2}[0-9]{3})(.*)"
)

## LOOP THROUGH THE LINES

In [8]:
# One line iterator per response, in year order (2019, 2020, 2021).
resps = [
    resp_2019.iter_lines(),
    resp_2020.iter_lines(),
    resp_2021.iter_lines(),
]
# The csv file paths, in the same year order.
path = [
    f"data/cao2019_csv_{nowstr}.csv",
    f"data/cao2020_csv_{nowstr}.csv",
    f"data/cao2021_csv_{nowstr}.csv",
]

def create_csv(path, resps, pattern=None):
    """Write the course lines found in a page to a CSV file.

    Every line of ``resps`` is decoded as cp1252 and kept only if the whole
    line matches the course pattern. Kept lines are cut at fixed columns into
    a course code, a course title and up to two points fields, and written as
    one CSV row each. The number of rows written is printed at the end.

    Args:
        path: Path of the CSV file to create; an existing file is overwritten.
        resps: Iterable of raw ``bytes`` lines, e.g. ``response.iter_lines()``.
        pattern: Compiled regular expression a line must fully match to count
            as a course line. Defaults to the notebook-level ``re_course``.

    Returns:
        None.
    """
    # Local import so the function does not depend on the imports cell.
    import csv

    if pattern is None:
        pattern = re_course

    no_lines = 0
    # newline="" hands control of line endings to the csv writer, as the csv
    # module documentation requires; UTF-8 makes the output platform independent.
    with open(path, "w", newline="", encoding="utf-8") as f:
        # A real CSV writer quotes titles that contain commas; joining the
        # fields with "," by hand produced rows with too many columns.
        writer = csv.writer(f, lineterminator="\n")
        for line in resps:
            # Same encoding the notebook forces on the responses themselves.
            dline = line.decode("cp1252")
            # Skip everything that is not a course line.
            if not pattern.fullmatch(dline):
                continue
            no_lines += 1
            # Fixed-width layout: code in columns 0-4, title in columns 7-56.
            course_code = dline[:5]
            course_title = dline[7:57]
            # Points start at column 60, separated by runs of spaces. Pad to
            # two fields so a line with fewer than two entries yields empty
            # cells instead of raising IndexError; extra fields are dropped.
            points = (re.split(" +", dline[60:]) + ["", ""])[:2]
            writer.writerow([course_code, course_title, points[0], points[1]])
    # Print the total number of processed lines.
    print(f"Total number of lines is {no_lines}.")

In [9]:
# Build one csv file per year from the lines of the matching response.
for _csv_path, _resp in (
    (f"data/cao2019_csv_{nowstr}.csv", resp_2019),
    (f"data/cao2020_csv_{nowstr}.csv", resp_2020),
    (f"data/cao2021_csv_{nowstr}.csv", resp_2021),
):
    create_csv(_csv_path, _resp.iter_lines())

Total number of lines is 930.
Total number of lines is 961.
Total number of lines is 949.


## 2020 POINTS
http://www.cao.ie/index.php?page=points&p=2020

In [10]:
# NOTE: this whole cell is disabled by wrapping it in a string literal, so
# none of it runs. If re-enabled it would download the 2020 points
# spreadsheet, load it with pandas (skipping the first 10 rows) and save the
# data frame as csv. It needs the pd and urlrq imports, which are disabled in
# the same way in the imports cell.
'''
# Create a file path for the original data
path = ("data/cao2020_" +  nowstr + ".xlsx")

urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', path)

# Download and parse the excel spreadsheet
df = pd.read_excel("http://www2.cao.ie/points/CAOPointsCharts2020.xlsx", skiprows = 10)

df

# Spotcheck random row
df.iloc[753]

# Spotcheck last row
df.iloc[-1]

# Create a file path for the pandas data
path = ("data/cao2020_" +  nowstr + ".csv")

# Save pandas data frame to disk
df.to_csv(path)
'''

'\n# Create a file path for the original data\npath = ("data/cao2020_" +  nowstr + ".xlsx")\n\nurlrq.urlretrieve(\'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx\', path)\n\n# Download and parse the excel spreadsheet\ndf = pd.read_excel("http://www2.cao.ie/points/CAOPointsCharts2020.xlsx", skiprows = 10)\n\ndf\n\n# Spotcheck random row\ndf.iloc[753]\n\n# Spotcheck last row\ndf.iloc[-1]\n\n# Create a file path for the pandas data\npath = ("data/cao2020_" +  nowstr + ".csv")\n\n# Save pandas data frame to disk\ndf.to_csv(path)\n'

#### Regular expressions

### CAO DATA 2019, 2020, 2021

### DATA COMPARISON

# CONCLUSION

***

## REFERENCES

## LINKS TO CHECK