# Loading CAO points information into a pandas data frame from the CAO website

http://www.cao.ie/index.php?page=points&p=2021

***

## INTRODUCTION

## WHAT IS CAO

### LOADING THE DATA

In [1]:
# CSV writing
import csv
# Dates and time
import datetime as dt
# Regular expressions
import re

# HTTP request
import requests as rq

'''
# Data frames
import pandas as pd
# For downloading
import urllib.request as urlrq
'''

'\n# Data frames\nimport pandas as pd\n# For downloading\nimport urllib.request as urlrq\n'

## LEVEL 8 POINTS 2019, 2020, 2021

In [2]:
# Timestamp captured once, when this cell runs.
now = dt.datetime.now()
# Compact "YYYYMMDD_HHMMSS" form, used as a suffix in the output file names.
nowstr = f"{now:%Y%m%d_%H%M%S}"

In [3]:
# Regular expression for a course line: group 1 is a code made of two
# capital letters followed by three digits, group 2 is whatever follows it.
# Compiled once here so it can be reused for every line that is checked.
re_course = re.compile(r"([A-Z]{2}[0-9]{3})(.*)")

In [4]:
# Per-year lookup table. Each key is a year and each value is a three-item
# list: [path for the saved HTML page, URL of the Level 8 points page,
# path for the extracted CSV]. The 2019 and 2020 URLs point at
# web.archive.org snapshots; the 2021 URL points at www2.cao.ie directly.
years_dict = {
    "2019": [
        f"data/cao2019_{nowstr}.html",
        "https://web.archive.org/web/20191019135815/http://www2.cao.ie:80/points/l8.php",
        f"data/cao2019_csv_{nowstr}.csv",
    ],
    "2020": [
        f"data/cao2020_{nowstr}.html",
        "https://web.archive.org/web/20201108133105/http://www2.cao.ie/points/l8.php",
        f"data/cao2020_csv_{nowstr}.csv",
    ],
    "2021": [
        f"data/cao2021_{nowstr}.html",
        "http://www2.cao.ie/points/l8.php",
        f"data/cao2021_csv_{nowstr}.csv",
    ],
}

In [5]:
# For every year in years_dict: download the points page, keep a copy of the
# raw HTML on disk, then pull the course lines out into a CSV file.
for year, content in years_dict.items():
    # Each entry is laid out as [html_path, url, csv_path].
    html_path, url, csv_path = content
    # Fetch the CAO points URL (a single request per year).
    resp = rq.get(url)
    # The server uses the wrong encoding, so override it with "cp1252"
    # before resp.text is read.
    resp.encoding = "cp1252"
    # Check if OK: <Response [200]> means OK.
    print(resp)
    # Save the original html file.
    with open(html_path, "w") as f:
        f.write(resp.text)
    # Keep track of how many courses we process.
    no_lines = 0
    # Iterator over the raw (bytes) lines of the response body.
    resps = resp.iter_lines()
    # Open the csv file for writing.
    with open(csv_path, "w") as f:
        # csv.writer quotes any field containing a comma or a quote, so a
        # comma inside a course title no longer shifts the columns.
        # lineterminator="\n" keeps rows that need no quoting identical to
        # the previous ','.join(...) + '\n' output.
        writer = csv.writer(f, lineterminator="\n")
        # Loop through lines of the response.
        for line in resps:
            # iter_lines yields bytes; decode with the same "cp1252"
            # encoding that was forced on the response above.
            dline = line.decode("cp1252")
            # Match only the lines representing courses.
            if re_course.fullmatch(dline):
                # Add one to the lines counter.
                no_lines += 1
                # The code treats the line as fixed-width: the first five
                # characters are the course code ...
                course_code = dline[:5]
                # ... and characters 7-56 are the course title.
                course_title = dline[7:57]
                # Everything from character 60 on is split on runs of
                # spaces; only the first two fields are kept (presumably
                # the round one and round two points - confirm).
                course_points = re.split(" +", dline[60:])[:2]
                # A line with a single points field would otherwise raise
                # IndexError below; pad with empty strings instead.
                course_points += [""] * (2 - len(course_points))
                # One CSV row: code, title and the two points fields.
                linesplit = [course_code, course_title, course_points[0], course_points[1]]
                writer.writerow(linesplit)
    # Print the total number of processed lines.
    print(f"Total number of lines is {no_lines}.")

<Response [200]>
Total number of lines is 930.
<Response [200]>
Total number of lines is 961.
<Response [200]>
Total number of lines is 949.


## 2020 POINTS
http://www.cao.ie/index.php?page=points&p=2020

In [6]:
# Disabled cell: the triple quotes turn everything below into one string
# literal, so none of it runs and the cell simply evaluates to that string.
# The text inside relies on pd and urlrq, whose imports are disabled in the
# same way in the imports cell.
'''
# Create a file path for the original data
path = ("data/cao2020_" +  nowstr + ".xlsx")

urlrq.urlretrieve('http://www2.cao.ie/points/CAOPointsCharts2020.xlsx', path)

# Download and parse the excel spreadsheet
df = pd.read_excel("http://www2.cao.ie/points/CAOPointsCharts2020.xlsx", skiprows = 10)

df

# Spotcheck random row
df.iloc[753]

# Spotcheck last row
df.iloc[-1]

# Create a file path for the pandas data
path = ("data/cao2020_" +  nowstr + ".csv")

# Save pandas data frame to disk
df.to_csv(path)
'''

'\n# Create a file path for the original data\npath = ("data/cao2020_" +  nowstr + ".xlsx")\n\nurlrq.urlretrieve(\'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx\', path)\n\n# Download and parse the excel spreadsheet\ndf = pd.read_excel("http://www2.cao.ie/points/CAOPointsCharts2020.xlsx", skiprows = 10)\n\ndf\n\n# Spotcheck random row\ndf.iloc[753]\n\n# Spotcheck last row\ndf.iloc[-1]\n\n# Create a file path for the pandas data\npath = ("data/cao2020_" +  nowstr + ".csv")\n\n# Save pandas data frame to disk\ndf.to_csv(path)\n'

#### Regular expressions

### CAO DATA 2019, 2020, 2021

### DATA COMPARISON

# CONCLUSION

***

## REFERENCES

## LINKS TO CHECK