# Fundamentals of Data Analysis, Winter 21/22
- Author: Brendan Tunney
- ID - G00270683

## Importing the CAO data


In [53]:
#Importing libraries

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re # regular expressions {1}
import requests as rq #{1} # To retrieve HTTP Data
import urllib.request as urlrq # Importing pythons URL module
import csv # importing csv module to read, write csv
from unicodedata import normalize # Imported to use normalised string values.
# Dates and times.
import datetime as dt # Importing date/time module


In [2]:
resp = rq.get('http://www2.cao.ie/points/l8.php') # Getting CAO points data for 2021

resp # checking response. 200 = succesful

<Response [200]>

In [3]:
now = dt.datetime.now() # Get the current date and time.

nowstr = now.strftime('%Y%m%d_%H%M%S') # # Format date and time as a string.

pathhtml = 'resp_' + nowstr + '.html' # Creating file path

In [4]:
# If we decode per server as ISO-8859-1, it will be noted that one line uses \x96 - not defined in ISO-8859-1. CP1252 is used instead.

original_encoding = resp.encoding

resp.encoding = 'cp1252'


In [5]:
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(.*)') # Compile the reg. expressions.

In [6]:
with open(pathhtml, 'w') as f:
    f.write(resp.text)

In [7]:
path2021 = 'resp_' + nowstr + '.csv'

In [8]:
no_lines = 0 # Counting # of lines processed

with open(path2021, 'w') as f:                                            # Open csv file to write
   
    f.write(','.join(['code', 'title', 'pointsR1', 'pointsR2']) + '\n')   # Write file header
    for line in resp.iter_lines():                                        # Loop thorugh lines of response data
        dline = line.decode('cp1252')                                     # Decode line
        if re_course.fullmatch(dline):                                    # Match only lines with a course
            no_lines = no_lines + 1                                       # If matched,add '1' to the counter
            course_code = dline[:5]                                       # Course code (5 characters)
            course_title = dline[7:57].strip()                            # Course title and stripping out blanks
            course_points = re.split(' +', dline[60:])                    # First round points accounting for spaces
            if len(course_points) != 2:                                   # Second round points - if applicable.
                course_points = course_points[:2]
            linesplit = [course_code, course_title, course_points[0], course_points[1]] # Creating header array.
            f.write(','.join(linesplit) + '\n')                           # Rejoining split values with comma seperation and adding new line for each course

print(f"Total number of lines is {no_lines}.")                            # Number of processed lines

Total number of lines is 949.


In [9]:
df2021 = pd.read_csv(path2021, encoding='cp1252')

In [None]:
print (df2021)

In [69]:
def cao_data():
    
    url = 'http://www2.cao.ie/points/CAOPointsCharts2020.xlsx' # URL for 2020 points
    
    df = pd.read_excel(url, skiprows = 10)                     # read in excel file, skipping first 10 rows that do not contain course data    

    # Renaming columns to match 2021 file

    df.rename(columns={"COURSE CODE2": "code", "R1 POINTS": "pointsR1", "R2 POINTS": "pointsR2", "COURSE TITLE": "title"}, inplace=True)

    new_col_order = ['code', 'title', 'pointsR1', 'pointsR2', 'LEVEL']   # New column order list
    f = df.reindex(columns = new_col_order)                              # Re-ordering columns

    # Removing redundant columns

    df.drop(['CATEGORY (i.e.ISCED description)', 'R1 Random *', 'R2 Random*','EOS','EOS Random *', 'EOS Mid-point', 'HEI', 'Test/Interview #', 'avp', 'v', 'Column1', 'Column2','Column3', 'Column4', 'Column5', 'Column6', 'Column7', 'Column8'], axis = 1)


print(cao_data)   

<function cao_data at 0x0000024B540651F0>
