# Fundamentals of Data Analysis, Winter 21/22
- Author: Brendan Tunney
- ID: G00270683

## Importing the CAO data


In [1]:
#Importing libraries

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re # regular expressions {1}
import requests as rq #{1} # To retrieve HTTP Data
import urllib.request as urlrq # Importing pythons URL module
import csv # importing csv module to read, write csv
from unicodedata import normalize # Imported to use normalised string values.
# Dates and times.
import datetime as dt # Importing date/time module


In [2]:
# Fetch the CAO level 8 points page (2021 data).
# The timeout stops the cell from hanging indefinitely if the server never
# answers, and raise_for_status() turns an HTTP error response into an
# immediate exception instead of letting an error page reach the parsing code.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)
resp.raise_for_status()

resp # Display the response object; <Response [200]> means success

<Response [200]>

In [3]:
# Timestamp taken once, so every file name built from it in this run agrees.
now = dt.datetime.now()

# Compact, sortable text form of the timestamp, e.g. 20211231_235959.
nowstr = now.strftime('%Y%m%d_%H%M%S')

# File name under which the raw HTML response will be saved.
pathhtml = f'resp_{nowstr}.html'

In [4]:
# Per the author's note, the server declares the page as ISO-8859-1, but one line
# contains the byte \x96, which ISO-8859-1 does not define as a printable
# character. cp1252 does, so the response is decoded as cp1252 instead.

original_encoding = resp.encoding  # remember the encoding set on the response before it is overridden

resp.encoding = 'cp1252'  # from here on resp.text is decoded as cp1252


In [5]:
# A course line starts with a code made of two capital letters and three
# digits (e.g. AL801); whatever follows the code is captured as a second group.
re_course = re.compile(
    r'([A-Z]{2}[0-9]{3})'  # group 1: course code
    r'(.*)'                # group 2: remainder of the line
)

In [6]:
# Keep a timestamped copy of the decoded page.
# The encoding is stated explicitly: without it open() falls back to the
# platform default, so the same notebook could write different bytes on
# different machines, or fail on a character the locale cannot encode.
# UTF-8 can represent every character that resp.text may contain.
with open(pathhtml, 'w', encoding='utf-8') as f:
    f.write(resp.text)

In [7]:
# CSV file name for the parsed 2021 points; shares the run's timestamp string.
path2021 = f'resp_{nowstr}.csv'

In [14]:
no_lines = 0  # Number of course lines written to the CSV

# The file is written as cp1252 so that it matches the encoding used when it is
# read back with pd.read_csv; relying on the platform default would garble
# accented titles on systems whose default is UTF-8. newline='' lets the csv
# module control line endings itself, as its documentation requires.
with open(path2021, 'w', encoding='cp1252', newline='') as f:
    # csv.writer quotes any field containing a comma or a quote character,
    # which a plain ','.join() does not, so such a title stays in one column.
    writer = csv.writer(f)
    writer.writerow(['code', 'title', 'pointsR1', 'pointsR2'])  # Header row

    for line in resp.iter_lines():           # Raw bytes, one line at a time
        dline = line.decode('cp1252')
        if not re_course.fullmatch(dline):
            continue                         # Not a course line

        no_lines += 1
        course_code = dline[:5]              # Course code (5 characters)
        course_title = dline[7:57].strip()   # Fixed-width title column, blanks removed
        course_points = re.split(' +', dline[60:])  # Points columns, split on runs of spaces
        # Keep exactly two values (round 1, round 2). Extra pieces are dropped,
        # and a missing value becomes an empty field instead of an IndexError.
        course_points = (course_points + ['', ''])[:2]
        writer.writerow([course_code, course_title, *course_points])

print(f"Total number of lines is {no_lines}.")  # Number of processed lines

Total number of lines is 949.


In [16]:
# Load the parsed 2021 points into a DataFrame, decoding the file as cp1252.
df2021 = pd.read_csv(
    path2021,
    encoding='cp1252',
)

In [17]:
# Plain-text dump of the parsed table; pandas abbreviates the middle rows.
print(df2021)

      code                                              title pointsR1  \
0    AL801     Software Design for Virtual Reality and Gaming      300   
1    AL802  Software Design in Artificial Intelligence for...      313   
2    AL803  Software Design for Mobile Apps and Connected ...      350   
3    AL805    Computer Engineering for Network Infrastructure      321   
4    AL810                                 Quantity Surveying      328   
..     ...                                                ...      ...   
944  WD211                                 Creative Computing      270   
945  WD212                    Recreation and Sport Management      262   
946  WD230           Mechanical and Manufacturing Engineering      230   
947  WD231                 Early Childhood Care and Education      266   
948  WD232                       Business Information Systems      261   

    pointsR2  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  
..       ...  
944   

Total number of lines is 0.
