# Module Information Scraper
This code is to scrape assessment details from UCD module-by-module. From there, we can find out how vulnerable UCD is to ChatGPT and other similar AI helpers. First we will need to import some packages to do this.

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import pathlib as Path
import html5lib
import json

Next we will need to set the path to the datasets that we will use. This currently pulls in a specific file, that of MODULES.csv, which has all collected module information for the school of Engineering and Architecture. However, this could easily be changed to analyze sub-schools or other schools.

In [2]:
dir_raw=Path.Path("Datasets")

moduleCodes= dir_raw / "MODULES.csv"

modules=pd.read_csv(moduleCodes)

modules

Unnamed: 0,Code,Module,School
0,DSCY10060,"Energy, Climate Change & Policy",EEE
1,EEEN10010,Electronic and Electrical Engineering I,EEE
2,EEEN10020,Robotics Design Project,EEE
3,EEEN20010,Computer Engineering,EEE
4,EEEN20020,Electrical and Electronic Circuits,EEE
...,...,...,...
519,MEEN50050,Creative Thinking & Innovation,MME
520,MEEN50060,Research Techniques Space Eng,MME
521,MEEN50070,Industrial Research I,MME
522,MEEN50080,Industrial Research II,MME


The module descriptor scraper pulls all module descriptor information from the UCD module website. This includes information such as who runs the module, and importantly for our analysis, the number of credits for each module.

In [27]:
def module_descriptor_scraper(url, code, codeList=None, level=None, school=None):
    #url= "https://hub.ucd.ie/usis/!W_HU_MENU.P_PUBLISH?p_tag=MODULE&MODULE=" + modules.iloc[0]["Code"]

    request=requests.get(url)
    soup=BeautifulSoup(request.content, 'html.parser')

#print(soup)
#Create all_h1_tags as empty list
    descriptor_list={}

# Set all_h1_tags to all h1 tags of the soup
    for element in soup.select('dl'):
        credit_list=element.text
        for items in zip(soup.select('dt'), soup.select('dd')):
            #print(items[0].text + " " + items[1].text)
            descriptor_list[items[0].text]=items[1].text
        #descriptor_item.append(items[1])
        
#print("%d , %d" %(len(descriptor_type),len(descriptor_item)))

    
    module_descriptor=pd.Series(descriptor_list)
    module_descriptor["Credits:"]=pd.to_numeric(module_descriptor["Credits:"], errors='coerce')
    #print(module_descriptor)
#print(element)

    filtered=False
    
    #If filters exist, check that the module is not filtered out
    if (level != None):
        filtered= (pd.to_numeric(module_descriptor["Level:"].split('(')[0], errors='ignore') != level)
                   
    #If it wasn't filtered out by level, check if it is filtered out by school
    if (filtered == False) and (school != None):
        filtered = (module_descriptor["School:"] != school)
                   

    if (filtered ==False) and (codeList != None):
        filtered=any(code==allowed_codes  for allowed_codes in codeList)
        if filtered:
            print("%s is in list : %s" %(code, filtered))

    return module_descriptor, filtered

In [24]:
#This asserts that the filter works correctly
def assert_filtered(module_descriptors, level=None, school=None):
    all_descriptors=pd.concat(module_descriptors)
    
    if level !=None:
        assert (all_descriptors["Level:"].nunique() == 1)
    if school != None:
        assert (all_descriptors["School:"].nunique() == 1)
        
    print(all_descriptors["School:"].unique())
    return all_descriptors["School:"].nunique(), all_descriptors["Level:"].nunique()

The below code collects all module assessment and module descriptor information into two lists. It also creates a "Scaled % of Final Grade" column in the asssessment table. This weights the assessment based on the number of credits the module has overall. In this way, the median and normal amount of credits, 5.0, has assessments weighting that add up to 100%. Those above and below are given assessment weightings that scale with how much more or less they are worth then a normal module - a 10 credit module will have assessments that add up to 200%, because they are worth twice the amount as a normal module.

Error module details are stored for inspection later, to see why they occurred. The code continues on even if errors occur, after having stored these details it simply proceeds to the next module.

In [25]:
def save_module_files(module_assessments, module_descriptors, level, school):
    #The directory to save outputs to
    dir_output=Path.Path("ModuleInformation")
    dir_output.mkdir(parents=True, exist_ok=True)
    
    
    #filenames={module_assessments: "assessments", module_descriptors:"descriptors"}
    subdirectory=""
    #Save the file in its desired format
    if level != None:
        subdirectory+="Level=%d" %(level)
        
    if school != None:
        subdirectory+="_School="+school.replace(" ", "-")
    
    #if the modules have been filtered, and thus belong in a sub directory, make that directory
    if len(subdirectory) > 0:
        dir_output=dir_output / subdirectory
        dir_output.mkdir(parents=True, exist_ok=True)
        
    #Save our two module detail files
    with open(dir_output / "assessments.json", 'w') as outfile:
        if len(module_assessments) > 2:
            module_assessments=pd.concat(module_assessments, ignore_index=True)
            #print(module_assessments)
        print("saving to %s" % dir_output)
        outfile.write(module_assessments.to_json())
        
    with open(dir_output / "descriptors.json", 'w') as outfile:
        print("saving to %s" % dir_output)
        if len(module_descriptors) > 2:
            module_descriptors=pd.DataFrame(module_descriptors)
        outfile.write(module_descriptors.to_json())

In [28]:
#This functiom will allow school and year functions to be placed on it
def collector(codeList=None, level=None, school=None):
    #This stores module information
    module_assessments=[]
    module_descriptors=[]

    #This stores error module information
    error_modules=[]
    error_module_descriptors=[]

    for i in modules.iloc:
        print(".",end="")
        url= "https://hub.ucd.ie/usis/!W_HU_MENU.P_PUBLISH?p_tag=MODULE&MODULE=" + i["Code"]

        #Get the module descriptor
        descriptor, filtered=module_descriptor_scraper(url, i["Code"], codeList=codeList, level=level, school=school)
        #If the module is in violation of the filters, continue to the next without saving
        if filtered==True:
            continue
        #Use pandas to read in the asssessment html tables too
        table=pd.read_html(url, match="Description")
        
        #print(descriptor)
        #Get the first table, and turn it into a dataframe
        df=pd.DataFrame(table[0])
        df["Assessment Type"] = df['Description'].str.split(':').str[0]
        try:
            df["Scaled % of Final Grade"]= df['% of Final Grade'].apply(lambda x: x * (descriptor["Credits:"]/5.0))
        except:
            print("\nERROR MODULE DETECTED.")
            #print(df["% of Final Grade"])
            #print(descriptor["Credits:"])
            print("Module may need to be inspected, saving information as an error module and continuing without it")
        
            error_modules.append(df)
            error_module_descriptors.append(descriptor)
            #Continue after noting and saving the error
            continue
        
        #Append the dataframe to the module assessments list
        module_assessments.append(df)
        module_descriptors.append(descriptor)
    
    #This asserts that the filters were properly imposed, if imposed at all
    num_schools, num_levels=assert_filtered(module_descriptors, level, school)

    print("\nFINISHED, SCRAPED DETAILS ON %d MODULES, OVER %d SCHOOLS AND %d LEVELS" \
          %(len(module_assessments), num_schools, num_levels))
    
    #Save the output files
    save_module_files(module_assessments, module_descriptors, level, school)
    
    return module_assessments, module_descriptors, error_modules, error_module_descriptors



In [None]:
#Run the above function in its base form
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector()

In [15]:
coreCodes= dir_raw / "UCD_EngArch_Mech_UG_Modules.xlsx"

coreModules=pd.read_excel(coreCodes, header=None)

coreList=coreModules.values.tolist()

print(coreList)

[['EEEN20020'], ['MEEN20010'], ['MEEN20020'], ['MEEN20050'], ['MEEN20030'], ['MEEN20040'], ['MEEN20060'], ['MEEN20070'], ['MEEN30030'], ['MEEN30040'], ['MEEN30090'], ['MEEN30100'], ['EEEN20090'], ['EEEN30150'], ['MEEN30010'], ['MEEN30020'], ['MEEN30140'], ['CHEN10040'], ['CVEN10040'], ['EEEN10010'], ['MEEN10030'], ['MEEN10050']]


In [None]:
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(codeList=coreList)

.......................................................................................................................
ERROR MODULE DETECTED.
Module may need to be inspected, saving information as an error module and continuing without it
...........................................................

In [7]:
#Run the collector function to only collect level 1 modules in the college of Engineering and Architecture
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(level=1)

............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................['Electrical & Electronic Eng' 'Civil Engineering'
 'Chem & Bioprocess Engineering' 'Biosystems & Food Engineering'
 'Architecture, Plan & Env Pol' 'Mechanical & Materials Eng']

FINISHED, SCRAPED DETAILS ON 32 MODULES, OVER 6 SCHOOLS AND 1 LEVELS
saving to ModuleInformation\Level=1
saving to ModuleInformation\Level=1


In [8]:
#Run the collector function to only collect level 1 modules in the college of Engineering and Architecture
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(level=2)

.................................................................................................................

KeyboardInterrupt: 

In [None]:
#Run the collector function to only collect level 1 modules in the college of Engineering and Architecture
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(level=3)

In [None]:
#Run the collector function to only collect level 1 modules in the college of Engineering and Architecture
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(level=4)

In [None]:
#Run the collector function to only collect level 1 modules in the college of Engineering and Architecture
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(level=5)

In [None]:
#Run the collector function to only collect level 1 modules in the college of Engineering and Architecture
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(school="Mechanical & Materials Eng")

In [None]:
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(school="Civil Engineering")

In [None]:
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(school="Chem & Bioprocess Engineering")

In [None]:
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(school="Biosystems & Food Engineering")

In [None]:
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(school="Architecture, Plan & Env Pol")

In [None]:
module_assessments, module_descriptors, error_modules, error_module_descriptors=collector(school="Electrical & Electronic Eng")

In [None]:
#Inspect the error modules just in case
for i, error in enumerate(zip(error_modules, error_module_descriptors)):
    print("********ERROR COUNT %d*********" %i)
    print(error[0])
    print(error[1])
    

Next we combine all the modules assessment information together, ready for later aggregation and analysis.

In [None]:
aggreg_modules=pd.concat(module_assessments)

aggreg_modules

In [None]:
assessments=aggreg_modules[["Assessment Type", "% of Final Grade"]]

#This is the dataset cleaning, so that they are ready for presentation
assessments["% of Final Grade"]=pd.to_numeric(assessments["% of Final Grade"], errors='coerce')
assessments=assessments.replace("Multiple Choice Questionnaire (Short)", "Multiple Choice Questionnaire")
assessments=assessments.dropna()
assessments["Assessment Type"]=assessments["Assessment Type"].astype("category")

assessments

In [None]:
assessments.describe()

In [None]:
#assessments=assessments.replace("Multiple Choice Questionnaire (Short)", "Multiple Choice Questionnaire")
assessments["Assessment Type"].value_counts()

In [None]:
assessments["Assessment Type"].nunique()

In [None]:
#assessments = assessments.drop("No yet recorded.")
assessment_average=assessments.groupby(by="Assessment Type", as_index=False).mean()

assessment_average

In [None]:
sorted_averages=assessment_average.sort_values("% of Final Grade", ascending=False)

sorted_averages

In [None]:
import matplotlib.pyplot as plt

fig, ax=plt.subplots( figsize=(15, 10))

#barWidth=0.25
colormap={"Essay": "gold", "Assignment": "gold", "Examination": "mediumseagreen", "Attendance": "mediumseagreen", \
          "Class Test":"mediumseagreen", "Continuous Assessment": "gold", "Fieldwork": "mediumseagreen", "Group Project": "gold",\
         "Journal": "gold", "Lab Report" : "mediumseagreen", "Multiple Choice Questionnaire (MCQ)": "mediumseagreen", \
          "Oral Examination" : "mediumseagreen", "Portfolio": "gold", "Practical Examination": "mediumseagreen",\
          "Presentation":"mediumseagreen", "Project": "gold", "Seminar": "mediumseagreen",\
          "Studio Examination": "mediumseagreen", "Assessments worth <2%": "mediumseagreen"}

# Make the plot
ax.bar(sorted_averages["Assessment Type"], sorted_averages["% of Final Grade"], edgecolor="grey", color=["mediumseagreen", "gold", \
"gold", "mediumseagreen", "mediumseagreen", "gold", "gold",  "mediumseagreen", "gold", "gold", "gold", "mediumseagreen",\
                                                                                       "mediumseagreen", "mediumseagreen", \
                                                "mediumseagreen", "gold", "mediumseagreen", "mediumseagreen"] )


#Set the options for the axes
ax.tick_params(axis='y', labelsize=16)
ax.tick_params(axis='x', rotation=90, labelsize=14)
#ax.legend(fontsize=20)
ax.set_xlabel("Assessment Type", fontsize=18)
ax.set_ylabel("% of Final Grade",fontsize=18)
#Add the date lines for context
ax.set_title("Average Module Percentage When Assessment Type is Used", fontsize=20)

In [None]:
assessment_total=assessments.groupby(by="Assessment Type", as_index=False).sum()
 
assessment_total

In [None]:
total=assessment_total["% of Final Grade"].sum()

assessment_total["% of Assessment"]=assessment_total["% of Final Grade"].apply(lambda x: x/total)
assessment_total["Assessment Type"]=assessment_total["Assessment Type"].cat.add_categories("Assessments worth <2%")
assessment_total.loc[assessment_total["% of Assessment"] < 0.02, "Assessment Type"] = "Assessments worth <2%"

assessment_total

In [None]:
assessment_total["Assessment Type"]= assessment_total["Assessment Type"].cat.remove_categories(["Fieldwork", "Journal", \
                                                                             "Attendance", "Oral Examination", \
                                                "Practical Examination", "Seminar", "Studio Examination",\
                                                                                             "Multiple Choice Questionnaire"])

assessment_total

In [None]:
filtered_asses=assessment_total.groupby(by="Assessment Type").sum()

pie_assessment=filtered_asses["% of Assessment"]

sorted_totals=pie_assessment.sort_values(ascending=False)

sorted_totals

In [None]:
fig, ax=plt.subplots(figsize=(15,12))

#exploded=[0.03, 0, 0.03, 0.03, 0, 0.03, 0.03, 0.03, 0, 0, 0]
ax.pie(sorted_totals, wedgeprops = { 'linewidth' : 0.5, 'edgecolor' : 'grey' }, \
       autopct="%1.1f%%", labels=sorted_totals.index, colors=["gold","mediumseagreen", \
"gold", "gold", "mediumseagreen",  "gold", "gold", "gold", "mediumseagreen",\
                                                              "mediumseagreen", "mediumseagreen", "mediumseagreen"])

#my_circle=ax.Circle( (0,0), 0.7, color='white')
#p=ax.gcf()

ax.set_title("Assessment type as a Percentage of Module Assessments", fontsize=20)