In [1356]:
import pandas as pd
import math
import io
from datetime import datetime
from alive_progress import alive_bar


### Read Data

In [1357]:
### Note: if using Google Colab, this must be set to the filename of the
# file you will load. This is a quirk of Colab.
filename = "2020-2022 Training Records.xlsx"

# Detect Colab by trying to import its helper package. Only a failed import
# means "not Colab"; any other error should surface instead of being hidden.
try:
  import google.colab
  from google.colab import files
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

if IN_COLAB:
  print("Colab")
  # files.upload() returns a mapping of uploaded file name -> file bytes.
  uploaded = files.upload()
  if filename not in uploaded:
    # Fail with a clear message instead of handing None to read_excel.
    raise FileNotFoundError(
      f"Uploaded file(s) {list(uploaded)} do not include the expected {filename!r}"
    )
  source = pd.read_excel(io.BytesIO(uploaded[filename]))
else:
  print("Not Colab")
  source = pd.read_excel(filename)

Not Colab


In [1358]:
# Names of the columns in the source training-records sheet that the
# processing cells read. Change these if the export layout changes.
USERNAME = "Employee Number"        # learner identifier (written out as LEARNER)
COURSEID = "OFFERING"               # course identifier (written out as COURSE)
COURSEVERSION = "COURSE_VERSION"    # course version; may be empty in the source
COMPDATE = "Completion Date"        # completion date
STATUS = "Completion Status"        # input to procStatus
COURSETITLE = "Activity Name"       # course title
PASSFAIL = "Pass/Fail"              # input to procStatus
ATTSTATUS = "Attendance Status"     # input to procStatus

In [1359]:
# Optional down-sampling of the source records.
# FILTER selects how rows are chosen ("RANDOM", "HEAD" or "TAIL"); any other
# value leaves `source` untouched. SIZE is the maximum number of rows kept.
# NOTE: this cell overwrites `source`, so re-running it filters the already
# filtered frame; re-run the read cell first to start from the full data.
FILTER = "HEAD"
SIZE = 1000

if FILTER == "RANDOM":
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request the way head()/tail() do.
    source = source.sample(min(SIZE, len(source)))
elif FILTER == "HEAD":
    source = source.head(SIZE)
elif FILTER == "TAIL":
    source = source.tail(SIZE)


In [1360]:
# Preview the first rows of the (possibly filtered) source records.
source.head(5)

Unnamed: 0,Employee Number,Employee Name,Waived Notes,Employee email,Job Title,Department Code,Department Name,MANAGER,Activity Type,OFFERING,Activity Name,COURSE_VERSION,Score,Attendance Status,Pass/Fail,Completion Status,Completion Date,Expiration Date,Note
0,10034,"LEITCH, BRIAN",,brian.leitch@cnl.ca,Contingent Worker,220.0,MATERIALS AND FUELS PERFORMANC,"HAQUE, Zia",Course,EMP-1037-Online (REV 0.2),Emergency Procedure Refresher (0.2),0.2,100.0,Attended,Pass,Completed,2022-03-08,2023-03-09,
1,10034,"LEITCH, BRIAN",,brian.leitch@cnl.ca,Contingent Worker,220.0,MATERIALS AND FUELS PERFORMANC,"HAQUE, Zia",Versional,EMP-1037-Online,Emergency Procedure Refresher,,100.0,Attended,Pass,Completed,2022-03-08,2023-03-09,
2,10083,"REIMER, Terry",,terry.reimer@cnl.ca,Facility Manager,454.0,WL SITE & NUCLEAR OPERATIONS,"FILLION, Dave",ILT Class,HR-1005,Virtual Mandatory Harassment and Violence Prev...,,,Attended,Pass,Completed,2021-11-22,NaT,
3,10083,"REIMER, Terry",,terry.reimer@cnl.ca,Facility Manager,454.0,WL SITE & NUCLEAR OPERATIONS,"FILLION, Dave",ILT Class,RP-G2M2B-WL,Group 2 - Module 2B WL,,100.0,Attended,Pass,Completed,2021-03-04,NaT,
4,10083,"REIMER, Terry",,terry.reimer@cnl.ca,Facility Manager,454.0,WL SITE & NUCLEAR OPERATIONS,"FILLION, Dave",ILT Class,WL-307,Job Safety Analysis,,,Cancelled,,,2021-04-13,NaT,


In [1361]:
# Number of columns in the source frame.
print(source.shape[1])
# Count the rows for every combination of the three status fields that
# procStatus reads; dropna=False keeps combinations that contain missing values.
source[["Completion Status", "Attendance Status", "Pass/Fail"]].groupby(["Pass/Fail","Completion Status", "Attendance Status"], dropna=False, as_index=False).size()

19


Unnamed: 0,Pass/Fail,Completion Status,Attendance Status,size
0,Fail,Completed,Attended,10
1,Pass,Completed,Attended,716
2,Pass,Completed,Waived,4
3,,Completed,Attended,183
4,,,Cancelled,83
5,,,No-show,4


### Update Mapped Course Names

In [1362]:
# Course-name mapping configuration.
USE_COURSE_MAP = True       # when True, the mapping file is read and applied to `source`
CMAP_ORIG = "OLD_CNAME"     # mapping-file column holding the value to look for
CMAP_NEW = "NEW_CNAME"      # mapping-file column holding the replacement value
MAP_PATH = "map.csv"        # path of the mapping CSV

In [1363]:
def map_course(mapping_list, orig_df):
    """Return a copy of ``orig_df`` with mapped course names substituted.

    Parameters
    ----------
    mapping_list : pandas.DataFrame
        One row per substitution. Column ``CMAP_ORIG`` holds the value to
        look for and column ``CMAP_NEW`` holds its replacement.
    orig_df : pandas.DataFrame
        Frame to remap. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with every substitution applied, in mapping-file order.
    """
    # Work on a copy: the previous in-place replace also rewrote the caller's
    # frame (and operated on a slice produced by head()/sample()/tail()).
    new_df = orig_df.copy()
    for old_course, new_course in zip(mapping_list[CMAP_ORIG], mapping_list[CMAP_NEW]):
        # NOTE(review): the replacement applies to every column whose cell
        # equals old_course, not only the course-ID column - confirm intended.
        new_df = new_df.replace(old_course, new_course)
    return new_df

In [1364]:
# Apply the course-name mapping when enabled, then preview the result.
if USE_COURSE_MAP:
    # The mapping CSV must contain the CMAP_ORIG and CMAP_NEW columns that map_course reads.
    course_map = pd.read_csv(MAP_PATH)
    source = map_course(course_map, source)
source.head(5)

Unnamed: 0,Employee Number,Employee Name,Waived Notes,Employee email,Job Title,Department Code,Department Name,MANAGER,Activity Type,OFFERING,Activity Name,COURSE_VERSION,Score,Attendance Status,Pass/Fail,Completion Status,Completion Date,Expiration Date,Note
0,10034,"LEITCH, BRIAN",,brian.leitch@cnl.ca,Contingent Worker,220.0,MATERIALS AND FUELS PERFORMANC,"HAQUE, Zia",Course,EMP-1036,Emergency Procedure Refresher (0.2),0.2,100.0,Attended,Pass,Completed,2022-03-08,2023-03-09,
1,10034,"LEITCH, BRIAN",,brian.leitch@cnl.ca,Contingent Worker,220.0,MATERIALS AND FUELS PERFORMANC,"HAQUE, Zia",Versional,EMP-1036,Emergency Procedure Refresher,,100.0,Attended,Pass,Completed,2022-03-08,2023-03-09,
2,10083,"REIMER, Terry",,terry.reimer@cnl.ca,Facility Manager,454.0,WL SITE & NUCLEAR OPERATIONS,"FILLION, Dave",ILT Class,HR-1005,Virtual Mandatory Harassment and Violence Prev...,,,Attended,Pass,Completed,2021-11-22,NaT,
3,10083,"REIMER, Terry",,terry.reimer@cnl.ca,Facility Manager,454.0,WL SITE & NUCLEAR OPERATIONS,"FILLION, Dave",ILT Class,RP-G2M2B-WL,Group 2 - Module 2B WL,,100.0,Attended,Pass,Completed,2021-03-04,NaT,
4,10083,"REIMER, Terry",,terry.reimer@cnl.ca,Facility Manager,454.0,WL SITE & NUCLEAR OPERATIONS,"FILLION, Dave",ILT Class,WL-307,Job Safety Analysis,,,Cancelled,,,2021-04-13,NaT,


### Read SABA Data

#### Course Data

In [1365]:
# Course catalogue export.
courses = pd.read_csv("saba_courses.csv")

# Two-column view (course ID + version) that is passed to checkCourse.
course_id = courses[["Course ID", "Version"]]
# Distinct course IDs; only referenced from commented-out debugging code below.
ids_only = pd.Series(courses["Course ID"].unique())

courses.head()

Unnamed: 0,Title,Version,Course ID,Unnamed: 3
0,100035-IBM Cognos Analytics Advanced (Cost Cou...,1.0,IT-9050,
1,100035-IBM Cognos Analytics Fundamentals (Cost...,1.0,IT-9049,
2,10 Commandments of Training,1.0,TD-1048,
3,12 GA Shotgun,1.0,SECU-3015,
4,5S Methodology (Cost Course),1.0,PI-9006-ONLINE,


#### Employee Data

In [1366]:
# People export.
people = pd.read_csv("saba_people.csv")
# checkPerson() tests membership with str(person), so the lookup set must hold
# strings. The previous best-effort cast to int64 would, whenever it succeeded,
# have produced a set of integers that no str(...) key could ever match.
people_un = set(people["Username"].astype(str))
# Number of distinct usernames available for matching.
len(people_un)

3869

### Process Data

In [1367]:
def procStatus(pass_fail, comp_status, att_status):
    """Translate the three source status fields into a numeric status code.

    Returns 200 when
      * ``pass_fail`` is "Pass" and ``comp_status`` is not "Not Completed", or
      * ``pass_fail`` is missing and ``comp_status`` is "Completed", or
      * ``pass_fail`` and ``comp_status`` are both missing and ``att_status``
        is "Attended".
    Every other combination (including an explicit "Fail") returns 400.
    """
    ok_code, fail_code = 200, 400

    if pass_fail == "Pass":
        # A recorded pass only counts when the completion is not flagged incomplete.
        return fail_code if comp_status == "Not Completed" else ok_code

    if pd.isnull(pass_fail):
        # No pass/fail recorded: fall back to completion, then attendance.
        if comp_status == "Completed":
            return ok_code
        if pd.isnull(comp_status) and att_status == "Attended":
            return ok_code

    return fail_code

def checkPerson(unList, person):
    """Return True when ``str(person)`` is a member of ``unList``, else False.

    ``unList`` is any container supporting ``in``; the lookup key is always the
    string form of ``person``.
    """
    return str(person) in unList

def checkCourse(courseList, courseID, version):
    """Look up a course in the catalogue and fill in a missing version.

    Parameters
    ----------
    courseList : pandas.DataFrame
        Catalogue with the columns "Course ID" and "Version".
    courseID : str
        Course identifier; compared for exact equality with "Course ID".
    version
        Version from the source record; may be missing (NaN/None).

    Returns
    -------
    tuple
        ``(is_in, replace_version)``. ``is_in`` is True when at least one
        catalogue row has this course ID. ``replace_version`` is ``version``
        unchanged unless it is missing and the course was found, in which case
        it is the "Version" of the last matching catalogue row.
    """
    # One vectorised comparison instead of a Python-level iterrows() scan of
    # the whole catalogue for every transcript row.
    matching_versions = courseList.loc[courseList["Course ID"] == courseID, "Version"]
    is_in = not matching_versions.empty

    replace_version = version
    if is_in and pd.isnull(version):
        # The original loop kept overwriting, so the last match wins.
        replace_version = matching_versions.iloc[-1]
    return is_in, replace_version


In [1368]:
# Helpers

class TranscriptRow:
    """Plain record holding one processed transcript entry.

    Stores the values passed to the constructor as attributes; the only
    transformation is that ``courseID`` is upper-cased.
    """

    def __init__(self, username, courseID, courseVersion, compDate, status, courseTitle, inCourse, inPerson):
        """Store the given field values on the instance.

        ``courseID`` must provide ``.upper()`` (i.e. be a string); the other
        arguments are stored as given.
        """
        self.username = username            # learner identifier
        self.courseID = courseID.upper()    # normalised to upper case
        self.courseVersion = courseVersion
        self.compDate = compDate
        self.status = status
        self.courseTitle = courseTitle
        self.inCourse = inCourse            # flag read by write_row for CLEAN/MISSING filtering
        self.inPerson = inPerson            # flag read by write_row for CLEAN/MISSING filtering

In [1369]:
# Process Data
# Keep only the columns the loop below reads.
source_used = source[[USERNAME, COURSEID, COURSEVERSION, COMPDATE, PASSFAIL, STATUS, ATTSTATUS, COURSETITLE]]

# One TranscriptRow per source record, in source order.
transcripts = []

with alive_bar(source_used.shape[0], force_tty=True) as bar_read:
    for index, trans in source_used.iterrows():
        username = trans[USERNAME]
        # NOTE(review): .upper() assumes every course ID is a string - a missing value would raise here.
        courseID = trans[COURSEID].upper()
        course_ver = trans[COURSEVERSION]
        # Format the completion date as YYYY-MM-DD text.
        # NOTE(review): assumes the column holds datetime-like values - confirm for rows without a date.
        comp_date = trans[COMPDATE].strftime('%Y-%m-%d')
        status = procStatus(trans[PASSFAIL], trans[STATUS], trans[ATTSTATUS])
        course_title = trans[COURSETITLE]
        # Look the course up in the catalogue; rep_ver carries the catalogue
        # version when the source record has none.
        in_course, rep_ver = checkCourse(course_id, courseID, course_ver)
        in_pers = checkPerson(people_un, username)

        #Update Version
        course_ver = rep_ver

        trans_entry = TranscriptRow(username, courseID, course_ver, comp_date, status, course_title, in_course, in_pers)
        transcripts.append(trans_entry)
        #print(courseID, type(courseID), courseID in ids_only.values)
        # Advance the progress bar by one record.
        bar_read()
print(len(transcripts))

|████████████████████████████████████████| 1000/1000 [100%] in 1:06.5 (15.05/s)                                         : 1:23) ▃▁▃ 22/1000 [2%] in 2s (12.8/s, eta: 1:16)  ▅▃▁ 43/1000 [4%] in 3s (13.4/s, eta: 1:11)  ▂▂▄ 77/1000 [8%] in 5s (14.2/s, eta: 1:05)  ▆▄▂ 95/1000 [10%] in 7s (14.3/s, eta: 1:03) 140/1000 [14%] in 10s (14.5/s, eta: 59s)                                 (14.5/s, eta: 58s)  ▃▁▃ 180/1000 [18%] in 12s (14.4/s, eta: 57s)  216/1000 [22%] in 15s (14.1/s, eta: 56s) (14.1/s, eta: 55s) ▆▄▂ 240/1000 [24%] in 18s (13.6/s, eta: 56s)  ▃▁▃ 268/1000 [27%] in 20s (13.5/s, eta: 54s)  ▂▂▄ 271/1000 [27%] in 20s (13.5/s, eta: 54s)  ▄▆█ 329/1000 [33%] in 24s (13.7/s, eta: 49s)  391/1000 [39%] in 28s (13.9/s, eta: 44s)  █▆▄ 419/1000 [42%] in 30s (14.0/s, eta: 42s)  ▄▂▂ 486/1000 [49%] in 34s (14.4/s, eta: 36s) ▄▂▂ 567/1000 [57%] in 39s (14.7/s, eta: 30s)  ▃▁▃ 598/1000 [60%] in 41s (14.8/s, eta: 27s)  ▂▄▆ 635/1000 [64%] in 43s (14.9/s, eta: 25s)  746/1000 [75%] in 49s (15.2/s, eta: 17s) ▄

### Print to Output

In [1370]:
def write_row(df_output, trans, test=False, subset="ALL"):
    """Return ``df_output`` with one transcript row appended, if it qualifies.

    Parameters
    ----------
    df_output : pandas.DataFrame
        Frame accumulated so far. It is not modified; a new frame is returned
        whenever a row is added.
    trans
        Object exposing ``username``, ``courseID``, ``courseVersion``,
        ``compDate``, ``status``, ``courseTitle``, ``inCourse`` and
        ``inPerson`` (e.g. a ``TranscriptRow``).
    test : bool, default False
        When True the row also carries ``IN_COURSE`` and ``IN_PERSON``.
    subset : str, default "ALL"
        "ALL" adds every row; "CLEAN" adds the row only when both the course
        and the person were found; "MISSING" adds the row only when either is
        missing, plus ``COURSE_GOOD`` / ``PERSON_GOOD`` flags. Any other value
        adds nothing.

    Returns
    -------
    pandas.DataFrame
        ``df_output`` itself when nothing was added, otherwise a new frame
        with the row appended and the index renumbered from 0.
    """
    inCourse = trans.inCourse
    inPerson = trans.inPerson

    tmp_dict = {
        "LEARNER": trans.username,
        "COURSE": trans.courseID,
        "COURSE_VERSION": trans.courseVersion,
        "COMPLETION_DATE": trans.compDate,
        "COMPLETION_STATUS": trans.status,
        "CREATE_ADHOC_COURSE": "FALSE",
        "COURSE_TITLE": trans.courseTitle
    }
    if test == True:
        # Test mode only adds two diagnostic columns to the standard record.
        tmp_dict["IN_COURSE"] = trans.inCourse
        tmp_dict["IN_PERSON"] = trans.inPerson

    if subset == "ALL":
        keep = True
    elif subset == "CLEAN":
        keep = (inCourse == 1) and (inPerson == 1)
    elif subset == "MISSING":
        keep = (inCourse == 0) or (inPerson == 0)
        if keep:
            tmp_dict["COURSE_GOOD"] = bool(inCourse)
            tmp_dict["PERSON_GOOD"] = bool(inPerson)
    else:
        keep = False

    if not keep:
        return df_output

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and
    # concatenate instead.
    new_row = pd.DataFrame([tmp_dict])
    if len(df_output) == 0:
        # Concatenating onto a row-less frame triggers a deprecation warning in
        # recent pandas, so start from the new row and keep the declared
        # column order (any extra columns follow).
        extra_cols = [col for col in new_row.columns if col not in df_output.columns]
        return new_row.reindex(columns=list(df_output.columns) + extra_cols)
    return pd.concat([df_output, new_row], ignore_index=True)
    

In [1371]:
# Output Values
# Column names, in order, used to initialise the output frames. They match
# the keys write_row puts in every record.
out_header = [
    "LEARNER",
    "COURSE",
    "COURSE_VERSION",
    "COMPLETION_DATE",
    "COMPLETION_STATUS",
    "CREATE_ADHOC_COURSE",
    "COURSE_TITLE"
]

In [1372]:
# Collect the "CLEAN" transcripts: rows whose course and learner were both found.
out = pd.DataFrame(columns = out_header)

with alive_bar(len(transcripts), force_tty=True) as bar:
    for ts in transcripts:
        # write_row returns the frame with the row appended when it qualifies.
        out = write_row(out, ts, False, "CLEAN")
        bar()
# Row number within the output, stored as an explicit column.
out["ID"] = out.index
# Show ten random rows, transposed so each record reads top to bottom.
# NOTE(review): sample(10) needs at least ten rows in `out`.
out.sample(10).T

|████████████████████████████████████████| 1000/1000 [100%] in 2.0s (503.33/s)                                          a: 3s)  ▆█▆ 888/1000 [89%] in 2s (506.4/s, eta: 0s) 


Unnamed: 0,324,190,1,255,74,588,107,477,182,341
LEARNER,10490,10490,10034,10404,10172,10681,10216,10730,10490,10664
COURSE,WL-168,WL-307,EMP-1036,WL-315,WL-318,FIRE-3001,WL-307,TD-2000-WL,ERM-1002,TD-2000-WL
COURSE_VERSION,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
COMPLETION_DATE,2021-06-10,2021-04-20,2022-03-08,2021-03-16,2021-03-23,2021-02-15,2021-03-11,2021-09-14,2021-06-22,2021-08-23
COMPLETION_STATUS,200,200,200,200,200,400,400,200,200,200
CREATE_ADHOC_COURSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
COURSE_TITLE,Waste Management & Packaging Fundamentals Comb...,Job Safety Analysis,Emergency Procedure Refresher,Respectful Work Place,Return to Work Radiation Protection Training f...,Fire Extinguisher - Practical,Job Safety Analysis,GET General Employee Training,Land Use Program (LUP) Awareness,GET General Employee Training
ID,324,190,1,255,74,588,107,477,182,341


In [1373]:
# Report the (rows, columns) of the clean output and write it to CSV without the index.
print(out.shape)
out.to_csv("out_transcript.csv", index=False)

(617, 8)


### Write Failures

In [1374]:
# Collect the "MISSING" transcripts: rows whose course or learner was not found.
# write_row adds COURSE_GOOD / PERSON_GOOD flags to these rows.
out_err = pd.DataFrame(columns = out_header)

with alive_bar(len(transcripts), force_tty=True) as bar:
    for ts in transcripts:
        out_err = write_row(out_err, ts, False, "MISSING")
        bar()
# Row number within the error output, stored as an explicit column.
out_err["ID"] = out_err.index
# Preview the first ten rows, transposed so each record reads top to bottom.
out_err.head(10).T

|████████████████████████████████████████| 1000/1000 [100%] in 3.4s (294.17/s)                                          (316.5/s, eta: 2s)  ▆▄▂ 573/1000 [57%] in 2s (256.4/s, eta: 2s) ▆█▆ 899/1000 [90%] in 3s (288.8/s, eta: 1s) 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
LEARNER,10083,10083,10083,10153,10153,10172,10172,10083,10083,10083
COURSE,SL-9002-,WL-202,900-510000-MCP-015 (REV 1),EMP-1037-ONLINE (REV 0),SECU-2008-ONLINE (REV 1.0),LGL-1003-ONLINE (REV 2.5),OSH-9045-ONLINE (REV 1),LGL-1003-ONLINE (REV 2.5),OSH-9045-ONLINE (REV 1),SL-9002
COURSE_VERSION,,,1.0,0.0,1.0,2.5,1.0,2.5,1.0,
COMPLETION_DATE,2021-10-21,2021-11-18,2021-03-29,2022-01-12,2021-04-01,2021-05-25,2021-07-06,2021-10-12,2021-09-22,2021-10-21
COMPLETION_STATUS,200,200,200,200,200,200,200,200,200,200
CREATE_ADHOC_COURSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
COURSE_TITLE,Nuclear Safety & Control Regulations and Act C...,Respirator Fit Test,Harassment and Violance Free Workplace (1.0),Emergency Procedure Refresher (0.0),Security Awareness Refresher (1.0),Values and Ethics at CNL (2.5),Cost Course - Hand Safety (1.0),Values and Ethics at CNL (2.5),Cost Course - Hand Safety (1.0),COST COURSE $1867/Person Nuclear Safety & Cont...
COURSE_GOOD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PERSON_GOOD,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ID,0,1,2,3,4,5,6,7,8,9


In [1375]:
# Report the (rows, columns) of the error output and write it to CSV without the index.
print(out_err.shape)
out_err.to_csv("out_errors.csv", index=False)

(383, 10)


## Needed Courses and People

In [1376]:
# Split the error rows by cause: course not found vs. learner not found.
# NOTE(review): COURSE_GOOD / PERSON_GOOD only exist once write_row has added
# at least one "MISSING" row; with no such rows these lookups would raise KeyError.
course_need = out_err[out_err["COURSE_GOOD"] == False]
person_need = out_err[out_err["PERSON_GOOD"] == False]

# Write the distinct missing course IDs and learner IDs, one per row
# (the default index column is included in the files).
pd.Series(course_need["COURSE"].unique(), name="Missing Courses").to_csv("out_courses.csv")
pd.Series(person_need["LEARNER"].unique(), name="Missing Learners").to_csv("out_people.csv")