## PLO Mapper

## Installing necessary libraries

In [None]:
# %%bash

# # Dependencies
# pip install docx2txt
# pip install strsimpy
# pip install python-docx
# pip install pandas

# if ls docx2csv >/dev/null 2>&1; then
#     echo "docx2csv exists."
# else
#     echo "Folder does not exist. Cloning docx2csv."
#     git clone https://github.com/ivbeg/docx2csv.git
# fi

In [None]:
# %%bash

# source .env
# cd docx2csv && echo "$PASSWORD" | sudo -S python3 setup.py install

In [None]:
# ----- TEST DATA INPUT -----
# Exactly one dataset (CURRENT_MAPPING / ORIGINAL_MAPPING / PO_LABEL) should be
# uncommented at a time; the others are kept for reference.
#   CURRENT_MAPPING  - Word document; its first table is read as the PLO/PO list
#                      and its second table as the CLO list.
#   ORIGINAL_MAPPING - CSV holding the human-generated mapping that the automatic
#                      mapping is compared against.
#   PO_LABEL         - prefix of the PO identifiers; columns/rows are addressed as
#                      PO_LABEL + "1", PO_LABEL + "2", ...

# Computer Science Test Data.
# CURRENT_MAPPING="Lists_ComputerScience.docx"
# ORIGINAL_MAPPING="Original-Mapping-ComputerScience.csv"
# PO_LABEL="P"

# InformationSecurity Test Data.
# CURRENT_MAPPING="Lists_InformationSecurity.docx"
# ORIGINAL_MAPPING="Original-Mapping-InfoSecurity.csv"
# PO_LABEL="P"
# NOTE: ADJUSTMENT_THRESHOLD is NOT part of the InformationSecurity dataset above;
# it is active for whichever dataset is selected. It is the offset added to a
# CLO->PO similarity coefficient when the mapped taxonomy levels match, and
# subtracted when they differ (the result is kept within [0, 1]).
ADJUSTMENT_THRESHOLD=0.35

# Monash Engineering Test Data.
CURRENT_MAPPING="Lists_MonashEngineering.docx"
ORIGINAL_MAPPING="Original-Mapping-MonashEngineering.csv"
PO_LABEL="PO"

# Program Outcome & Regulatory Body Requirements Table.
# Word document read by regulatory_requirement_mapping().
PROGRAM_OUTCOME_TABLE="Professional Body PO Comparisons.docx"

In [None]:
# importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# extract tables from word document
from docx2csv import extract_tables, extract
tables = extract_tables(CURRENT_MAPPING)

In [None]:
from docx import Document
document = Document(CURRENT_MAPPING)

In [None]:
def read_docx_table(document, table_num):
    """Return one table of a Word document as a DataFrame of cell texts.

    Parameters
    ----------
    document : object exposing a ``tables`` sequence (e.g. ``docx.Document``)
        Each table must expose ``rows``, each row ``cells``, each cell ``text``.
    table_num : int
        1-based position of the table inside ``document.tables``.

    Returns
    -------
    pandas.DataFrame
        One row per table row and one column per cell, with default integer
        row and column labels.

    Raises
    ------
    IndexError
        If ``table_num`` is not between 1 and the number of tables. Without this
        check, 0 or a negative number would silently pick a table counted from
        the end of the document.
    """
    tables = document.tables
    if not 1 <= table_num <= len(tables):
        raise IndexError(
            f"table_num must be between 1 and {len(tables)}, got {table_num}"
        )
    table = tables[table_num - 1]
    data = [[cell.text for cell in row.cells] for row in table.rows]
    return pd.DataFrame(data)

## PLO TABLE

In [None]:
# Creating a dataframe for PLOs and it will accept 'n' number of PLOs
table_num=1
df = read_docx_table(document,table_num)
df.head(n=12)

In [None]:
q1 = df.copy()
df_po = df.copy()

In [None]:
# assigning count vectorizer
count_vectorizer = CountVectorizer(stop_words='english', min_df=0.005)

In [None]:
# Remove integers

# Data preprocessing for PLO dataframe
q1[1] = q1[1].str.lower()
corpus = q1[1].tolist()
corpii = count_vectorizer.fit_transform(corpus)
corpus

In [None]:
corpii

In [None]:
# extracting features names from PLO table
feature_names = count_vectorizer.get_feature_names_out()
feature_names

In [None]:
len(feature_names)

In [None]:
# Converting features to vector form and create a dataframe
X1 = pd.DataFrame(corpii.toarray(), columns=feature_names)

## CLO TABLE

In [None]:
# Creating a dataframe for CLOs and it will accept 'n' number of CLOs
table_num=2
df1 = read_docx_table(document,table_num)
p1 = df1.copy()
df_clo = df1.copy()

In [None]:
# Data preprocessing for CLO dataframe
p1[1] = p1[1].str.lower()
corpus11 = p1[1].tolist()
corpii11 = count_vectorizer.fit_transform(corpus11)

In [None]:
# extracting features names from CLO table
feature_names1 = count_vectorizer.get_feature_names_out()

In [None]:
len(feature_names1)

In [None]:
# Converting features to vector form and create a dataframe
X2 = pd.DataFrame(corpii11.toarray(), columns=feature_names1)
X2

In [None]:
X2.head()

In [None]:
# adding column index to the CLO table
U2 = pd.concat([df1[0], X2], axis=1)
U2.set_index(0, inplace=True)

In [None]:
U2.head()

In [None]:
# adding column index to the PLO table
U1 = pd.concat([df[0], X1], axis=1)
U1.set_index(0, inplace=True)

In [None]:
U1

## Intersection method for both CLOs and PLOs

### Generalised list of words

In [None]:
# Generalised list of action verbs, lower-cased, that is appended to the PLO
# vocabulary before it is intersected with the CLO vocabulary.
# Fix: 'Identify' and 'Index' were missing a separating comma, so Python's implicit
# string concatenation turned them into the single bogus entry 'identifyindex'.
append_words = list(map(str.lower,['Cite', 'Define', 'Describe', 'Draw', 'Enumerate', 'Identify', 'Index', 'Indicate', 'Label', 'List', 'Match', 'Meet', 'Name', 'Outline', 'Point', 'Quote', 'Read', 'Recall', 'Recite', 'Recognize', 'Record', 'Repeat', 'Reproduce','Review',
'Select', 'State', 'Study', 'Tabulate', 'Trace', 'Write', 'Add', 'Approximate', 'Articulate', 'Associate', 'Characterize', 'Clarify', 'Classify', 'Compare', 'Compute', 'Contrast', 'Convert', 'Defend', 'Detail', 'Differentiate',
'Discuss', 'Distinguish', 'Elaborate', 'Estimate', 'Example', 'Explain', 'Express', 'Extend', 'Extrapolate', 'Factor', 'Generalize', 'Give', 'Infer', 'Interact', 'Interpolate', 'Interpret', 'Observe', 'Paraphrase', 'Picture graphically',
'Predict', 'Rewrite', 'Subtract', 'Summarize', 'Translate', 'Visualize', 'Acquire', 'Adapt', 'Allocate', 'Alphabetize', 'Apply', 'Ascertain', 'Assign', 'Attain', 'Avoid', 'Back up', 'Calculate', 'Capture', 'Change', 'Complete', 'Construct',
'Customize', 'Demonstrate', 'Depreciate', 'Derive', 'Determine', 'Diminish', 'Discover', 'Employ', 'Examine', 'Exercise', 'Explore', 'Expose', 'Figure', 'Graph', 'Handle', 'Illustrate', 'Interconvert', 'Investigate', 'Manipulate', 'Modify',
'Operate', 'Personalize', 'Plot','Practice', 'Prepare', 'Price', 'Process', 'Produce', 'Project', 'Provide', 'Relate', 'Round off', 'Sequence', 'Show', 'Simulate', 'Sketch', 'Solve', 'Subscribe', 'Transcribe', 'Use', 'Analyze', 'Audit',
'Blueprint', 'Breadboard', 'Break down', 'Confirm', 'Correlate', 'Detect', 'Diagnose', 'Diagram', 'Discriminate', 'Dissect', 'Document', 'Ensure', 'Figure out', 'File', 'Group', 'Interrupt', 'Inventory', 'Layout', 'Manage', 'Maximize',
'Minimize', 'Optimize', 'Order', 'Point out', 'Prioritize', 'Proofread', 'Query', 'Separate', 'Subdivide', 'Train', 'Transform', 'Appraise', 'Assess', 'Conclude', 'Counsel', 'Criticize', 'Critique', 'Evaluate', 'Grade', 'Hire', 'Judge',
'Justify', 'Measure', 'Prescribe', 'Rank', 'Rate', 'Recommend', 'Release', 'Support', 'Test', 'Validate', 'Verify', 'Abstract', 'Animate', 'Arrange', 'Assemble', 'Budget', 'Categorize', 'Code', 'Combine', 'Compile', 'Compose', 'Cope',
'Correspond', 'Create', 'Cultivate', 'Debug', 'Depict', 'Design', 'Develop', 'Devise', 'Dictate', 'Enhance', 'Facilitate', 'Format', 'Formulate', 'Generate', 'Import', 'Improve', 'Incorporate', 'Integrate', 'Interface', 'Join', 'Lecture',
'Model', 'Network', 'Organize', 'Overhaul', 'Plan', 'Portray', 'Program', 'Rearrange', 'Reconstruct', 'Reorganize', 'Revise', 'Specify']))

In [None]:
# using + operator to concat the generalised list of words to the PLO list
train_column = list(feature_names) + append_words

In [None]:
# CLO list of words
test_column = feature_names1
test_column

In [None]:
# Intersection method for extracting common column names from the tables (both CLO AND PLO)
# comparing whether the CLO column name is present in the PLO column names or not
train_column = list(feature_names) + append_words # (PLO table ) (# using + operator to concat PLO words and list of generalized words)
test_column = list(feature_names1)   # (CLO table)

In [None]:
# This is the column names from both the tables (using intersection)
common_column = list(set(train_column).intersection(set(test_column)))
common_column

In [None]:
print(common_column)

In [None]:
len(common_column)

In [None]:
# Filter the common column values from the CLO table
U3 = U2.filter(list(common_column), axis=1)

In [None]:
U3.head()

In [None]:
# One single-row dataframe per PLO, selected from U1 by its label
# (PO_LABEL followed by the 1-based PLO number).
Cs = [U1.loc[[PO_LABEL + str(plo_number)]] for plo_number in range(1, len(df) + 1)]
# U1

In [None]:
# Stack each single-row PLO frame on top of the filtered CLO frame (U3);
# sort=True orders the union of their columns.
Dds = [pd.concat([Cs[plo_pos], U3], sort=True) for plo_pos in range(len(df))]

In [None]:
# Words present in only one of the two stacked frames show up as NaN; treat them as a count of 0.
Ds = [Dds[plo_pos].fillna(0) for plo_pos in range(len(df))]

## Calculate Cosine similarity

In [None]:
# Calculate cosine similarity for concatenated dataframes and create a new dataframe
for x in range(len(df)):
    Dds[x] = pd.DataFrame(cosine_similarity(Ds[x], dense_output=True))
Ds

In [None]:
# Extract the '0'th column because it has the CLO-PLO  cosine similarity values. We are neglecting the remaining ones.
# Renaming the '0'th column name to 'Pn' ['P1, P2, P3, P4, ... 'Pn']
for x in range(len(df)):
    Dds[x].rename(columns = {0 :PO_LABEL+str(x+1)}, inplace = True)

Dds

In [None]:
# Concatenating each  '0'th column from different cosine similarity dataframes
Ddn = []
for x in range(len(df)):
    Ddn.append(Dds[x][PO_LABEL+str(x+1)])

d = pd.concat(Ddn, axis=1)
d

In [None]:
# '0'th column gives us 1 which means each PLO map with own PLO.
# So we are removing that column.
dd = d[1:]
dd

In [None]:
# Renumber the rows from 0 and discard the old index.
# dd is a slice of d, so build a fresh frame instead of mutating it in place
# (reset_index(drop=True) replaces the former reset_index + drop(['index']) pair).
dd = dd.reset_index(drop=True)
dd

In [None]:
# print the matrix
print(dd)

In [None]:
## This code loads the vector file into the word_vectors variable
## Download the vector file from https://fasttext.cc/docs/en/english-vectors.html (first file on the website), unzip the file and store in your local development folder
## Note: This piece of code may take upto an hour or two to run depending on your pc specs.
## My i5 8th gen with 8gig ram took 58mins to run.

# from gensim.models import KeyedVectors

# # Path to the downloaded .vec file
# path_to_vectors = 'wiki-news-300d-1M.vec'
# # path_to_vectors = 'wiki.en.vec'
# # Load the word vectors
# word_vectors = KeyedVectors.load_word2vec_format(path_to_vectors)

# # Find similar words
# similar_words = word_vectors.most_similar('cat')

# # Calculate word similarity
# similarity = word_vectors.similarity('cat', 'dog')

# # Perform vector arithmetic
# result = word_vectors['king'] - word_vectors['man'] + word_vectors['woman']


### LO Mapper Setup:
Setup section for the LO mapper.

In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import pandas as pd
import csv
import math

# import pprint
import re
import json

import spacy

# Initialises dictionary containing American to UK spelling translations
american_to_british_dict = {}
american_to_british_path = "American-British-English-Translator.json"
with open(american_to_british_path, "r") as file:
    data = file.read()
american_to_british_dict = json.loads(data)


def britishise(sentence):
    """
    Convert words in a sentence to UK spelling to ensure consistency

    Input:
        sentence: A list of strings (words). The list is modified in place.
                  A plain string is returned unchanged: strings are immutable
                  and are not split into words here.

    Output:
        sentence: The same list, where every word whose lower-cased form is a
                  key of american_to_british_dict has been replaced by the
                  mapped spelling. Other elements (including non-string ones)
                  are left untouched.
    """

    # Strings cannot be edited element-wise; keep the established pass-through
    # behaviour explicit instead of relying on a swallowed exception.
    if isinstance(sentence, str):
        return sentence

    for j, word in enumerate(sentence):
        if not isinstance(word, str):
            continue
        key = word.lower()
        # Explicit membership test replaces the former bare `except: pass`,
        # which also hid unrelated errors.
        if key in american_to_british_dict:
            sentence[j] = american_to_british_dict[key]

    return sentence

lemmatizer = WordNetLemmatizer()


def verb_classifier(verbs_file_path):
    """
    Takes an excel spreadsheet containing verbs, classifies the verbs and stores it into a Dataframe.

    Every sheet of the workbook is parsed and the sheets are concatenated, so the
    result is indexed by (sheet name, row number). Each non-empty cell is
    lower-cased, converted to UK spelling and lemmatised as a verb. A verb that
    has already been seen earlier in the workbook (row by row, left to right) is
    blanked out so that every verb is kept only once.

    Inputs:
        verbs_file_path: A string that contains the path to the excel spreadsheet to be read

    Outputs:
        domain_levels: A DataFrame which contains all the verbs from the spreadsheet classified
                       into their respective levels (the columns). Rows that end up
                       completely empty are dropped.
    """

    # The context manager closes the workbook handle once all sheets are parsed.
    with pd.ExcelFile(verbs_file_path, engine="openpyxl") as xlsx:
        dfs = {sheet_name: xlsx.parse(sheet_name) for sheet_name in xlsx.sheet_names}

    domain_levels = pd.concat(dfs)
    # A set gives constant-time "already mapped?" checks.
    seen_verbs = set()

    # Iterate over all values in the spreadsheet
    for i in range(domain_levels.shape[0]):
        for j in range(domain_levels.shape[1]):
            cell_value = domain_levels.iloc[i, j]
            if pd.isna(cell_value):
                continue

            # Format verbs (lower case, UK spelling, lemmatised format)
            verb_brit = britishise([cell_value.lower()])[0]
            verb = lemmatizer.lemmatize(verb_brit, pos="v")

            if verb in seen_verbs:
                # Already mapped earlier: blank out this duplicate.
                domain_levels.iloc[i, j] = float("nan")
            else:
                domain_levels.iloc[i, j] = verb
                seen_verbs.add(verb)

    return domain_levels.dropna(how="all")

def generate_verb_list(mapped_verbs):
    """
    Build a lookup table from (taxonomy, verb) to the level the verb belongs to.

    Inputs:
        mapped_verbs: A dictionary mapping a taxonomy name to a DataFrame whose
                      columns are levels and whose non-null cells are verbs

    Outputs:
        verb_list: A DataFrame with a single "Level" column, indexed by a
                   MultiIndex of (taxonomy, verb) pairs. Verbs are listed in
                   row-by-row, left-to-right order of each taxonomy frame. The
                   frame is empty when no verbs are present.
    """
    keys = []
    levels = []
    for taxonomy, frame in mapped_verbs.items():
        level_names = frame.columns.values
        for row in range(frame.shape[0]):
            for col in range(frame.shape[1]):
                verb = frame.iloc[row, col]
                if not pd.isna(verb):
                    keys.append((taxonomy, verb))
                    levels.append(level_names[col])

    if keys:
        index = pd.MultiIndex.from_tuples(keys)
    else:
        # from_tuples cannot infer the number of levels from an empty list.
        index = pd.MultiIndex.from_arrays([[], []])

    verb_list = pd.DataFrame(levels, index=index, columns=["Level"])
    return verb_list

# Load the English language model in spaCy
# nlp = spacy.load('en_core_web_sm')
nlp = spacy.load("en_core_web_trf", exclude=["ner"])


## Function to identify verbs in a sentence
def identify_verbs(sentence):
    """
    Find the verbs of a sentence and return them in lemmatised (base) form.

    Inputs:
        sentence: A string

    Outputs:
        verbs: A list of strings, one lemma per token that spaCy tags as a VERB,
               in sentence order
    """

    verbs = []
    # Run the sentence through the spaCy pipeline and keep verb tokens only.
    for token in nlp(sentence):
        if token.pos_ == "VERB":
            verbs.append(token.lemma_)

    return verbs


def extract_columns(file_path, columns):
    """
    Read selected columns of a CSV file.

    Inputs:
        file_path: Path of the CSV file; its first row is taken as the header
        columns: A list of header names to extract

    Outputs:
        extracted_data: A dictionary mapping each requested column name to the
                        list of its values (strings), in file order. Completely
                        blank rows are skipped.

    Raises:
        ValueError: If the file has no header row or a requested column is missing
    """
    # newline="" lets the csv module handle line endings itself, so quoted
    # fields containing line breaks are parsed correctly.
    with open(file_path, "r", newline="") as file:
        reader = csv.reader(file)
        headers = next(reader, None)  # Read the headers
        if headers is None:
            raise ValueError("The CSV file is empty; no header row found.")

        # Check if all specified columns exist in the CSV file
        for column in columns:
            if column not in headers:
                raise ValueError(f"Column '{column}' not found in the CSV file.")

        # Resolve each column position once instead of once per row
        column_indexes = {column: headers.index(column) for column in columns}

        # Initialize separate arrays for each column
        extracted_data = {column: [] for column in columns}

        # Extract data from specified columns
        for row in reader:
            if not row:
                continue
            for column in columns:
                extracted_data[column].append(row[column_indexes[column]])

    return extracted_data


# Paths of the Excel workbooks that list the verbs of each taxonomy.
solo_file_path = "SOLO.xlsx"
bloom_cognitive_file_path = "Bloom_cognitive.xlsx"
bloom_psychomotor_file_path = "Bloom_psychomotor.xlsx"
bloom_affective_file_path = "Bloom_affective.xlsx"
# Verbs: one DataFrame of normalised verbs per taxonomy (columns are the levels).
# NOTE(review): verb_classifier lemmatises with NLTK's WordNetLemmatizer, but
# nltk.download("wordnet") only runs further down in this cell - on a machine
# without the corpus this call presumably fails; confirm and move the download up.
mapped_verbs = {
    "Cognitive": verb_classifier(bloom_cognitive_file_path),
    "Affective": verb_classifier(bloom_affective_file_path),
    "Psychomotor": verb_classifier(bloom_psychomotor_file_path),
    "SOLO": verb_classifier(solo_file_path),
}

# Reverse lookup: (taxonomy, verb) -> level.
verb_list = generate_verb_list(mapped_verbs)
print(verb_list)

# Thresholds to filter the similarity of words to improve accuracy
# suggested_sim_threshold: minimum similarity for a taxonomy verb to be recorded
#                          as a suggested verb.
# sim_threshold:           minimum similarity for a taxonomy verb to contribute
#                          to the score of its level.
suggested_sim_threshold = 0.985
sim_threshold = 0.997

# Manually produced mappings, used as the reference when accuracy is calculated.
csv_file = "Learning outcomes manual mapping - Mappings.csv"
# csv_file = 'Learning outcomes manual mapping - Mappings - Testing.csv'
columns_to_extract = ["LO", "Cognitive", "Affective", "Psychomotor", "SOLO"]

extracted_data = extract_columns(csv_file, columns_to_extract)

# NOTE: `sentences` is reassigned later in this cell (to the sentences read from
# the Word document), so this value is not used afterwards.
sentences = extracted_data["LO"]
# Manual level of every learning outcome, per taxonomy.
final_levels = {
    "Cognitive": extracted_data["Cognitive"],
    "Affective": extracted_data["Affective"],
    "Psychomotor": extracted_data["Psychomotor"],
    "SOLO": extracted_data["SOLO"],
}

# extract tables from word document
from docx2csv import extract_tables
from docx import Document
tables = extract_tables(CURRENT_MAPPING)
document = Document(CURRENT_MAPPING)

def read_docx_table(document, table_num):
    """Return one table of a Word document as a DataFrame of cell texts.

    Parameters
    ----------
    document : object exposing a ``tables`` sequence (e.g. ``docx.Document``)
        Each table must expose ``rows``, each row ``cells``, each cell ``text``.
    table_num : int
        1-based position of the table inside ``document.tables``.

    Returns
    -------
    pandas.DataFrame
        One row per table row and one column per cell, with default integer
        row and column labels.

    Raises
    ------
    IndexError
        If ``table_num`` is not between 1 and the number of tables. Without this
        check, 0 or a negative number would silently pick a table counted from
        the end of the document.
    """
    tables = document.tables
    if not 1 <= table_num <= len(tables):
        raise IndexError(
            f"table_num must be between 1 and {len(tables)}, got {table_num}"
        )
    table = tables[table_num - 1]
    data = [[cell.text for cell in row.cells] for row in table.rows]
    return pd.DataFrame(data)

# Training corpus for the word-embedding model: the text column (column 1) of the
# PO table followed by the text column of the CLO table, with line breaks removed.
training_sentences_4 = []

# Table 1 of the document: program outcomes.
table_num=1
df_po = read_docx_table(document,table_num)
df_po.head(n=12)  # has no visible effect here: not the last expression of the cell
for i in range(len(df_po[1])):
  training_sentences_4.append(df_po[1][i].replace("\n",""))

# Table 2 of the document: course/unit learning outcomes.
table_num=2
df1 = read_docx_table(document,table_num)
df_clo = df1.copy()
for i in range(len(df_clo[1])):
  training_sentences_4.append(df_clo[1][i].replace("\n",""))

sentences = training_sentences_4
# print(sentences)

# NLTK resources: the stop-word list is used below; "wordnet" backs the
# WordNetLemmatizer created earlier in this cell.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))

# Array of all the PLOs and ULOs (We can couple them together as we're trying to identify Bloom/Solo level here)
lo_sentence_array = []

# training_sentences = training_sentences_1
# training_sentences = training_sentences_2
# training_sentences = training_sentences_3
training_sentences = training_sentences_4

# TODO: train CLO classification with all data instead of just one course.
for sentence in training_sentences:
    # Strip punctuation (everything that is neither a word character nor whitespace).
    sentence = re.sub(r"[^\w\s]", "", sentence)
    # Process the sentence using spaCy
    doc = nlp(sentence)

    # Keep the lemma of every token that is not a stop word (all parts of speech,
    # not only verbs), then normalise the spelling to UK English.
    cleaned_tokens = [token.lemma_ for token in doc if token.lower_ not in stop_words]
    cleaned_tokens = britishise(cleaned_tokens)
    lo_sentence_array.append(cleaned_tokens)

# build the vocabulary and train the model
# IMPORTANT, NOTE THAT sg=1 flag specifies Word2Vec to use the Skip Gram Model as designated by the LSTM paper.
# NOTE(review): no `seed` is set and workers=4, so the trained vectors (and every
# similarity derived from them) presumably differ between runs - confirm.
model = Word2Vec(
    sentences=lo_sentence_array,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
    epochs=30,
)

# NOTE(review): verb_list is a DataFrame; iterating a DataFrame yields its column
# labels, so this appears to feed the single string "Level" (i.e. its characters)
# to build_vocab rather than the taxonomy verbs - confirm the intended corpus.
model.build_vocab(corpus_iterable=verb_list, update=True)
# NOTE(review): update_weights() is called directly after a build_vocab(update=True);
# confirm against the gensim documentation that this extra call is needed.
model.update_weights()

# Keep only the word vectors; they are what the similarity look-ups use.
model_1_wv = model.wv

print("Method 1")
word_vectors = model_1_wv

# print("\nMethod 2")
# word_vectors = model_2_wv
# learning_outcome_mapping(sentences, final_levels, sim_threshold, suggested_sim_threshold)

In [None]:
def export_dataframe(df, path, sheet_name, columns):
    """
    Write selected columns of a DataFrame to a single-sheet Excel workbook.

    Inputs:
        df: The DataFrame to export
        path: Destination of the workbook. When it is a filesystem path, missing
              parent directories are created first.
        sheet_name: Name of the worksheet to write
        columns: The columns of df to write
    """
    import os  # local import keeps this cell runnable on its own

    # A fresh checkout may not contain the output directory yet.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    with pd.ExcelWriter(path) as writer:
        df.to_excel(
            writer,
            sheet_name = sheet_name,
            columns = columns
        )

def calculate_accuracy(mappings, final_levels):
    """
    Compare automatic level mappings with the manual ones and report the pass rate.

    Prints the percentage of matching mappings per taxonomy and overall (rounded
    up), then exports all mappings to "./outputs/all_lo_mappings.xlsx" and the
    failed ones (with an extra "Manual Level" column) to
    "./outputs/failed_lo_mappings.xlsx".

    Inputs:
        mappings: DataFrame with a 0..n-1 row index and the columns
                  "Chosen Taxonomy" and "Mapped Level". If an "Index" column is
                  present it is taken as the position of the learning outcome in
                  the final_levels lists; otherwise the row number is used.
        final_levels: Dictionary mapping each taxonomy to the list of manual
                      levels, one entry per learning outcome
    """
    taxonomies = ("Cognitive", "Affective", "Psychomotor", "SOLO")
    passed_mappings = dict.fromkeys(taxonomies, 0)
    failed_mappings = dict.fromkeys(taxonomies, 0)
    # Index.append returns a new Index; the result has to be kept, otherwise the
    # "Manual Level" column is silently missing from the template.
    columns = mappings.columns.append(pd.Index(data=["Manual Level"]))
    failed_cases = pd.DataFrame(columns=columns)

    # mappings can hold several rows per learning outcome (one per taxonomy), so
    # the row number is not the learning-outcome position when "Index" exists.
    has_lo_index = "Index" in mappings.columns

    # Calculate Accuracy
    total_passed = 0
    total_failed = 0
    for i in range(mappings.shape[0]):
        taxonomy = mappings.at[i, "Chosen Taxonomy"]
        lo_index = i
        if has_lo_index and not pd.isna(mappings.at[i, "Index"]):
            lo_index = int(mappings.at[i, "Index"])
        manual_level = final_levels[taxonomy][lo_index] if taxonomy else None

        if taxonomy and mappings.at[i, "Mapped Level"] == manual_level:
            passed_mappings[taxonomy] += 1
            total_passed += 1
        else:
            total_failed += 1
            failed_cases.loc[total_failed] = mappings.loc[i].copy()
            if taxonomy:
                failed_mappings[taxonomy] += 1
                failed_cases.at[total_failed, "Manual Level"] = manual_level

    for taxonomy in passed_mappings.keys():
        attempted = passed_mappings[taxonomy] + failed_mappings[taxonomy]
        # Guard the division only; a taxonomy without failures must report 100 %,
        # not be penalised by a made-up failure.
        ratio = passed_mappings[taxonomy] / attempted if attempted else 0
        mapping_percentage = math.ceil(ratio * 100)
        print("Percentage of ", taxonomy, " mappings passed: ", mapping_percentage, "%")

    total_attempted = total_passed + total_failed
    total_ratio = total_passed / total_attempted if total_attempted else 0
    total_mapping_percentage = math.ceil(total_ratio * 100)
    print("Total percentage of mappings passed: ", total_mapping_percentage, "%")

    export_dataframe(mappings, "./outputs/all_lo_mappings.xlsx", "all_mappings", mappings.columns)
    export_dataframe(failed_cases, "./outputs/failed_lo_mappings.xlsx", "failed_mappings", failed_cases.columns)

### Executing Learning Outcome Mapping Function

In [None]:
def learning_outcome_mapping(
    sentences, final_levels, SIM_THRESHOLD, SUGGESTED_SIM_THRESHOLD, calculate_accuracy_flag
):
    """
    Map every learning outcome to a level of each taxonomy in mapped_verbs.

    For each sentence the verbs are identified, and every identified verb is
    compared (word-vector similarity) with every verb of every taxonomy level.
    Similarities of at least SIM_THRESHOLD are summed per level; the level with
    the highest positive sum is the mapped level for that taxonomy. One output
    row is produced per (sentence, taxonomy) pair that received a level.

    Input:
        sentences: An array of Learning Outcomes (sentences) in string format.
        final levels: An dictionary of arrays. The dictionary keys are the taxonomies and the arrays contain strings representing the final mapped level of the corresponding learning outcome. If LO is not mapped to that domain leave null value
        SIM_THRESHOLD: Minimum similarity for a taxonomy verb to add to the score of its level.
        SUGGESTED_SIM_THRESHOLD: Minimum similarity for a (different) taxonomy verb to be recorded as a suggestion.
        calculate_accuracy_flag: When truthy, calculate_accuracy(mappings, final_levels) is run before returning.

    Output:
        mappings: A DataFrame with the columns "Sentence", "Chosen Taxonomy",
                  "Mapped Level", "Verbs Identified" (list of (verb, level)
                  tuples), "Suggested Verbs" (up to five (verb, level) tuples,
                  most similar first) and "Index" (position of the sentence in
                  `sentences`).
    """

    mappings = pd.DataFrame(
        columns=[
            "Sentence",
            "Chosen Taxonomy",
            "Mapped Level",
            "Verbs Identified",
            "Suggested Verbs"
        ]
    )

    for i in range(len(sentences)):  # Iterates over the LOs
        # NOTE(review): britishise is documented as taking a list of words, but a
        # whole sentence string is passed here - confirm whether the sentence
        # should be tokenised first.
        sentence = britishise(sentences[i])

        identified_verbs = identify_verbs(sentence)
        similar_verbs = {}

        # Accumulated similarity per taxonomy level for this sentence.
        score_list = {
            "Cognitive": {
                "Remembering": 0,
                "Understanding": 0,
                "Applying": 0,
                "Analysing": 0,
                "Evaluating": 0,
                "Creating": 0,
            },
            "Affective": {
                "Receiving": 0,
                "Responding": 0,
                "Valuing": 0,
                "Organisation": 0,
                "Characterisation": 0,
            },
            "Psychomotor": {
                "Perception": 0,
                "Set": 0,
                "Guided Response": 0,
                "Mechanism": 0,
                "Complex Overt Response": 0,
                "Adaptation": 0,
                "Origination": 0,
            },
            "SOLO": {
                "Prestructural": 0,
                "Unistructural": 0,
                "Multistructural": 0,
                "Relational": 0,
                "Extended Abstract": 0,
            },
        }

        for taxonomy_key, taxonomy_item in mapped_verbs.items():
            similar_verbs[taxonomy_key] = pd.DataFrame(columns=["Level", "Similarity"])
            for identified_verb in identified_verbs:
                for k in range(taxonomy_item.shape[1]):  # Col (Level)
                    current_level = taxonomy_item.columns[k]
                    for j in range(taxonomy_item.shape[0]):  # Row
                        verb = taxonomy_item.iloc[j, k]

                        if verb is None or not verb or pd.isna(verb):
                            continue

                        similarity_score = 0
                        try:
                            sim_score = word_vectors.similarity(identified_verb, verb)
                        except KeyError:
                            # One of the two words (e.g. a multi-word phrase) is
                            # not in the embedding vocabulary: no contribution.
                            sim_score = None

                        if sim_score is not None:
                            if (
                                sim_score >= SUGGESTED_SIM_THRESHOLD
                                and identified_verb != verb
                            ):
                                similar_verbs[taxonomy_key].at[
                                    verb, "Level"
                                ] = current_level
                                similar_verbs[taxonomy_key].at[
                                    verb, "Similarity"
                                ] = sim_score
                            if sim_score >= SIM_THRESHOLD:
                                similarity_score += sim_score

                        score_list[taxonomy_key][
                            current_level
                        ] += similarity_score

        # Determine Taxonomy and Identify level based on similarity
        max_score = {
            "Cognitive": {"Level": None, "Score": 0},
            "Affective": {"Level": None, "Score": 0},
            "Psychomotor": {"Level": None, "Score": 0},
            "SOLO": {"Level": None, "Score": 0},
        }
        for t_key, t_item in score_list.items():
            # Strict "<" keeps the first level on ties and ignores zero scores.
            for l in t_item:
                if max_score[t_key]["Score"] < score_list[t_key][l]:
                    max_score[t_key] = {"Level": l, "Score": score_list[t_key][l]}

            if max_score[t_key]["Level"] is not None:
                # Generate output for sentence (new row at the end of mappings)
                x = mappings.shape[0]
                mappings.at[x, "Index"] = i
                mappings.at[x, "Sentence"] = sentence
                mappings.at[x, "Chosen Taxonomy"] = t_key
                mappings.at[x, "Mapped Level"] = max_score[t_key]["Level"]

                # Generate identified verb and level tuples for sentence data
                s_d_identified_verbs = []
                for verb in identified_verbs:
                    try:
                        level = verb_list.at[(t_key, verb), "Level"]
                    except KeyError:
                        # The verb is not listed for this taxonomy.
                        level = "Verb not mapped"
                    s_d_identified_verbs.append((verb, level))

                mappings.at[x, "Verbs Identified"] = s_d_identified_verbs

                # Generate suggested verbs
                sim_verbs = (
                    similar_verbs[t_key]
                    .sort_values(by=["Similarity"], ascending=False)
                    .head(5)
                )
                suggested_verbs = []
                if sim_verbs.shape[0] > 0:
                    suggested_verbs = [
                        (verb, sim_verbs.at[verb, "Level"]) for verb in sim_verbs.index
                    ]
                mappings.at[x, "Suggested Verbs"] = suggested_verbs

    if calculate_accuracy_flag:
        calculate_accuracy(mappings, final_levels)

    return mappings

# TODO: Classify the verbs in each of the learning outcomes
# NOTE(review): p1[1] holds the CLO text after it was lower-cased for the count
# vectorizer, whereas df[1] holds the PLO text in its original case. The two
# inputs are therefore cased differently when they reach spaCy - confirm this is
# intended (df1[1] would be the un-lowered CLO text).
ulo_sentences = p1[1].to_list()
plo_sentences = df[1].to_list()

# Thresholds to filter the similarity of words to improve accuracy
# (same values as in the setup cell; repeated so this cell can be tuned alone)
suggested_sim_threshold = 0.985
sim_threshold = 0.997

# Manually produced mappings, used as the reference when accuracy is calculated.
csv_file = "Learning outcomes manual mapping - Mappings.csv"
# csv_file = 'Learning outcomes manual mapping - Mappings - Testing.csv'
columns_to_extract = ["LO", "Cognitive", "Affective", "Psychomotor", "SOLO"]

extracted_data = extract_columns(csv_file, columns_to_extract)

# Manual level of every learning outcome, per taxonomy.
final_levels = {
    "Cognitive": extracted_data["Cognitive"],
    "Affective": extracted_data["Affective"],
    "Psychomotor": extracted_data["Psychomotor"],
    "SOLO": extracted_data["SOLO"],
}

# Only the ULO run reports accuracy against final_levels (last argument True);
# the PLO run just classifies.
# NOTE(review): final_levels comes from the manual-mapping CSV while the ULO
# sentences come from the Word document - confirm both list the same outcomes in
# the same order, otherwise the reported accuracy is not meaningful.
ulo_classifications = learning_outcome_mapping(ulo_sentences, final_levels, sim_threshold, suggested_sim_threshold, True)
plo_classifications = learning_outcome_mapping(plo_sentences, final_levels, sim_threshold, suggested_sim_threshold, False)

# Iteration 1: Assume both CLOs and POs are classifiable into Cognitive, Affective, Psychomotor Levels.
# If levels match, boost the CLO to PO coefficient by 0.1
# If levels different, don't boost

In [None]:
plo_classifications

### Regulatory Mapping using the LO Mapper
Uses the LO Mapper to factor in Program Outcome alignment for the curriculum mapper. Note we were not able to achieve any increase in accuracy through using the LO Mapper.

In [None]:

def regulatory_requirement_mapping(SIM_THRESHOLD, SUGGESTED_SIM_THRESHOLD):
    """
    Derive one level per program outcome from its regulatory requirement sentences.

    Reads the first table of PROGRAM_OUTCOME_TABLE. For each of the rows 1 to 12
    (PO1 to PO12) the cells of columns 1 to 3 are split on ";" into requirement
    sentences, which are classified with learning_outcome_mapping. The most
    frequent mapped level replaces the level stored in plo_classifications only
    when it differs from it and occurs more than once; otherwise (including when
    no requirement sentence could be mapped) the plo_classifications level is kept.

    Input:
        SIM_THRESHOLD: Passed through to learning_outcome_mapping
        SUGGESTED_SIM_THRESHOLD: Passed through to learning_outcome_mapping

    Output:
        final_levels: A list with one level per program outcome (12 entries)
    """

    document = Document(PROGRAM_OUTCOME_TABLE)
    table_num = 1
    po_table = read_docx_table(document,table_num)

    final_levels = []

    # PO1 to PO12
    for i in range(1,13):
        current_po_rr_sentences = []
        for j in range(1,4):
            for fragment in po_table[j][i].split(";"):
                fragment = fragment.replace("\n","")
                # Drop the empty pieces left behind by trailing or doubled ";".
                if fragment not in ("", " "):
                    current_po_rr_sentences.append(fragment)

        # print(current_po_rr_sentences)
        rr_clasifications = learning_outcome_mapping(current_po_rr_sentences, final_levels, SIM_THRESHOLD, SUGGESTED_SIM_THRESHOLD, False)

        mapped_levels = list(rr_clasifications["Mapped Level"])
        plo_level = plo_classifications['Mapped Level'][i-1]

        if not mapped_levels:
            # Nothing could be mapped for this PO; max() would fail on an empty list.
            final_levels.append(plo_level)
            continue

        # Most frequent level (the first one encountered wins ties) and its count.
        final_level = max(mapped_levels,key=mapped_levels.count)
        cnt = mapped_levels.count(final_level)

        if plo_level != final_level and cnt > 1:
            final_levels.append(final_level)
        else:
            final_levels.append(plo_level)
    return final_levels

# final_levels = regulatory_requirement_mapping(sim_threshold, suggested_sim_threshold)
# for i in range(len(final_levels)):
#     plo_classifications["Mapped Level"][i] = final_levels[i]
# plo_classifications

### ADJUSTING THRESHOLDS USING THE LEVELS FOUND FOR LO Mapper & PO Mapper.

In [None]:
# Copy the original dd dataframe to compare w/ adjusted mappings.
dd_original = dd.copy()
print(dd_original)

In [None]:
# for classification in ulo_classifications:
#     print(classification)

# BASE ALGORITHM ACCURACY (w/ no learning outcome mapper), ADJUSTMENT_THRESHOLD=0:
# 0.8544061302681993
# ALGORITHM ACCURACY (w/ new learning outcome mapper), ADJUSTMENT_THRESHOLD=0.1:
# 0.8927203065134102
# ALGORITHM ACCURACY (w/ new learning outcome mapper), ADJUSTMENT_THRESHOLD=0.15:
# 0.8936781609195404
# ALGORITHM ACCURACY (w/ new learning outcome mapper), ADJUSTMENT_THRESHOLD=0.2:
# 0.8946360153256706
# ALGORITHM ACCURACY (w/ new learning outcome mapper), ADJUSTMENT_THRESHOLD=0.25:
# 0.8946360153256707
# ALGORITHM ACCURACY (w/ new learning outcome mapper), ADJUSTMENT_THRESHOLD=0.3:
# 0.8869731800766286
# ALGORITHM ACCURACY (w/ new learning outcome mapper), ADJUSTMENT_THRESHOLD=0.4:
# 0.8764367816091956
# ALGORITHM ACCURACY (w/ new learning outcome mapper), ADJUSTMENT_THRESHOLD=0.5:
# 0.8678160919540232
# ALGORITHM ACCURACY (w/ new learning outcome mapper), ADJUSTMENT_THRESHOLD=1:
# 0.8678160919540232

# Pair every PLO classification row with every ULO classification row, record the
# pair, and adjust the matching CLO->PO coefficient in dd by ADJUSTMENT_THRESHOLD.
# NOTE(review): the classification frames can hold several rows per outcome (one
# per taxonomy), so x / i presumably do not always equal the PO number / CLO row
# of dd - confirm before relying on the adjusted values.
matching_rows = []  # pairs whose mapped levels agree
other_rows = []     # pairs whose mapped levels differ

for x in range(len(plo_classifications['Mapped Level'])):
    po_column = PO_LABEL+str(x+1)
    plo_level = plo_classifications['Mapped Level'][x]
    plo_sentence = plo_classifications['Sentence'][x]

    for i in range(len(ulo_classifications['Mapped Level'])):
        ulo_level = ulo_classifications['Mapped Level'][i]
        levels_match = ulo_level == plo_level

        record = {
            'ulo': ulo_classifications['Sentence'][i],
            'plo': plo_sentence,
            'ulo_level': ulo_level,
            'plo_level': plo_level,
        }
        if levels_match:
            matching_rows.append(record)
        else:
            other_rows.append(record)

        # Goes through each cell in the LO->PO Mapping table and adjusts the value
        # using a static ADJUSTMENT_THRESHOLD value, kept within [0, 1].
        # .loc writes into dd directly (chained dd[col][i] assignment is unreliable).
        coefficient = dd.loc[i, po_column]
        if levels_match:
            if coefficient + ADJUSTMENT_THRESHOLD <= 1:
                dd.loc[i, po_column] = coefficient + ADJUSTMENT_THRESHOLD  # Add offset
            else:
                dd.loc[i, po_column] = 1
        else:
            if coefficient - ADJUSTMENT_THRESHOLD >= 0:
                dd.loc[i, po_column] = coefficient - ADJUSTMENT_THRESHOLD  # Subtract offset
            else:
                dd.loc[i, po_column] = 0

# Build the frame once instead of growing it row by row (DataFrame.append no
# longer exists in pandas 2.x). Matching pairs come first, most recent first,
# followed by the non-matching pairs in encounter order - the same ordering the
# former prepend/append logic produced.
po_mappings = pd.DataFrame(
    matching_rows[::-1] + other_rows,
    columns=['ulo', 'plo', 'ulo_level', 'plo_level'],
)

In [None]:
# print(po_mappings["ulo_level"]+"-"+po_mappings["plo_level"])m
print(po_mappings)
export_dataframe(po_mappings, "./outputs/po_mappings.xlsx", "po_mappings", po_mappings.columns)

In [None]:
# store the matrix into csv file
dd.to_csv('pseudocodematrix.csv', index=False)

## Setting the threshold value (the midpoint between each column's minimum and maximum)
## threshold value = (min + max) / 2

In [None]:

# TODO: Alter the threshold based on matching hierarchy type & bloom verb instead of simply using (column_max+column_min)/2

# Setting threshold value
# Taking min max average of each column and set that as a threshold value

# Binarise every PO column of dd: a coefficient becomes 1 when it reaches the
# midpoint between the column's minimum and maximum, and 0 otherwise.
for po_number in range(1, len(df) + 1):
    po_column = PO_LABEL + str(po_number)
    scores = dd[po_column]
    midpoint = (scores.values.min() + scores.values.max()) / 2

    # A midpoint of exactly 0 leaves the column untouched.
    if midpoint != 0:
        dd[po_column] = (scores >= midpoint).astype("int64")

In [None]:

# TODO: Alter the threshold based on matching hierarchy type & bloom verb instead of simply using (column_max+column_min)/2

# Setting threshold value 
# Taking min max average of each column and set that as a threshold value

# Binarise every PO column of dd_original (the unadjusted coefficients): a
# coefficient becomes 1 when it reaches the midpoint between the column's minimum
# and maximum, and 0 otherwise.
for po_number in range(1, len(df) + 1):
    po_column = PO_LABEL + str(po_number)
    scores = dd_original[po_column]
    midpoint = (scores.values.min() + scores.values.max()) / 2

    # A midpoint of exactly 0 leaves the column untouched.
    if midpoint != 0:
        dd_original[po_column] = (scores >= midpoint).astype("int64")

In [None]:

dd.to_csv('PLO-CLOmapping.csv', index=False)

In [None]:
dd.head()

In [None]:
# human generated output
d= pd.read_csv(ORIGINAL_MAPPING)
d.head()

In [None]:
# Duplicate over a dataframe for accuracy calculate & comparison.
df3 = d.copy()
df3_original = d.copy()

In [None]:
# Check whether the levels of the original and automatic mapping match or not.
for x in range(len(df)):
  df3[PO_LABEL+str(x+1)] = np.where(dd[PO_LABEL+str(x+1)] == df3[PO_LABEL+str(x+1)], 'True', 'False')
  df3_original[PO_LABEL+str(x+1)] = np.where(dd_original[PO_LABEL+str(x+1)] == df3_original[PO_LABEL+str(x+1)], 'True', 'False')
  

In [None]:
df3.head()

In [None]:
df3_original.head()

In [None]:
# Convert the 'True'/'False' match flags into numeric 1/0 so that the per-row
# mean (the accuracy) can be computed.
# An explicit mapping plus astype gives an integer column regardless of how the
# installed pandas version handles dtype inference in replace(); values that are
# already numeric pass through unchanged, so the cell can be re-run safely.
flag_to_int = {'True': 1, 'False': 0}

for x in range(len(df)):
  po_column = PO_LABEL+str(x+1)
  df3[po_column] = df3[po_column].map(lambda flag: flag_to_int.get(flag, flag)).astype("int64")
  df3_original[po_column] = df3_original[po_column].map(lambda flag: flag_to_int.get(flag, flag)).astype("int64")

In [None]:
df3.head()

In [None]:
df3_original.head()

In [None]:
# calculating accuracy of the table
df3['acc'] = df3.mean(axis=1)
df3.head(n=100)

# calculating accuracy of the table
df3_original['acc'] = df3_original.mean(axis=1)
df3_original.head(n=100)

In [None]:
df4 = pd.concat([df1[0],df1[1], df3], axis=1)
df4.head(n=100)

In [None]:
df4_original = pd.concat([df1[0],df1[1], df3_original], axis=1)

df4["acc_improved"] = df4['acc'] >= df4_original['acc']
df4["acc_improved_by"] = df4['acc'] - df4_original['acc']

df4_original.head(n=100)

In [None]:
df4.set_index(0, inplace=True)
df4.head(n=100)


df4.to_csv('PO_Mapper_WasMappingSuccessful.csv')

In [None]:
df4_original.set_index(0, inplace=True)
df4_original.head(n=100)

df4_original.to_csv('PO_Mapper_WasMappingSuccessful_original.csv')

In [None]:
merged_df = df4.merge(df4_original,how='inner')
# merged_df['PO1']
# merged_df = merged_df[merged_df['Value1'] == merged_df['Value2']]


In [None]:
# Base algorithm accuracy
df4_original['acc'].mean()

In [None]:
# Accuracy with LO mapper weightings factored in.
df4['acc'].mean()