In [None]:
%%bash

# Dependencies
# NOTE: `csv` ships with the Python standard library and is not a PyPI
# distribution, so it must not be passed to pip. `nltk` is listed only once.
pip install docx2txt
pip install gensim
pip install keras
pip install nltk
pip install -U scikit-learn
pip install python-docx
pip install tensorflow
pip install pandas
pip install openpyxl
pip install spacy

# spaCy pipelines (the small model, and the transformer model used for tagging)
python3 -m spacy download en_core_web_sm
python3 -m spacy download en_core_web_trf


In [None]:
import csv
import math
import os
import pprint
import re

import nltk
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Use if en_core_web_sm not installable via python3 in terminal:
# spacy.cli.download("en_core_web_sm")

In [None]:
lemmatizer = WordNetLemmatizer()

def verb_classifier(verbs_file_path): # verbs_file_path - string value that contains the file path
    """Load a taxonomy workbook and return its verbs lemmatised and de-duplicated.

    Every sheet of the workbook is parsed and the sheets are concatenated with
    ``pd.concat`` on a ``{sheet name: DataFrame}`` dict. Each non-missing cell
    is stripped, lower-cased and lemmatised as a verb. Scanning row by row,
    left to right, the first occurrence of a lemma is kept and every later
    occurrence (as well as blank cells) is replaced with NaN. Rows that end up
    entirely NaN are dropped.

    Args:
        verbs_file_path: Path of the ``.xlsx`` workbook to read.

    Returns:
        pandas.DataFrame with the same columns as the workbook sheets, holding
        one lemma per surviving cell and NaN everywhere else.
    """
    # The context manager releases the workbook handle once all sheets are read.
    with pd.ExcelFile(verbs_file_path, engine='openpyxl') as xlsx:
        dfs = {sheet_name: xlsx.parse(sheet_name) for sheet_name in xlsx.sheet_names}

    domain_levels = pd.concat(dfs)

    # A set gives O(1) membership checks; the previous list made this quadratic.
    seen_verbs = set()

    for i in range(domain_levels.shape[0]):
        for j in range(domain_levels.shape[1]):
            cell_value = domain_levels.iloc[i, j]
            if pd.isna(cell_value):
                continue

            # NOTE(review): assumes every non-missing cell is a string - confirm
            # the workbooks never contain numeric cells.
            # Stripping keeps " analyse" and "analyse" from counting as two verbs.
            normalised = cell_value.strip().lower()
            verb = lemmatizer.lemmatize(normalised, pos="v") if normalised else ""

            if verb and verb not in seen_verbs:
                domain_levels.iloc[i, j] = verb
                seen_verbs.add(verb)
            else:
                # Blank cell or a lemma that was already recorded earlier.
                domain_levels.iloc[i, j] = float('nan')

    domain_levels = domain_levels.dropna(how='all')
    return domain_levels


# Workbook locations, one per taxonomy
bloom_cognitive_file_path = 'Bloom_cognitive.xlsx'
bloom_affective_file_path = 'Bloom_affective.xlsx'
bloom_psychomotor_file_path = 'Bloom_psychomotor.xlsx'
solo_file_path = 'SOLO.xlsx'

# Verb table for each taxonomy, keyed by taxonomy name
mapped_verbs = {
    taxonomy_name: verb_classifier(workbook_path)
    for taxonomy_name, workbook_path in (
        ("Cognitive", bloom_cognitive_file_path),
        ("Affective", bloom_affective_file_path),
        ("Psychomotor", bloom_psychomotor_file_path),
        ("SOLO", solo_file_path),
    )
}


In [None]:
#### Check the dataframe for duplicates

# verbs_df - The dataframe that consists of the classified verbs
# domain_name - string name of the domain. (SOLO, Blooms Cognitive, Blooms Affective or Blooms Psychomotor)
def check_duplicates(verbs_df, domain_name): 
    """Report whether any value occurs more than once in ``verbs_df``.

    All cells are flattened row by row, missing values are ignored, and the
    outcome is printed. When duplicates exist the full sorted list of present
    values is printed as well so they can be spotted.

    Args:
        verbs_df: DataFrame of classified verbs.
        domain_name: Name of the domain used in the printed message.
    """
    # Row-major flattening of every cell in the frame
    flattened = verbs_df.values.ravel().tolist()

    # Keep only the cells that actually hold a value
    present = [cell for cell in flattened if not pd.isna(cell)]

    if len(set(present)) == len(present):
        print("No duplicates exist in " + domain_name)
        return

    print("Duplicates Found in " + domain_name)
    print(sorted(present))


# Check every taxonomy for duplicates and export its verb table.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing into it.
os.makedirs("./outputs", exist_ok=True)
for taxonomy_key, taxonomy_item in mapped_verbs.items():
    check_duplicates(taxonomy_item, taxonomy_key)
    taxonomy_item.to_csv("./outputs/mapped_" + taxonomy_key + ".csv")


In [None]:
# spaCy English pipeline used for tagging; the transformer model is loaded
# with its NER component excluded.
# Lighter alternative: nlp = spacy.load('en_core_web_sm')
nlp = spacy.load(
    'en_core_web_trf',
    exclude=['ner'],
)

## Function to identify verbs in a sentence
def identify_verbs(sentence):
    """Return the lemmas of all tokens in ``sentence`` that spaCy tags as VERB.

    The sentence is run through the module-level ``nlp`` pipeline and the
    lemmas are returned in token order.
    """
    verb_lemmas = []
    for token in nlp(sentence):
        if token.pos_ == 'VERB':
            verb_lemmas.append(token.lemma_)
    return verb_lemmas

In [None]:
def extract_columns(csv_file, columns):
    """Read selected columns of a CSV file into lists.

    Args:
        csv_file: Path of the CSV file; its first row is treated as the header.
        columns: Header names of the columns to extract.

    Returns:
        dict mapping each requested column name to the list of its values
        (strings), in file order. Completely blank rows are skipped.

    Raises:
        ValueError: If the file has no header row or a requested column is
            not present in the header.
    """
    # newline='' is the csv module's documented way to open files, so that
    # line endings inside quoted fields are handled by the reader itself.
    with open(csv_file, 'r', newline='') as file:
        reader = csv.reader(file)

        headers = next(reader, None)  # Read the headers
        if headers is None:
            raise ValueError(f"CSV file '{csv_file}' is empty; no header row found.")

        # Check if all specified columns exist in the CSV file
        for column in columns:
            if column not in headers:
                raise ValueError(f"Column '{column}' not found in the CSV file.")

        # Resolve each column position once instead of once per row
        column_indexes = {column: headers.index(column) for column in columns}

        # Initialize separate arrays for each column
        extracted_data = {column: [] for column in columns}

        # Extract data from specified columns
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to extract.
                continue
            for column in columns:
                extracted_data[column].append(row[column_indexes[column]])

    return extracted_data

# Load the manually mapped learning outcomes.
# Earlier column names, kept for reference:
# ['Learning outcomes', 'Final Bloom Level', 'Final SOLO Level']
csv_file = 'Learning outcomes manual mapping - Mappings.csv'
columns_to_extract = ['LO', 'Cognitive', 'Affective', 'Psychomotor', 'SOLO']
extracted_data = extract_columns(csv_file, columns_to_extract)

# Learning-outcome sentences and the manually assigned level per taxonomy
sentences = extracted_data['LO']
final_levels = {
    taxonomy: extracted_data[taxonomy]
    for taxonomy in ("Cognitive", "Affective", "Psychomotor", "SOLO")
}



In [None]:
## Main piece of code that performs the mapping - Approach 2 - 12% accuracy
## This cell tokenises every learning outcome and trains a skip-gram Word2Vec
## model on them; the resulting `word_vectors` are used by the mapping step.

# wiki word vectors no uppercase
# TODO: Modify this section of the code to use the bloom level verbs from Arragon's spreadsheet, will also need to modify
# Ideas for improving accuracy
#### Reduce the number of verbs.
#### Take more learning outcomes from the Monash handbook website (needs a big dataset) and identify verbs that appear multiple times.
#### The number of times a verb must appear could be set to a fixed threshold, e.g. 5.
#### If a verb doesn't appear at least 5 times, we could remove it from our list of predefined verbs, which results in a shorter verb list.

# NOTE(review): not referenced anywhere else in the visible cells - confirm whether it is still needed.
output_bloom_levels = []

# Fetch the NLTK resources (tokenizer data, stop-word list, WordNet).
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))

# Array of all the PLOs and ULOs (We can couple them together as we're trying to identify Bloom/Solo level here)
# Each entry is the list of lemmas for one learning outcome.
lo_sentence_array = []
# NOTE(review): not referenced anywhere else in the visible cells - confirm whether it is still needed.
bloom_levels_str = [
    "Remembering",
    "Understanding",
    "Applying",
    "Analysing",
    "Evaluating",
    "Creating",
]

# TODO: train CLO classification with all data instead of just one course.
for sentence in sentences:
    # Strip everything that is neither a word character nor whitespace (i.e. punctuation).
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # Process the sentence using spaCy
    doc = nlp(sentence)
    
    # Keep the lemma of every token (not only verbs) whose lower-cased text is not a stop word
    cleaned_tokens = [token.lemma_ for token in doc if token.lower_ not in stop_words]
    lo_sentence_array.append(cleaned_tokens)

# build the vocabulary and train the model
# IMPORTANT, NOTE THAT the sg=1 flag tells Word2Vec to use the Skip Gram Model as designated by the LSTM paper.
# NOTE(review): no seed is passed and workers=4, so the trained vectors may differ between runs - confirm whether reproducibility matters here.
model = Word2Vec(
    sentences=lo_sentence_array, vector_size=100, window=5, min_count=1, workers=4, sg=1, epochs=30
)

# Keyed vectors of the trained model, used for similarity look-ups.
word_vectors = model.wv
# train the model with the course's ULOs and PLOs.
# model.train([tokens], total_examples=len([tokens]), epochs=10)

In [None]:
def bloom_mapping(sentences, final_levels):
    """Map each learning outcome to a level per taxonomy and report accuracy.

    For every sentence the verbs found by ``identify_verbs`` are compared, via
    ``word_vectors.similarity``, with every verb of every taxonomy table in the
    module-level ``mapped_verbs``. Similarities above 0.9999 are summed per
    level; the first level with the highest positive sum is the predicted
    level. Predictions are compared case-insensitively with the manual levels
    in ``final_levels``; entries that are empty or ``'-'`` are not evaluated.

    Side effects:
        Writes every mismatch to ``./outputs/failed_lo_mappings.csv`` (the
        directory is created when missing) and prints the pass percentage per
        taxonomy and overall.

    Args:
        sentences: Sequence of learning-outcome sentences.
        final_levels: dict mapping taxonomy name to a sequence of manually
            assigned levels, aligned with ``sentences``.

    Returns:
        None.
    """
    # Levels scored for each taxonomy, in the order used to break ties.
    taxonomy_levels = {
        "Cognitive": [
            "Remembering", "Understanding", "Applying",
            "Analysing", "Evaluating", "Creating",
        ],
        "Affective": [
            "Receiving", "Responding", "Valuing",
            "Organisation", "Characterisation",
        ],
        "Psychomotor": [
            "Perception", "Set", "Guided Response", "Mechanism",
            "Complex Overt Response", "Adaptation", "Origination",
        ],
        "SOLO": [
            "Prestructural", "Unistructural", "Multistructural",
            "Relational", "Extended Abstract",
        ],
    }
    # Declared up front so the CSV header can be written even when no case fails.
    failed_case_fields = [
        "Domain",
        "manually identified level",
        "automatically identified level",
        "verbs identified",
        "sentence",
    ]

    passed_mappings = dict.fromkeys(taxonomy_levels, 0)
    failed_mappings = dict.fromkeys(taxonomy_levels, 0)
    failed_cases = []
    min_score = 1

    for i in range(len(sentences)): # Iterates over the LOs
        identified_verbs = identify_verbs(sentences[i])

        score_list = {
            taxonomy: dict.fromkeys(levels, 0)
            for taxonomy, levels in taxonomy_levels.items()
        }

        for taxonomy_key, taxonomy_item in mapped_verbs.items():
            for j in range(taxonomy_item.shape[0]): # Level
                for k in range(taxonomy_item.shape[1]): # Verb
                    verb = taxonomy_item.iloc[j, k]

                    if verb is None or pd.isna(verb) or not verb: continue

                    similarity_score = 0
                    for identified_verb in identified_verbs:
                        try:
                            sim_score = word_vectors.similarity(identified_verb, verb)
                        except KeyError:
                            # One of the two words is not in the Word2Vec
                            # vocabulary (e.g. the "verb" is a phrase); skip it.
                            continue
                        if sim_score > 0.9999: similarity_score += sim_score
                    score_list[taxonomy_key][taxonomy_item.columns[k]] += similarity_score

        # Identify level based on similarity
        max_score = {
            taxonomy: { "Level": None, "Score": 0 }
            for taxonomy in taxonomy_levels
        }
        for t_key, t_item in score_list.items():
            for level, score in t_item.items():
                # Strict comparison: the first level reaching the top score wins.
                if max_score[t_key]["Score"] < score:
                    max_score[t_key] = { "Level": level, "Score": score }

        for t_key in final_levels:
            expected_level = final_levels[t_key][i]
            # No manual level recorded for this taxonomy: nothing to evaluate.
            if not expected_level or expected_level == '-': continue

            predicted_level = max_score[t_key]["Level"]
            if predicted_level is not None and expected_level.lower() == predicted_level.lower():
                passed_mappings[t_key] += 1
            else:
                failed_mappings[t_key] += 1
                failed_cases.append({
                    "Domain": t_key,
                    "manually identified level": expected_level,
                    "automatically identified level": predicted_level,
                    "verbs identified": identified_verbs,
                    "sentence": sentences[i],
                })

    # open() does not create missing directories.
    os.makedirs('./outputs', exist_ok=True)
    with open('./outputs/failed_lo_mappings.csv', 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=failed_case_fields)
        writer.writeheader()
        writer.writerows(failed_cases)

    total_passed = 0
    total_failed = 0
    for taxonomy, passed in passed_mappings.items():
        total_passed += passed
        total_failed += failed_mappings[taxonomy]
        # Guard against division by zero when a taxonomy had nothing to evaluate.
        tot = passed + failed_mappings[taxonomy] if passed + failed_mappings[taxonomy] > 0 else 1
        mapping_percentage = math.ceil((passed/(tot))*100)
        print("Percentage of ", taxonomy, " mappings passed: ", mapping_percentage, "%")

    # Same guard for the overall figure (reported as 0% when nothing was evaluated).
    total_evaluated = total_passed + total_failed if total_passed + total_failed > 0 else 1
    total_mapping_percentage = math.ceil((total_passed/total_evaluated)*100)
    print("Total percentage of mappings passed: ", total_mapping_percentage, "%")
    # NOTE(review): min_score is never updated, so this always prints 1 - confirm whether it can go.
    print(min_score)


### Todos
# Find a way to use skipgrams
# This method only works for blooms since this paper is only based on blooms mapping

bloom_mapping(sentences, final_levels)


In [None]:
def get_keys_by_value(dictionary, target_value):
    """Return the last key of ``dictionary`` whose value equals ``target_value``.

    When no key matches (or the matching key is the empty string), the last
    key of the dictionary is returned instead.
    """
    matching_keys = [key for key, value in dictionary.items() if value == target_value]
    chosen_key = matching_keys[-1] if matching_keys else ""

    # Fall back to the final key of the dictionary when nothing usable matched
    if chosen_key == "":
        chosen_key = list(dictionary.keys())[-1]
    return chosen_key

In [None]:
# Alias (not a copy) of the SOLO verb table held in mapped_verbs.
solo = mapped_verbs['SOLO']

## Rankings are manually derived using the following sources
## https://davenport.libguides.com/learningoutcomes/domains#:~:text=Bloom%20identified%20three%20domains%2C%20or,Psychomotor%20Skills%20or%20Physical%20Skills
## https://www.vectorsolutions.com/resources/blogs/teaching-skills-the-psychomotor-domain-of-learning-and-learning-objectives/
##
## Each table maps a level name to its rank. Levels listed in the same tier
## share one rank; the tier position (starting at 0) is the rank.

solo_levels_ranks = {
    level: rank
    for rank, tier in enumerate((
        ("Prestructural",),
        ("Unistructural",),
        ("Multistructural",),
        ("Relational",),
        ("Extended Abstract",),
    ))
    for level in tier
}

bloom_cognitive_levels_ranks = {
    level: rank
    for rank, tier in enumerate((
        ("Remembering", "Understanding"),
        ("Applying",),
        ("Analysing",),
        ("Evaluating",),
        ("Creating",),
    ))
    for level in tier
}

bloom_affective_levels_ranks = {
    level: rank
    for rank, tier in enumerate((
        ("Receiving",),
        ("Responding",),
        ("Valuing",),
        ("Organisation",),
        ("Characterisation",),
    ))
    for level in tier
}

bloom_psychomotor_levels_ranks = {
    level: rank
    for rank, tier in enumerate((
        ("Perception", "Set"),
        ("Guided Response",),
        ("Mechanism",),
        ("Complex Overt Response", "Adaptation"),
        ("Origination",),
    ))
    for level in tier
}

# bloom_cognitive_copy = bloom_cognitive.copy()


In [None]:
def verb_ranking_identifier(current_taxonomy, comparison_taxonomy, verb, current_taxonomy_ranking_table, comparison_taxonomy_ranking_table):
    """Queue a re-levelling of ``verb`` when two taxonomies rank it far apart.

    Nothing happens unless ``verb`` occurs in ``comparison_taxonomy``. The
    verb's level is the column it sits in within each table; the levels are
    converted to ranks through the ranking tables. When the ranks differ by
    more than 1, the floor of their average becomes the new rank and the level
    names for that rank (looked up with ``get_keys_by_value``) are appended, as
    ``{verb: level}`` dicts, to the module-level lists
    ``current_taxonomy_modifications`` and ``comparison_taxonomy_modifications``.

    Args:
        current_taxonomy: DataFrame of verbs whose columns are level names;
            must contain ``verb``.
        comparison_taxonomy: DataFrame of verbs to compare against.
        verb: The verb to look up.
        current_taxonomy_ranking_table: dict level name -> rank for
            ``current_taxonomy``.
        comparison_taxonomy_ranking_table: dict level name -> rank for
            ``comparison_taxonomy``.

    Returns:
        None.
    """
    if verb not in comparison_taxonomy.values:
        return

    # Masking leaves NaN in every non-matching cell. dropna() removes them
    # explicitly because stack() only drops NaN in its legacy implementation.
    current_taxonomy_level = current_taxonomy[current_taxonomy == verb].stack().dropna().index[0][1]
    comparison_taxonomy_level = comparison_taxonomy[comparison_taxonomy == verb].stack().dropna().index[0][1]

    current_taxonomy_rank = current_taxonomy_ranking_table[current_taxonomy_level]
    comparison_taxonomy_rank = comparison_taxonomy_ranking_table[comparison_taxonomy_level]

    ranking_range = current_taxonomy_rank - comparison_taxonomy_rank
    if abs(ranking_range) <= 1:
        # Ranks agree closely enough; leave the verb where it is.
        return

    # Meet in the middle: both taxonomies move the verb to the averaged rank.
    new_current_taxonomy_rank = (current_taxonomy_rank + comparison_taxonomy_rank) // 2
    new_current_taxonomy_level = get_keys_by_value(current_taxonomy_ranking_table, new_current_taxonomy_rank)
    new_comparison_taxonomy_level = get_keys_by_value(comparison_taxonomy_ranking_table, new_current_taxonomy_rank)

    current_taxonomy_modifications.append({verb: new_current_taxonomy_level})
    comparison_taxonomy_modifications.append({verb: new_comparison_taxonomy_level})




def _relocate_verb(taxonomy_df, verb, level):
    """Blank the cell holding ``verb`` and append it as a new row under ``level``.

    ``taxonomy_df`` is modified in place (the old cell becomes None) and a new
    DataFrame with the extra row and a fresh 0..n-1 index is returned.
    """
    # dropna() removes the masked-out cells explicitly because stack() only
    # drops NaN in its legacy implementation.
    row, col = taxonomy_df[taxonomy_df == verb].stack().dropna().index[0]
    taxonomy_df.at[row, col] = None
    relocated_row = pd.DataFrame({level: verb}, index=[0])
    return pd.concat([taxonomy_df, relocated_row], ignore_index=True)


def bloom_cog_remapping():
    """Re-level verbs whose cognitive rank disagrees with the other taxonomies.

    Working copies of the four verb tables are created as module-level
    variables (``solo_copy``, ``bloom_cognitive_copy``, ``bloom_affective_copy``,
    ``bloom_psychomotor_copy``). Every cognitive verb is compared against the
    SOLO, affective and psychomotor tables with ``verb_ranking_identifier``,
    which fills the module-level lists ``current_taxonomy_modifications`` and
    ``comparison_taxonomy_modifications``. The queued modifications are then
    applied to the working copies.

    The tables in ``mapped_verbs`` are left untouched: every working table,
    including the cognitive one, is a deep copy.
    """
    global current_taxonomy_modifications, comparison_taxonomy_modifications
    global solo_copy, bloom_cognitive_copy, bloom_affective_copy, bloom_psychomotor_copy

    current_taxonomy_modifications = []
    comparison_taxonomy_modifications = []

    solo_copy = solo.copy(deep=True).reset_index(drop=True)
    # Copy first: resetting the index or blanking cells on the object stored in
    # mapped_verbs would silently change the data used by the other cells.
    bloom_cognitive_copy = mapped_verbs['Cognitive'].copy(deep=True).reset_index(drop=True)
    bloom_affective_copy = mapped_verbs['Affective'].copy(deep=True).reset_index(drop=True)
    bloom_psychomotor_copy = mapped_verbs['Psychomotor'].copy(deep=True).reset_index(drop=True)

    comparisons = (
        (solo_copy, solo_levels_ranks),
        (bloom_affective_copy, bloom_affective_levels_ranks),
        (bloom_psychomotor_copy, bloom_psychomotor_levels_ranks),
    )

    # Check cognitive verbs against the other taxonomies. Identify new levels to put verbs into
    for i in range(bloom_cognitive_copy.shape[0]): # Level
        for j in range(bloom_cognitive_copy.shape[1]): # Verb
            verb = bloom_cognitive_copy.iloc[i, j]

            if verb is None or pd.isna(verb) or not verb: continue

            for comparison_df, comparison_ranks in comparisons:
                verb_ranking_identifier(bloom_cognitive_copy, comparison_df, verb, bloom_cognitive_levels_ranks, comparison_ranks)

    # Move the cognitive verbs to their new levels.
    for modification in current_taxonomy_modifications:
        verb = list(modification.keys())[0]
        level = list(modification.values())[0]
        bloom_cognitive_copy = _relocate_verb(bloom_cognitive_copy, verb, level)

    # Move the same verbs in the comparison taxonomies. The target table is
    # recognised by which one has a column named after the new level.
    for modification in comparison_taxonomy_modifications:
        verb = list(modification.keys())[0]
        level = list(modification.values())[0]
        if level in solo_copy.columns:
            solo_copy = _relocate_verb(solo_copy, verb, level)
        elif level in bloom_affective_copy.columns:
            bloom_affective_copy = _relocate_verb(bloom_affective_copy, verb, level)
        elif level in bloom_psychomotor_copy.columns:
            bloom_psychomotor_copy = _relocate_verb(bloom_psychomotor_copy, verb, level)
        else:
            print(level)
            print('level doesnt exist')

bloom_cog_remapping()



In [None]:

def check_verb_rankings(current_taxonomy, comparison_taxonomy, verb, current_taxonomy_ranking_table, comparison_taxonomy_ranking_table):
    """Print ``verb`` when two taxonomies rank it more than one rank apart.

    Nothing happens unless ``verb`` occurs in ``comparison_taxonomy``. The
    verb's level is the column it sits in within each table; levels are
    converted to ranks through the ranking tables. When the ranks differ by
    more than 1 a line of the form
    ``verb-current level-current rank-comparison rank-comparison level`` is
    printed.

    Args:
        current_taxonomy: DataFrame of verbs whose columns are level names;
            must contain ``verb``.
        comparison_taxonomy: DataFrame of verbs to compare against.
        verb: The verb to look up.
        current_taxonomy_ranking_table: dict level name -> rank for
            ``current_taxonomy``.
        comparison_taxonomy_ranking_table: dict level name -> rank for
            ``comparison_taxonomy``.

    Returns:
        None.
    """
    if verb not in comparison_taxonomy.values:
        return

    # Masking leaves NaN in every non-matching cell. dropna() removes them
    # explicitly because stack() only drops NaN in its legacy implementation.
    current_taxonomy_level = current_taxonomy[current_taxonomy == verb].stack().dropna().index[0][1]
    comparison_taxonomy_level = comparison_taxonomy[comparison_taxonomy == verb].stack().dropna().index[0][1]

    current_taxonomy_rank = current_taxonomy_ranking_table[current_taxonomy_level]
    comparison_taxonomy_rank = comparison_taxonomy_ranking_table[comparison_taxonomy_level]

    ranking_range = current_taxonomy_rank - comparison_taxonomy_rank

    if abs(ranking_range) > 1:
        print(verb + '-' + current_taxonomy_level + '-' + str(current_taxonomy_rank) + '-' + str(comparison_taxonomy_rank) + '-' + comparison_taxonomy_level)


        
# Re-check every remapped cognitive verb against the three other taxonomies;
# check_verb_rankings prints the verbs that are still ranked too far apart.
for i in range(bloom_cognitive_copy.shape[0]): # Level
    for j in range(bloom_cognitive_copy.shape[1]): # Verb
        verb = bloom_cognitive_copy.iloc[i, j]

        # Only cells that actually hold a verb are checked
        if not (verb is None or not verb or pd.isna(verb)):
            for comparison_df, comparison_ranks in (
                (solo_copy, solo_levels_ranks),
                (bloom_affective_copy, bloom_affective_levels_ranks),
                (bloom_psychomotor_copy, bloom_psychomotor_levels_ranks),
            ):
                check_verb_rankings(bloom_cognitive_copy, comparison_df, verb, bloom_cognitive_levels_ranks, comparison_ranks)

In [None]:

# Show the SOLO working table (assigned by bloom_cog_remapping) as the cell output.
solo_copy


In [None]:
def remove_nan_values(taxonomy_verb_df):
    """Return a compact copy of a verb table with the empty cells removed.

    For every column (level) the non-missing, non-empty verbs are collected in
    row order and packed at the top of the column. Columns without any verb
    are left out. Columns of different length are padded with NaN at the
    bottom, which a rectangular DataFrame cannot avoid.

    Args:
        taxonomy_verb_df: DataFrame whose columns are level names and whose
            cells hold verbs or missing values.

    Returns:
        pandas.DataFrame with one column per level that has at least one verb;
        an empty DataFrame when the input holds no verbs.
    """
    verbs_by_level = {}
    for j, level in enumerate(taxonomy_verb_df.columns):
        # Positional access keeps this correct even with repeated column names.
        kept = [
            verb
            for verb in taxonomy_verb_df.iloc[:, j].tolist()
            if not pd.isna(verb) and verb
        ]
        if kept:
            verbs_by_level.setdefault(level, []).extend(kept)

    # Wrapping each list in a Series lets columns of different length be
    # aligned on the row index instead of raising.
    new_df = pd.DataFrame({
        level: pd.Series(verbs, dtype=object)
        for level, verbs in verbs_by_level.items()
    })
    return new_df


# Run remove_nan_values on the SOLO working table and show the result as the cell output.
df = remove_nan_values(solo_copy)
df



