In [16]:
%%bash

# Dependencies
pip install docx2txt
pip install gensim
pip install keras
pip install nltk
pip install -U scikit-learn
pip install python-docx
pip install tensorflow
pip install nltk
pip install spacy

if ls docx2csv >/dev/null 2>&1; then
    echo "docx2csv exists."
else
    echo "Folder does not exist. Cloning docx2csv."
    git clone https://github.com/ivbeg/docx2csv.git
fi

Requirement already up-to-date: scikit-learn in /home/edward/.local/lib/python3.8/site-packages (1.2.2)
Collecting click<9.0.0,>=7.1.1
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
Installing collected packages: click
Successfully installed click-8.1.3
docx2csv exists.


In [2]:
%%bash

source .env
cd docx2csv && echo "$PASSWORD" | sudo -S python3 setup.py install

running install
running bdist_egg
running egg_info
writing docx2csv.egg-info/PKG-INFO
writing dependency_links to docx2csv.egg-info/dependency_links.txt
writing entry points to docx2csv.egg-info/entry_points.txt
writing requirements to docx2csv.egg-info/requires.txt
writing top-level names to docx2csv.egg-info/top_level.txt
reading manifest file 'docx2csv.egg-info/SOURCES.txt'
writing manifest file 'docx2csv.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/docx2csv
copying build/lib/docx2csv/converter.py -> build/bdist.linux-x86_64/egg/docx2csv
copying build/lib/docx2csv/core.py -> build/bdist.linux-x86_64/egg/docx2csv
copying build/lib/docx2csv/__init__.py -> build/bdist.linux-x86_64/egg/docx2csv
copying build/lib/docx2csv/__main__.py -> build/bdist.linux-x86_64/egg/docx2csv
byte-compiling build/bdist.linux-x86_64/egg/docx2csv/converter.py to co

[sudo] password for edward: 

In [3]:
# ----- TEST DATA INPUT -----

# Computer Science Test Data.
# CURRENT_MAPPING="Lists_ComputerScience.docx"
# ORIGINAL_MAPPING="Original-Mapping-ComputerScience.csv"

# InformationSecurity Test Data.
CURRENT_MAPPING="Lists_InformationSecurity.docx"
ORIGINAL_MAPPING="Original-Mapping-InfoSecurity.csv"

In [4]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt to /home/edward/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/edward/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/edward/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# extract tables from word document
from docx2csv import extract_tables, extract
tables = extract_tables(CURRENT_MAPPING)


In [6]:
from docx import Document
document = Document(CURRENT_MAPPING)

In [7]:
# Array of all the PLOs and ULOs (We can couple them together as we're trying to identify Bloom/Solo level here)
lo_sentence_array = []

# TODO: train CLO classification with all data instead of just one course.
for table in document.tables:
    for row in table.rows:
        for cell in row.cells[1:]:
            tokens = nltk.word_tokenize(cell.text)
            cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stop_words]
            lo_sentence_array.append(cleaned_tokens)

# build the vocabulary and train the model
# IMPORTANT, N0TE THAT sg=1 flag specifies Word2Vec to use the Skip Gram Model as designated by the LSTM paper.
model = Word2Vec(sentences=lo_sentence_array,vector_size=100, window=5, min_count=1, workers=4, sg=1)

# train the model with the course's ULOs and PLOs.
model.train([tokens], total_examples=len([tokens]), epochs=10)

lo_sentence_array

[['Analyze',
  'complex',
  'computing',
  'problem',
  'apply',
  'principle',
  'computing',
  'relevant',
  'discipline',
  'identify',
  'solution',
  '.'],
 ['Design',
  ',',
  'implement',
  ',',
  'evaluate',
  'computing-based',
  'solution',
  'meet',
  'given',
  'set',
  'computing',
  'requirement',
  'context',
  'program',
  '’',
  'discipline',
  '.'],
 ['Communicate', 'effectively', 'variety', 'professional', 'context', '.'],
 ['Recognize',
  'professional',
  'responsibility',
  'make',
  'informed',
  'judgment',
  'computing',
  'practice',
  'based',
  'legal',
  'ethical',
  'principle',
  '.'],
 ['Function',
  'effectively',
  'member',
  'leader',
  'team',
  'engaged',
  'activity',
  'appropriate',
  'program',
  '’',
  'discipline',
  '.'],
 ['Apply',
  'security',
  'principle',
  'practice',
  'maintain',
  'operation',
  'presence',
  'risk',
  'threat',
  '.'],
 ['Demonstrate',
  'computational',
  'thinking',
  'skill',
  'solve',
  'computing',
  'proble

In [8]:
import docx2txt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop

# load in the Word document using docx2txt
doc_text = docx2txt.process(CURRENT_MAPPING)

# preprocess the text using NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# tokenize the text and remove stop words and non-alphabetic characters
tokens = [word.lower() for word in word_tokenize(doc_text) if word.isalpha() and word.lower() not in stop_words]

# lemmatization
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]


# convert tokens back to single string format
corpus = ' '.join(lemmatized_tokens)

# create a tokenizer and fit on the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])

# convert the text to a sequence of integers
# sequences = tokenizer.texts_to_sequences([corpus])

# pad the sequences to have a fixed length
max_length = 50
# padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# LSTM model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50

model = Sequential([
    tf.keras.layers.Embedding(10000, 16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.2), # Dropout rate set to 0.2 as specified from the paper
    tf.keras.layers.Dense(6, activation='softmax')
])

model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=3, activation='softmax'))

# RMS Optimizer as specified by the paper.
optimizer = RMSprop(learning_rate=0.001)

# compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# train the model
# X_train and y_train are assumed to be already defined
# 
# model.fit(X_train, y_train, epochs=10, batch_size=32)

2023-05-17 18:45:19.188516: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-17 18:45:21.891780: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-17 18:45:21.891845: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-05-17 18:45:22.638161: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-05-17 18:45:27.290880: W tensorflow/stream_executor/platform/de

In [9]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

## training data LO
## sentence = "use big data streaming technologies."
## word = "apply"
## categories = ["Remembering", "Understanding", "Applying", "Analysing", "Evaluating", "Creating"]



# Load the TensorBoard notebook extension
%load_ext tensorboard

SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())
print(len(tokens))

vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))


8
{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}
{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}
[1, 2, 3, 4, 5, 1, 6, 7]
26


In [34]:
%%bash
## packages to install

pip install gensim
pip install spacy
python3 -m spacy download en_core_web_sm

2023-05-17 19:31:06.151393: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-17 19:31:06.300076: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-17 19:31:06.300133: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-05-17 19:31:06.320966: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-05-17 19:31:06.846371: W tensorflow/stream_executor/platform/de

CalledProcessError: Command 'b'## packages to install\npython3 -m spacy download en_core_web_sm\n\n# %%python\n\nimport spacy\nspacy.cli.download("en_core_web_sm")\n'' returned non-zero exit status 2.

In [None]:
import spacy
spacy.cli.download("en_core_web_sm")

In [12]:
## This code loads the vector file into the word_vectors variable
## Download the vector file from https://fasttext.cc/docs/en/english-vectors.html (first file on the website), unzip the file and store in your local development folder
## Note: This piece of code may take upto an hour or two to run depending on your pc specs.
## My i5 8th gen with 8gig ram took 58mins to run.

from gensim.models import KeyedVectors

# Path to the downloaded .vec file
path_to_vectors = 'wiki-news-300d-1M.vec'
# Load the word vectors
word_vectors = KeyedVectors.load_word2vec_format(path_to_vectors)

# Find similar words
similar_words = word_vectors.most_similar('cat')

# Calculate word similarity
similarity = word_vectors.similarity('cat', 'dog')

# Perform vector arithmetic
result = word_vectors['king'] - word_vectors['man'] + word_vectors['woman']




In [13]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

## training data LO
## sentence = "use big data streaming technologies."
## word = "apply"
## categories = ["Remembering", "Understanding", "Applying", "Analysing", "Evaluating", "Creating"]



# Load the TensorBoard notebook extension
%load_ext tensorboard

SEED = 42
AUTOTUNE = tf.data.AUTOTUNE


def generate_skipgrams(sentence):
    tokens = list(sentence.lower().split())
    print(len(tokens))

    vocab, index = {}, 1  # start indexing from 1
    vocab['<pad>'] = 0  # add a padding token
    for token in tokens:
      if token not in vocab:
        vocab[token] = index
        index += 1
    vocab_size = len(vocab)
    print(vocab)

    inverse_vocab = {index: token for token, index in vocab.items()}
    print(inverse_vocab)

    example_sequence = [vocab[word] for word in tokens]
    print(example_sequence)

    window_size = 2
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          example_sequence,
          vocabulary_size=vocab_size,
          window_size=window_size,
          negative_samples=0)
    
    return positive_skip_grams

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [26]:
import spacy

## Function to identify verbs in a sentence
def identify_verbs(sentence):
    # Load the English language model in spaCy
    nlp = spacy.load('en_core_web_sm')
    
    # Process the sentence using spaCy
    doc = nlp(sentence)
    
    # Extract the verbs from the processed sentence
    verbs = [token.lemma_ for token in doc if token.pos_ == 'VERB']
    
    return verbs

In [32]:
## Main piece of code that performs the mapping 

sentences = [
    "apply common data analytics and machine learning algorithms in a big data environment.",
    "use big data streaming technologies."
]
bloom_levels = ["Remembering", "Understanding", "Applying", "Analysing", "Evaluating", "Creating"]

# identified_levels = []
final_level = None

for i in range(len(sentences)):
    verbs = identify_verbs(sentences[i])
    print(verbs)
    score = 0
    for j in range(len(verbs)):
        for k in range(len(bloom_levels)):
            similarity_score = word_vectors.similarity(verbs[j], bloom_levels[k])
            if similarity_score >= score:
                score=similarity_score
                final_level = bloom_levels[k]
    print("Sentence: ", sentences[i], " Identified blooms level: ", final_level)


### Todos
# Find a way to use skipgrams
# This method only works for blooms since this paper is only based on blooms mapping


['apply']
Sentence:  apply common data analytics and machine learning algorithms in a big data environment.  Identified blooms level:  Applying
['use']
Sentence:  use big data streaming technologies.  Identified blooms level:  Applying
