In [None]:
!pip install -U sentence-transformers

In [None]:
"""
Author: Dhivya

Description: Assessment of Transfer Credit using Verb Clustering

"""
import pandas as pd
import spacy
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers import util

In [None]:
class Preprocessing(object):
    """Clean-up helpers applied to learning-outcome lists before analysis."""

    def remove_empty(self, lo_list):
        """Return a new list without the entries equal to the string 'nan'.

        Only the exact string 'nan' is dropped; every other entry (including
        empty strings and differently-cased variants) is kept in order.
        """
        def is_present(entry):
            return entry != 'nan'

        return list(filter(is_present, lo_list))

In [None]:
class Taxonomic_Similarity(object):
    """Assign learning-outcome verbs to verb clusters and compare the clusters.

    Verbs are detected with spaCy (falling back to a WordNet lookup over NLTK
    tokens), lemmatised, and mapped to one of five seed-verb clusters numbered
    1-5 (knowledge, comprehension, application, synthesis, evaluation).  Verbs
    that are not seed verbs are placed using WordNet Wu-Palmer similarity to
    the seed verbs.
    """

    # WordNet part-of-speech tag for verbs (same tag verb_similarity passes to
    # wn.synsets; NLTK exposes it as wn.VERB).
    _VERB_POS = 'v'

    # Seed verbs per cluster.  The entries are kept verbatim, repeats included,
    # because find_verb_cluster averages over every entry of a cluster.
    _KNOWLEDGE_VERBS = ('arrange', 'define', 'duplicate', 'label', 'list', 'match', 'memorize',
                        'name', 'order', 'outline', 'recognize', 'relate', 'recall', 'repeat',
                        'reproduce', 'state')

    _COMPREHENSION_VERBS = ('explain', 'paraphrase', 'classify', 'convert', 'defend', 'discuss',
                            'distinguish', 'estimate', 'explain', 'express', 'extend',
                            'generalized', 'indicate', 'infer', 'locate', 'predict', 'rewrite',
                            'review', 'translate')

    _APPLICATION_VERBS = ('use', 'compute', 'solve', 'demonstrate', 'discover', 'construct',
                          'compute', 'dramatize', 'apply', 'change', 'employ', 'interpret',
                          'manipulate', 'modify', 'operate', 'practice', 'produce', 'schedule',
                          'show', 'sketch', 'solve')

    _SYNTHESIS_VERBS = ('analyze', 'create', 'design', 'hypothesize', 'invent', 'develop',
                        'arrange', 'assemble', 'collect', 'combine', 'comply', 'devise',
                        'explain', 'formulate', 'generate', 'plan', 'rearrange', 'reconstruct',
                        'tell', 'synthesize', 'revise', 'reorganize')

    _EVALUATION_VERBS = ('judge', 'recommend', 'critique', 'justify', 'appraise', 'argue',
                         'assess', 'attach', 'conclude', 'defend', 'discriminate', 'estimate',
                         'evaluate', 'explain', 'interpret', 'select', 'support', 'predict',
                         'relate', 'rate', 'value')

    # Position in this tuple + 1 is the cluster id returned to callers.
    _SEED_CLUSTERS = (_KNOWLEDGE_VERBS, _COMPREHENSION_VERBS, _APPLICATION_VERBS,
                      _SYNTHESIS_VERBS, _EVALUATION_VERBS)

    def __init__(self,):
        """Load the spaCy pipeline, fetch the NLTK resources and build the lemmatizer."""
        self.nlp = spacy.load("en_core_web_sm")
        nltk.download('wordnet')
        nltk.download('punkt')
        self.lemmatizer = WordNetLemmatizer()

    def possible_verb(self, word):
        """Return True when WordNet lists at least one verb synset for ``word``.

        Always returns a real bool (the earlier version fell through and
        returned None for non-verbs).
        """
        return any(syn.pos() == self._VERB_POS for syn in wn.synsets(word))

    def detect_verbs_spacy(self, text):
        """Return the verbs found in ``text``.

        Tokens tagged VERB by spaCy are kept when WordNet also knows them as a
        verb.  When that yields nothing, every NLTK token that WordNet knows as
        a verb is returned instead.
        """
        verbs = [token.text for token in self.nlp(text)
                 if token.pos_ == 'VERB' and self.possible_verb(token.text)]
        if verbs:
            return verbs
        # Fallback: the NLTK tokenisation is only needed (and only run) here.
        return [token for token in word_tokenize(text) if self.possible_verb(token)]

    def form_verb_clusters(self, verb_list):
        """Return the cluster id (1-5) of every verb in ``verb_list``.

        Each verb is lower-cased and lemmatised as a verb.  Seed verbs get the
        id of the first cluster that lists them; any other verb is placed by
        find_verb_cluster.  ``verb_list`` itself is not modified.
        """
        class_list = []
        for verb in verb_list:
            # Lower-case first: the seed lists are all lower-case, so a
            # sentence-initial "Explain" would otherwise never match 'explain'.
            lemma = self.lemmatizer.lemmatize(verb.lower(), self._VERB_POS)
            for cluster_id, seed_verbs in enumerate(self._SEED_CLUSTERS, start=1):
                if lemma in seed_verbs:
                    class_list.append(cluster_id)
                    break
            else:
                class_list.append(self.find_verb_cluster(lemma, self._SEED_CLUSTERS))
        return class_list

    def verb_similarity(self, v1, v2):
        """Return the highest Wu-Palmer similarity between verb senses of ``v1`` and ``v2``.

        Returns None when either word has no verb synset or no sense pair
        produces a score; find_verb_cluster treats None as a similarity of 0.
        """
        synsets1 = wn.synsets(v1, pos=self._VERB_POS)
        synsets2 = wn.synsets(v2, pos=self._VERB_POS)
        scores = []
        for syn1 in synsets1:
            for syn2 in synsets2:
                score = syn1.wup_similarity(syn2)
                # Skip missing scores; max() cannot compare None with floats.
                if score is not None:
                    scores.append(score)
        return max(scores) if scores else None

    def find_verb_cluster(self, new_verb, cluster_list):
        """Return the 1-based index of the cluster in ``cluster_list`` that fits ``new_verb`` best.

        The mean similarity between ``new_verb`` and the verbs of each cluster
        is turned into a dissimilarity (1 - mean) and a silhouette-style width
        ``(b - a) / max(a, b)`` is computed per cluster, where ``a`` is the
        cluster's own dissimilarity and ``b`` the smallest dissimilarity among
        the other clusters.  The cluster with the largest width wins (first one
        on ties).

        Raises:
            ValueError: if ``cluster_list`` is empty.
        """
        mean_sims = []
        for cluster in cluster_list:
            sims = []
            for verb in cluster:
                sim = self.verb_similarity(new_verb, verb)
                sims.append(0 if sim is None else sim)
            mean_sims.append(np.sum(sims) / len(sims) if sims else 0.0)

        if not mean_sims:
            raise ValueError("cluster_list must contain at least one cluster")
        if len(mean_sims) == 1:
            return 1

        # The silhouette formula is defined on distances.  Feeding it raw
        # similarities (as the previous version did) selects the *least*
        # similar cluster, so convert before comparing.
        distances = [1.0 - sim for sim in mean_sims]
        widths = []
        for idx, own in enumerate(distances):
            nearest_other = min(d for j, d in enumerate(distances) if j != idx)
            denom = max(own, nearest_other)
            widths.append((nearest_other - own) / denom if denom > 0 else 0.0)
        return widths.index(max(widths)) + 1

    def assign_cluster(self, verb_list):
        """Return one cluster id per learning outcome.

        ``verb_list`` holds one list of verbs per learning outcome.  The id is
        the highest cluster id among the outcome's verbs, or 0 when the outcome
        has no verbs.
        """
        # Kept from the original: prints how many learning outcomes are processed.
        print(len(verb_list))
        cluster = []
        for verbs in verb_list:
            cluster_ids = self.form_verb_clusters(verbs) if verbs else [0]
            cluster.append(max(cluster_ids))
        return cluster

    def shift_grid(self, rlo_cluster, slo_cluster, rl, sl):
        """Build the grid of absolute cluster-id differences.

        Args:
            rlo_cluster: cluster ids, one per row label in ``rl``.
            slo_cluster: cluster ids, one per column label in ``sl``.
            rl: row labels of the returned frame (index is named 'index').
            sl: column labels of the returned frame.

        Returns:
            (DataFrame, list of lists) holding ``abs(i - j)`` for every
            ``i`` in ``rlo_cluster`` (rows) and ``j`` in ``slo_cluster`` (columns).
        """
        # Kept from the original: echoes both cluster-id lists.
        print(rlo_cluster, slo_cluster)
        shift_list = [[abs(i - j) for j in slo_cluster] for i in rlo_cluster]
        df1 = pd.DataFrame(shift_list, index=pd.Index(rl, name='index'), columns=sl)
        return df1, shift_list

In [None]:
class Semantic_Similarity(object):
    """Sentence-level similarity based on SentenceTransformer embeddings."""

    def __init__(self,):
        """Load the 'roberta-large-nli-stsb-mean-tokens' sentence encoder."""
        self.model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

    def _embed(self, sentence):
        """Encode one sentence into a tensor embedding."""
        return self.model.encode(sentence, convert_to_tensor=True)

    @staticmethod
    def _cosine(vec1, vec2):
        """Return the cosine similarity of two embeddings as a numpy scalar."""
        cosine_scores = util.pytorch_cos_sim(vec1, vec2)
        return cosine_scores.cpu().numpy()[0][0]

    def calculate_similarity(self, s1, s2):
        """Return the cosine similarity between the embeddings of ``s1`` and ``s2``."""
        return self._cosine(self._embed(s1), self._embed(s2))

    def similarity_grid(self, sent_1, sent_2):
        """Compute the similarity of every sentence in ``sent_2`` to every sentence in ``sent_1``.

        Returns:
            (DataFrame, list of lists).  The frame is indexed by ``sent_1``
            (index named 'index') with one column per sentence of ``sent_2``;
            a repeated ``sent_2`` sentence overwrites its earlier column.  The
            list has one inner list per ``sent_2`` sentence, ordered like
            ``sent_1``.
        """
        # Embed every sentence once.  Calling calculate_similarity per pair
        # would re-run the encoder 2 * len(sent_1) * len(sent_2) times.
        row_vectors = [self._embed(sentence) for sentence in sent_1]

        df = pd.DataFrame(index=pd.Index(sent_1, name='index'))
        sim_list = []
        for column_sentence in sent_2:
            column_vector = self._embed(column_sentence)
            column = [self._cosine(column_vector, row_vector) for row_vector in row_vectors]
            sim_list.append(column)
            df[column_sentence] = column
        return df, sim_list


In [None]:
class Aggregation(object):
    """Blend semantic and taxonomic scores and derive a course-level verdict."""

    def final_similarity(self, similarity_list, st_list, impact):
        """Combine similarity scores with cluster shifts and transpose the result.

        For every position ``[m][n]`` the score is
        ``similarity * (100 - impact) + impact - (impact / 6) * shift``.
        The combined grid is returned transposed (rows become columns).
        """
        combined_rows = []
        for m, sim_row in enumerate(similarity_list):
            combined = []
            for n, sim in enumerate(sim_row):
                semantic_part = sim*(100-impact)
                taxonomic_part = impact-((impact/6)*st_list[m][n])
                combined.append(semantic_part + taxonomic_part)
            combined_rows.append(combined)
        return [list(column) for column in zip(*combined_rows)]

    def course_similarity(self, final_sim_list, sim_threshold):
        """Return 'Similar' when at least half of the rows have a score above the threshold.

        A row counts when its maximum is strictly greater than
        ``sim_threshold``.  The per-row flags are printed before the verdict
        is returned; otherwise the verdict is 'Not Similar'.
        """
        lo_sim_category = [bool(max(row) > sim_threshold) for row in final_sim_list]
        print(lo_sim_category)
        enough_matches = lo_sim_category.count(True) >= len(lo_sim_category)/2
        return 'Similar' if enough_matches else 'Not Similar'

In [None]:
def test_function(csv_path='/content/drive/MyDrive/course_comparisons/course2.csv',
                  impact=30, sim_threshold=60):
    """Compare the two courses described by the 'RLO' and 'SLO' columns of a CSV file.

    Args:
        csv_path: CSV file with an 'RLO' and an 'SLO' column of learning outcomes.
        impact: value passed to Aggregation.final_similarity as ``impact``.
        sim_threshold: value passed to Aggregation.course_similarity as ``sim_threshold``.

    Returns:
        The verdict string from Aggregation.course_similarity, which is also printed.
    """
    los = pd.read_csv(csv_path)

    preprocessing_object = Preprocessing()
    tax_object = Taxonomic_Similarity()
    sem_object = Semantic_Similarity()
    agg_object = Aggregation()

    # Missing cells become the string 'nan' after str(), which remove_empty drops.
    rlo = preprocessing_object.remove_empty(list(map(str, los['RLO'].tolist())))
    slo = preprocessing_object.remove_empty(list(map(str, los['SLO'].tolist())))

    rlo_verb_list = [tax_object.detect_verbs_spacy(x) for x in rlo]
    slo_verb_list = [tax_object.detect_verbs_spacy(x) for x in slo]

    rlo_cluster = tax_object.assign_cluster(rlo_verb_list)
    slo_cluster = tax_object.assign_cluster(slo_verb_list)

    st_grid, st_list = tax_object.shift_grid(rlo_cluster, slo_cluster, rlo, slo)
    # shift_grid is laid out RLO x SLO while similarity_grid's list is SLO x RLO;
    # transpose so both grids line up position by position.
    st_list = [list(x) for x in zip(*st_list)]

    sim_grid, similarity_list = sem_object.similarity_grid(rlo, slo)
    final_sim_list = agg_object.final_similarity(similarity_list, st_list, impact)
    result = agg_object.course_similarity(final_sim_list, sim_threshold)
    print("The two courses taken into consideration are " + result)
    return result

In [None]:
# Entry point: run the course comparison when this module is the main program.
if __name__ == "__main__":
    test_function()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
7
9
[2, 4, 5, 4, 4, 5, 4] [2, 4, 4, 4, 5, 4, 5, 4, 5]
[True, False, True, True, False, True, False]
The two courses taken into consideration are Similar
