In [1]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
from flair.data import Sentence
from flair.models import SequenceTagger
from scipy.optimize import linear_sum_assignment
from nltk.tokenize import sent_tokenize
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


2022-06-09 14:09:23,506 loading file /Users/elantonfernandes/.flair/models/chunk-english/5b53097d6763734ee8ace8de92db67a1ee2528d5df9c6d20ec8e3e6f6470b423.d81b7fd7a38422f2dbf40f6449b1c63d5ae5b959863aa0c2c1ce9116902e8b22
2022-06-09 14:09:24,919 SequenceTagger predicts: Dictionary with 45 tags: <unk>, O, B-NP, E-NP, I-NP, S-PP, S-VP, S-SBAR, S-ADVP, S-NP, S-ADJP, B-VP, E-VP, B-PP, E-PP, I-VP, S-PRT, B-ADVP, E-ADVP, B-ADJP, E-ADJP, B-CONJP, I-CONJP, E-CONJP, I-ADJP, B-SBAR, E-SBAR, S-INTJ, I-ADVP, I-PP, B-UCP, I-UCP, E-UCP, S-LST, B-PRT, I-PRT, E-PRT, S-CONJP, B-INTJ, E-INTJ, I-INTJ, B-LST, E-LST, <START>, <STOP>


In [None]:
class ActiveLearner:
    """Interactive active-learning helper for feedback and score prediction.

    The class embeds text with a SentenceTransformer model, chunks answers
    with a flair ``SequenceTagger`` and trains two modAL learners: one that
    predicts a feedback ID and one that predicts a score label.

    Feedback statements live in a CSV file with a ``Feedbacks`` column; the
    row position of a statement is its feedback ID.

    NOTE: this class keeps its original name even though it shadows
    ``modAL.models.ActiveLearner`` imported at the top of the file.
    ``create_learner`` therefore imports the modAL class under an alias.
    """

    _VALID_STRATEGIES = ('1', '2', '3')
    _FACT_COLUMNS = ['Fact', 'Score', 'Feedback']

    def __init__(self, model='all-MiniLM-L6-v2', chunker="flair/chunk-english",
                 feedback_file_name=None):
        """
        Arguments:
            model: Name of the SentenceTransformer model used for embeddings.
            chunker: Name of the flair sequence tagger used for chunking.
            feedback_file_name: CSV file with a 'Feedbacks' column. Required;
                it only has a default because it follows defaulted parameters.

        Raises:
            ValueError: If feedback_file_name is not given.
        """
        # Fail before the (slow) model loading when the argument is missing.
        if feedback_file_name is None:
            raise ValueError("feedback_file_name is required.")
        self.embedder_model = SentenceTransformer(model)
        self.tagger = SequenceTagger.load(chunker)
        self.feedback_file_name = feedback_file_name

    def get_uncertainty_samples(self, data, percentage, question_id):
        """
        Select the rows with the lowest uncertainty score for one question.

        Arguments:
            data: DataFrame with at least the columns 'Answer ID' and
                'Uncertainty Score'.
            percentage: Fraction of the matching rows to return; the row
                count is int(number_of_matching_rows * percentage).
            question_id: Value matched against the 'Answer ID' column.

        Returns:
            DataFrame with the selected rows, lowest 'Uncertainty Score' first.
        """
        question_rows = data.loc[data['Answer ID'] == question_id]
        sample_count = int(len(question_rows.index) * percentage)
        return question_rows.nsmallest(sample_count, columns="Uncertainty Score")

    def compute_uncertainty_scores(self):
        """Placeholder: not implemented yet."""
        pass

    def save_file(self):
        """Placeholder: not implemented yet."""
        pass

    def train(self, strategy, data, estimator, query_strategy,
              facts_file_name='facts.csv'):
        """
        Interactively train a feedback learner and a score learner.

        Strategy '1':
            * Each whole student answer is embedded and the oracle assigns
              one feedback and one score to it.
        Strategy '2' and '3':
            * Each student answer is broken down into its chunks.
            * The oracle forms facts from the chunks and attaches a feedback
              and a score to every fact.
            * The facts are appended to `facts_file_name` and the learners
              are taught once on every fact stored in that file.

        Arguments:
            strategy: '1', '2' or '3' (an int is accepted as well).
            data: DataFrame with the columns 'Query Answer' and
                'Student Answer'.
            estimator: Classifier used as a template for both learners; each
                learner gets its own clone so they do not share a model.
                None selects the default of `create_learner`.
            query_strategy: Query strategy forwarded to `create_learner`.
            facts_file_name: CSV file used to persist facts (strategy 2/3).

        Returns:
            Tuple (feedback_learner, score_learner).

        Raises:
            ValueError: If the strategy id is unknown.
        """
        strategy = str(strategy)
        if strategy not in self._VALID_STRATEGIES:
            raise ValueError("Wrong strategy id selected: %r" % (strategy,))

        # Local import: only needed here and keeps the file's import cell as is.
        from sklearn.base import clone

        def fresh_estimator():
            # Two learners must never be fitted on the same estimator object.
            return None if estimator is None else clone(estimator)

        feedback_learner = self.create_learner(fresh_estimator(), query_strategy)
        score_learner = self.create_learner(fresh_estimator(), query_strategy)

        if strategy == '1':
            print("Applying strategy 1")
            for _, row in data.iterrows():
                print("Model answer:\n", row['Query Answer'])
                student_answer = row['Student Answer']
                print("Student answer:\n", student_answer)

                features = self.embedder_model.encode(student_answer).reshape(1, -1)
                feedback_id = self.select_add_feedback(self.feedback_file_name)

                print("\nWhat score is assigned to this answer?")
                score = input("\nEnter score:\n").strip()

                feedback_learner.teach(features, feedback_id)
                # Scores are class labels; teach() expects an array-like.
                score_learner.teach(features, np.array([score]))
        else:
            print(f"Applying strategy {strategy}")
            new_facts = []
            for _, row in data.iterrows():
                print("Model answer:\n", row['Query Answer'])
                student_answer = row['Student Answer']
                print("Student answer:\n", student_answer)
                new_facts.extend(self._collect_facts(student_answer))

            stored_facts = self._append_facts(new_facts, facts_file_name)
            self._teach_facts(stored_facts, feedback_learner, score_learner)

        return feedback_learner, score_learner

    def _load_feedbacks(self):
        """Return the feedback statements of the feedback file as a list."""
        return pd.read_csv(self.feedback_file_name)['Feedbacks'].tolist()

    def _collect_facts(self, student_answer):
        """Ask the oracle to build facts (with feedback and score) from chunks."""
        passage = Sentence(student_answer)
        self.tagger.predict(passage)
        chunks = passage.get_spans('np')
        for ind, chunk in enumerate(chunks):
            print(ind, ":", chunk.text)
        print("Select chunks to form facts from the above list")

        collected = []
        while True:
            answer = input("Do you wish to create a fact? y:1/n:0?").strip().lower()
            if answer not in ('1', 'y', 'yes'):
                break

            chunk_texts = []
            while True:
                entry = input("Enter the chunk id related to one fact or x to exit").strip()
                if entry == 'x':
                    break
                try:
                    chunk_texts.append(chunks[int(entry)].text)
                except (ValueError, IndexError):
                    print("Invalid chunk id:", entry)

            if not chunk_texts:
                print("No chunks selected, fact skipped.")
                continue

            feedback_id = self.select_add_feedback(self.feedback_file_name)
            # Re-read the file: the oracle may just have added this feedback.
            feedback_text = self._load_feedbacks()[int(feedback_id[0])]

            print("\nWhat score contribution would you associate with this fact?")
            score = input("\nEnter score:\n").strip()

            collected.append({'Fact': ' '.join(chunk_texts),
                              'Score': score,
                              'Feedback': feedback_text})

        print("\nFact list", [fact['Fact'] for fact in collected])
        print("\nFeedback list", [fact['Feedback'] for fact in collected])
        print("\nScore list", [fact['Score'] for fact in collected])
        return collected

    def _append_facts(self, new_facts, facts_file_name):
        """Append new facts to the facts CSV and return all stored facts."""
        try:
            # Read everything as text so score labels survive the round trip.
            stored = pd.read_csv(facts_file_name, dtype=str)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            stored = pd.DataFrame(columns=self._FACT_COLUMNS)

        if new_facts:
            fresh = pd.DataFrame(new_facts, columns=self._FACT_COLUMNS)
            stored = pd.concat([stored, fresh], ignore_index=True)
            stored.to_csv(facts_file_name, index=False)
        return stored

    def _teach_facts(self, facts, feedback_learner, score_learner):
        """Teach both learners on every stored fact with a known feedback."""
        feedback_ids = {}
        for position, statement in enumerate(self._load_feedbacks()):
            # Keep the first position of duplicated statements.
            feedback_ids.setdefault(statement, position)

        for _, fact_row in facts.dropna(subset=['Fact', 'Feedback']).iterrows():
            feedback_text = fact_row['Feedback']
            if feedback_text not in feedback_ids:
                print("Skipping fact with unknown feedback:", feedback_text)
                continue
            features = self.embedder_model.encode(fact_row['Fact']).reshape(1, -1)
            feedback_learner.teach(
                features, np.array([feedback_ids[feedback_text]], dtype=int))
            score_learner.teach(features, np.array([str(fact_row['Score'])]))

    def predict(self, data, strategy, feedback_learner, score_learner):
        """
        Predict feedback and score for every row of `data` and print them.

        Strategy '1':
            * The whole student answer is embedded and classified.
        Strategy '2':
            * Model and student answer are made into chunks.
            * Linear sum assignment is applied to these chunks to find the
              best possible pairs.
            * The matching chunks from the student answer are joined into a
              sentence again, which is used for the prediction.
        Strategy '3':
            * The student answer is split into its individual sentences.
            * Each sentence is classified on its own.

        Arguments:
            data: DataFrame with the column 'Student Answer' (and
                'Query Answer' for strategies '1' and '2').
            strategy: '1', '2' or '3' (an int is accepted as well).
            feedback_learner: Trained learner predicting a feedback ID.
            score_learner: Trained learner predicting a score label.

        Returns:
            One entry per row. Strategies '1' and '2' yield a
            (feedback, score) tuple; strategy '3' yields a list of
            [feedback, score] pairs, one per sentence.

        Raises:
            ValueError: If the strategy id is unknown.
        """
        strategy = str(strategy)
        if strategy not in self._VALID_STRATEGIES:
            raise ValueError("Wrong strategy id selected: %r" % (strategy,))

        feedbacks = pd.read_csv(self.feedback_file_name)['Feedbacks'].tolist()
        handlers = {
            '1': self._predict_whole_answer,
            '2': self._predict_matched_chunks,
            '3': self._predict_sentences,
        }
        handler = handlers[strategy]
        return [handler(row, feedbacks, feedback_learner, score_learner)
                for _, row in data.iterrows()]

    def _predict_whole_answer(self, row, feedbacks, feedback_learner, score_learner):
        """Strategy 1: classify the embedding of the whole student answer."""
        print("\n==========================================")
        print("Model answer:\n", row['Query Answer'])
        student_answer = row['Student Answer']
        print("Student answer:\n", student_answer)

        features = self.embedder_model.encode(student_answer).reshape(1, -1)
        feedback_text = feedbacks[feedback_learner.predict(features)[0]]
        print("Corresponding feedback:\n", feedback_text)

        score = score_learner.predict(features)
        print("Corresponding score:\n", score)
        return feedback_text, score[0]

    def _predict_matched_chunks(self, row, feedbacks, feedback_learner, score_learner):
        """Strategy 2: classify the student chunks matched to the model answer."""
        query_answer = row['Query Answer']
        query_sentence = Sentence(query_answer)
        self.tagger.predict(query_sentence)
        query_chunks = query_sentence.get_spans('np')

        student_answer = row['Student Answer']
        passage_sentence = Sentence(student_answer)
        self.tagger.predict(passage_sentence)
        passage_chunks = passage_sentence.get_spans('np')

        similarity = self.get_chunk_similarity_matrix(query_chunks, passage_chunks)
        # Pair query and passage chunks so that the summed similarity is maximal.
        row_ind, col_ind = linear_sum_assignment(similarity, maximize=True)
        query_text = ' '.join(query_chunks[i].text for i in row_ind)
        passage_text = ' '.join(passage_chunks[j].text for j in col_ind)

        print("\n===========================================================================")
        print("\nBefore linear sum assignment")
        print("\nQuery text:\n", query_answer)
        print("\nPassage text:\n", student_answer)
        print("\nAfter linear sum assignment.")
        print("\nQuery text:\n", query_text)
        print("\nPassage text:\n", passage_text)

        features = self.embedder_model.encode(passage_text).reshape(1, -1)
        feedback_text = feedbacks[feedback_learner.predict(features)[0]]
        print("Feedback given: ", feedback_text)
        score = score_learner.predict(features)
        print("Score given: ", score)
        return feedback_text, score[0]

    def _predict_sentences(self, row, feedbacks, feedback_learner, score_learner):
        """Strategy 3: classify every sentence of the student answer."""
        student_answer = row['Student Answer']
        print("\n=============================================================")
        print("Student answer", student_answer)

        feedback_score_pairs = []
        for i, sent in enumerate(sent_tokenize(student_answer)):
            features = self.embedder_model.encode(sent).reshape(1, -1)
            feedback_text = feedbacks[feedback_learner.predict(features)[0]]
            score = score_learner.predict(features)
            feedback_score_pairs.append([feedback_text, score[0]])
            print("\nAssociated feedback(s) for sentence: ", i + 1, " : ", feedback_text,
                  "\n Associated score", score)
        return feedback_score_pairs

    def create_learner(self, estimator=None, query_strategy=None):
        """
        Create a modAL learner.

        Arguments:
            estimator: Decides which type of classifier should be used for the
                learning action. None creates a new RandomForestClassifier,
                so learners never share a default estimator instance.
            query_strategy: Decides the form of sampling that needs to be
                performed on the data pool. None selects uncertainty_sampling.

        Returns:
            A modAL ActiveLearner wrapping the estimator.
        """
        # The module-level name `ActiveLearner` refers to this class, so the
        # modAL class has to be imported under a different name.
        from modAL.models import ActiveLearner as ModalActiveLearner

        if estimator is None:
            estimator = RandomForestClassifier()
        if query_strategy is None:
            query_strategy = uncertainty_sampling
        return ModalActiveLearner(
            estimator=estimator,
            query_strategy=query_strategy)

    def select_add_feedback(self, feedback_file_name):
        """
        Let the oracle pick an existing feedback or add a new one.

        New feedback statements are appended to the 'Feedbacks' column of
        `feedback_file_name` so that their ID stays valid for later lookups.
        The prompt repeats until a valid choice has been made.

        Arguments:
            feedback_file_name: CSV file with a 'Feedbacks' column.

        Returns:
            numpy int array with one element, the selected feedback ID.
        """
        feedback_frame = pd.read_csv(feedback_file_name)
        feedbacks = feedback_frame['Feedbacks'].tolist()
        print("What feedback would you give this answer?")
        for i, statement in enumerate(feedbacks):
            print(i, ":", statement)

        while True:
            feedback_option = input(
                "\n0: Add from existing feedback.\n1: Add new feedback.").strip()
            if feedback_option == '0':
                try:
                    selected = int(input("\nSelect feedback ID: "))
                except ValueError:
                    selected = -1
                if 0 <= selected < len(feedbacks):
                    return np.array([selected], dtype=int)
                print("\nWrong ID selected: ")
            elif feedback_option == '1':
                feedback_statement = input("\nEnter new feedback: ")
                if feedback_statement in feedbacks:
                    # Reuse the existing ID instead of storing a duplicate.
                    new_id = feedbacks.index(feedback_statement)
                else:
                    new_id = len(feedbacks)
                    addition = pd.DataFrame({'Feedbacks': [feedback_statement]})
                    feedback_frame = pd.concat([feedback_frame, addition],
                                               ignore_index=True)
                    feedback_frame.to_csv(feedback_file_name, index=False)
                feedback_id = np.array([new_id], dtype=int)
                print("feedback ID: ", feedback_id)
                return feedback_id
            else:
                print("\nWrong ID selected: ")

    def generate_score(self):
        """Placeholder: not implemented yet."""
        pass

    def store_facts(self, facts_data, columns, filename):
        """
        Write facts to a CSV file without the index column.

        Arguments:
            facts_data: Row data accepted by the pandas DataFrame constructor.
            columns: Column labels for the data.
            filename: Path of the CSV file to write.
        """
        facts_df = pd.DataFrame(data=facts_data, columns=columns)
        facts_df.to_csv(filename, index=False)

    def get_facts(self, filename):
        """Read a facts CSV file and return it as a DataFrame."""
        return pd.read_csv(filename)

    def get_chunk_similarity_matrix(self, query_chunks, passage_chunks, threshold=0.7):
        """
        Compute thresholded dot-product similarities between two chunk lists.

        Arguments:
            query_chunks: Sequence of objects with a `text` attribute.
            passage_chunks: Sequence of objects with a `text` attribute.
            threshold: Scores below this value are replaced by 0.

        Returns:
            Float array of shape (len(query_chunks), len(passage_chunks)).
        """
        similarity_score_matrix = np.zeros((len(query_chunks), len(passage_chunks)))
        if similarity_score_matrix.size == 0:
            # Nothing to compare; also avoids encoding an empty batch.
            return similarity_score_matrix

        # Encode every chunk once instead of once per (query, passage) pair.
        query_vectors = np.asarray(
            self.embedder_model.encode([chunk.text for chunk in query_chunks]))
        passage_vectors = np.asarray(
            self.embedder_model.encode([chunk.text for chunk in passage_chunks]))

        scores = (query_vectors @ passage_vectors.T).astype(float)
        keep = scores >= threshold
        similarity_score_matrix[keep] = scores[keep]
        return similarity_score_matrix