In [2]:
import pandas as pd
import re
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import torch
import transformers as ppb
import numpy as np
from numpy.linalg import norm
import xml.etree.ElementTree as ET


In [5]:

def read_xml(path=r"grade_data.xml"):
    """Read an XML file and return the root element of the parsed document.

    Args:
        path: Location of the XML file. Defaults to ``grade_data.xml`` in the
            current working directory, so existing zero-argument calls behave
            as before.

    Returns:
        xml.etree.ElementTree.Element: root element of the document.
    """
    # The context manager closes the handle as soon as the text is read; the
    # earlier bare open(...).read() never closed it explicitly.
    with open(path, 'r') as xml_file:
        xml_data = xml_file.read()
    return ET.XML(xml_data)  # Parse XML

def convert_xml_to_dataframe(root):
    """Flatten the children of ``root`` into a DataFrame, one row per child.

    Every sub-element of a child contributes one cell:

    * ``MetaInfo`` elements contribute their ``TaskID`` attribute,
    * ``Annotation`` elements contribute their ``Label`` attribute,
    * any other element contributes its text.

    Column names are the tags of the first child's sub-elements, in document
    order.

    Args:
        root: Parsed XML element whose children are the records.

    Returns:
        pandas.DataFrame: one row per record, one column per field.
    """
    # Tags whose value is stored in an attribute instead of the element text.
    attribute_for_tag = {"MetaInfo": "TaskID", "Annotation": "Label"}

    def cell_value(field):
        attribute = attribute_for_tag.get(field.tag)
        if attribute is None:
            return field.text
        return field.get(attribute)

    records = list(root)
    rows = [[cell_value(field) for field in record] for record in records]
    # Only the first record is consulted for the header.
    header = [field.tag for record in records[:1] for field in record]

    df_all_data = pd.DataFrame(rows)
    df_all_data.columns = header
    return df_all_data

def extract_labels_from_Annotation_column(annotation:str):
    """Return the 1-based position of the first digit ``1`` in ``annotation``.

    The annotation strings shown in this notebook look like
    ``correct(0)|correct_but_incomplete(1)|...``; with that layout the result
    is the 1-based index of the category whose flag is set.

    Args:
        annotation: Annotation string containing one digit per category.

    Returns:
        int: 1-based position of the first ``1`` among all digits found.

    Raises:
        ValueError: if no digit ``1`` occurs in ``annotation``.
    """
    # Raw string: a backslash-d inside a plain literal is an invalid escape
    # sequence, which recent Python versions report with a warning.
    annotation_to_numbers = re.findall(r"\d", annotation)
    if "1" not in annotation_to_numbers:
        raise ValueError(f"no category is flagged with 1 in annotation: {annotation!r}")
    return annotation_to_numbers.index("1") + 1


def add_label_column_to_dataframe(df_all_data):
    """Add a ``Label`` column derived from the ``Annotation`` column.

    Each annotation string is converted with
    ``extract_labels_from_Annotation_column``. The frame is modified in place
    and also returned.
    """
    annotations = df_all_data['Annotation']
    labels = annotations.apply(extract_labels_from_Annotation_column)
    df_all_data['Label'] = labels
    return df_all_data

def convert_ReferenceAnswers_as_list_of_answers(ReferenceAnswers:str):
    """Split a numbered, newline-separated block of answers into a list.

    The input is expected to hold one answer per line, each line starting
    with its number and a colon, with a newline before the first answer and
    after the last one. Whitespace following the colon is kept.

    Args:
        ReferenceAnswers: The numbered answers as one string.

    Returns:
        list[str]: the answers without their numbering, in input order.
    """
    # Raw string avoids the invalid backslash-d escape of a plain literal.
    # Anchoring at line starts (MULTILINE) keeps text such as a "2:1" ratio
    # inside an answer intact, and the + quantifier removes numbers with more
    # than one digit completely.
    without_numbering = re.sub(r"^\d+:", "", ReferenceAnswers, flags=re.MULTILINE)

    # Drop only the wrapping newlines; blind [1:-1] slicing would remove real
    # characters when the input is not wrapped in newlines.
    if without_numbering.startswith("\n"):
        without_numbering = without_numbering[1:]
    if without_numbering.endswith("\n"):
        without_numbering = without_numbering[:-1]

    list_of_answers = without_numbering.split("\n")
    return list_of_answers

def add_list_of_answers_as_new_columns(df_all_data):
    """Add a ``list_of_answers`` column built from ``ReferenceAnswers``.

    Each reference-answer string is converted with
    ``convert_ReferenceAnswers_as_list_of_answers``. The frame is modified in
    place and also returned.
    """
    reference_text = df_all_data['ReferenceAnswers']
    answers_per_row = reference_text.apply(convert_ReferenceAnswers_as_list_of_answers)
    df_all_data['list_of_answers'] = answers_per_row
    return df_all_data


def import_Bert_model():
    """Load the pretrained DistilBERT tokenizer and model.

    Returns:
        tuple: ``(tokenizer, model)``, both created with ``from_pretrained``
        from the ``'distilbert-base-uncased'`` weights.
    """
    pretrained_name = 'distilbert-base-uncased'
    tokenizer = ppb.DistilBertTokenizer.from_pretrained(pretrained_name)
    model = ppb.DistilBertModel.from_pretrained(pretrained_name)
    return tokenizer, model
        
def text_to_BERT_Features(text:str, tokenizer=None, model=None, max_len:int=100):
    """Encode ``text`` and return the model's output vector at position 0.

    Args:
        text: The text to embed.
        tokenizer: Object with an ``encode(text, add_special_tokens=True)``
            method returning a list of token ids. Defaults to the
            notebook-level ``Bert_tokenizer``.
        model: Callable taking ``(input_ids, attention_mask=...)`` whose
            result's first element is indexed as ``[batch, position, hidden]``.
            Defaults to the notebook-level ``Bert_model``.
        max_len: Length to pad the token ids to. Inputs longer than this are
            neither padded nor truncated. The default of 100 matches the
            value previously hard-coded here.

    Returns:
        numpy.ndarray: 1-D vector taken from the first sequence position of
        the first output (presumably the [CLS] embedding; confirm against the
        model documentation).
    """
    # Fall back to the notebook globals so one-argument calls, such as those
    # made through Series.apply, keep working unchanged.
    if tokenizer is None:
        tokenizer = Bert_tokenizer
    if model is None:
        model = Bert_model

    tk = tokenizer.encode(text, add_special_tokens=True)

    # Right-pad with id 0. A negative repeat count gives an empty list, so
    # over-long inputs pass through at their full length.
    pad = np.array([tk + [0]*(max_len-len(tk))])
    # Positions holding id 0 are excluded from attention.
    attention_mask = np.where(pad != 0, 1, 0)

    input_ids = torch.tensor(pad)
    attention_mask = torch.tensor(attention_mask)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        last_hidden_states = model(input_ids.to(torch.long), attention_mask=attention_mask)
    features = last_hidden_states[0][:,0,:].numpy()
    return features[0]

def list_of_text_to_BERT_Features(list_of_text:list):
    """Return one ``text_to_BERT_Features`` vector per text, in input order."""
    return [text_to_BERT_Features(single_text) for single_text in list_of_text]

def new_column_for_Answers_BERT_features(df_all_data):
    """Add ``Answers_Bert_Features``: one feature vector per ``Answer`` text.

    The frame is modified in place and also returned.
    """
    answer_text = df_all_data['Answer']
    df_all_data['Answers_Bert_Features'] = answer_text.apply(text_to_BERT_Features)
    return df_all_data

def new_column_for_ReferenceAnswers_BERT_features(df_all_data):
    """Add ``list_of_ReferenceAnswers_Bert_Features`` to the frame.

    Each entry of ``list_of_answers`` is mapped through
    ``list_of_text_to_BERT_Features``, giving a list of feature vectors per
    row. The frame is modified in place and also returned.
    """
    answers_per_row = df_all_data['list_of_answers']
    features_per_row = answers_per_row.apply(list_of_text_to_BERT_Features)
    df_all_data['list_of_ReferenceAnswers_Bert_Features'] = features_per_row
    return df_all_data


def cosine_similarity(array1,array2):
    """Return the dot product of two vectors divided by the product of their norms."""
    return np.dot(array1,array2)/(norm(array1)*norm(array2))

def find_the_highest_similarity_answer(list_of_array,array1):
    """Return the candidate whose cosine similarity to ``array1`` is highest.

    Args:
        list_of_array: Non-empty sequence of candidate vectors.
        array1: Vector that every candidate is compared with.

    Returns:
        The element of ``list_of_array`` with the largest cosine similarity;
        on ties the earliest such element is returned.

    Raises:
        ValueError: if ``list_of_array`` is empty.
    """
    if len(list_of_array) == 0:
        raise ValueError("list_of_array must contain at least one candidate")

    # Start below every possible cosine value. The previous starting value of
    # 0 left the result unbound whenever no similarity was strictly positive.
    highest_similarity = -np.inf
    # Fallback in case no comparison succeeds (e.g. every similarity is NaN).
    highest_array = list_of_array[0]
    for array2 in list_of_array:
        new_similarity = cosine_similarity(array1, array2)
        if new_similarity > highest_similarity:
            highest_similarity = new_similarity
            highest_array = array2
    return highest_array
        
def add_new_column_contain_highest_similarity(df_all_data):
    """Add ``highest_answers``: the best-matching reference vector per row.

    For every row, the vectors in ``list_of_ReferenceAnswers_Bert_Features``
    are compared with that row's ``Answers_Bert_Features`` through
    ``find_the_highest_similarity_answer`` and the selected vector is stored.
    The frame is modified in place and also returned.
    """
    reference_features = list(df_all_data.list_of_ReferenceAnswers_Bert_Features)
    answer_features = df_all_data.Answers_Bert_Features
    best_matches = [
        find_the_highest_similarity_answer(candidates, answer_vector)
        for candidates, answer_vector in zip(reference_features, answer_features)
    ]
    df_all_data['highest_answers'] = best_matches
    return df_all_data
    
def concatenate_all_features(df_all_data):
    """Add ``all_features``: answer vector followed by matched reference vector.

    For each row, ``Answers_Bert_Features`` and ``highest_answers`` are joined
    end to end with ``np.concatenate`` along axis 0. The frame is modified in
    place and also returned.

    Args:
        df_all_data: Frame holding the two feature columns named above.

    Returns:
        pandas.DataFrame: the same frame with the ``all_features`` column set.
    """
    # Walk both columns positionally. Looking rows up as Series[i] for i in
    # range(len(df)) is a label lookup, which fails or selects the wrong row
    # whenever the index is not exactly 0..n-1 (for example after filtering).
    list_of_concatenated_arrays = [
        np.concatenate((answer_features, reference_features), axis=0)
        for answer_features, reference_features in zip(
            df_all_data['Answers_Bert_Features'], df_all_data['highest_answers']
        )
    ]
    df_all_data['all_features'] = list_of_concatenated_arrays
    return df_all_data


In [11]:
# Parse the XML source, then derive the label and reference-answer columns.
root = read_xml()
df_all_data = (
    convert_xml_to_dataframe(root)
    .pipe(add_label_column_to_dataframe)
    .pipe(add_list_of_answers_as_new_columns)
)
df_all_data

Unnamed: 0,MetaInfo,ProblemDescription,Question,Answer,Annotation,ReferenceAnswers,Label,list_of_answers
0,LP03_PR09.bLK.sh,"A car windshield collides with a mosquito, squ...",How does Newton's third law apply to this situ...,the windshield will apply a force to the mosqu...,correct(0)|correct_but_incomplete(1)|contradic...,\n1: Since the windshield exerts a force on t...,2,[ Since the windshield exerts a force on the ...
1,FM_LV04_PR05.sh,Two hockey players pass a puck between them on...,What forces are acting on the puck while the p...,The normal force coming from the ice and the g...,correct(1)|correct_but_incomplete(0)|contradic...,\n1: The forces acting on the puck while it i...,1,[ The forces acting on the puck while it is b...
2,FM_LVxx_PR01,A rocket pushes a meteor with constant force. ...,Can you articulate Newton's second law?,"if there is a zero net force on the object, th...",correct(0)|correct_but_incomplete(0)|contradic...,\n1: Newton's 2nd Law says that the net force...,4,[ Newton's 2nd Law says that the net force is...
3,LP03_PR09.bLK.sh,"A car windshield collides with a mosquito, squ...",Can you articulate a principle or definition w...,An equal force always balancing it out regardl...,correct(0)|correct_but_incomplete(0)|contradic...,"\n1: For every action, there is an equal and ...",4,"[ For every action, there is an equal and opp..."
4,FM_LV04_PR05,Two hockey players pass a puck between them on...,"Based on Newton's first law, what can you say ...",The speed of the puck will equal to the net fo...,correct(0)|correct_but_incomplete(0)|contradic...,\n1: The puck will move in a straight line wi...,4,[ The puck will move in a straight line with ...
...,...,...,...,...,...,...,...,...
893,FM_LV03_PR07,A mover pushes a desk with a horizontal force ...,What can you say about the speed of the desk?,The speed of the desk will double because the ...,correct(0)|correct_but_incomplete(0)|contradic...,\n1: The desk moves with increasing velocity....,4,"[ The desk moves with increasing velocity., ..."
894,LP03_PR12.push.accel.bMLK,"While speeding up, a large truck pushes a smal...",How does Newton's third law apply to this situ...,since the truck is a force pushing upon the ca...,correct(0)|correct_but_incomplete(1)|contradic...,\n1: The force from the car on the truck and ...,2,[ The force from the car on the truck and the...
895,LP03_PR09.bLK,"A car windshield collides with a mosquito, squ...",Can you articulate a principle or definition w...,the action and reaction forces here have to be...,correct(0)|correct_but_incomplete(1)|contradic...,"\n1: For every action, there is an equal and ...",2,"[ For every action, there is an equal and opp..."
896,FM_LV04_PR05,Two hockey players pass a puck between them on...,Can you articulate Newton's first law?,an object will stay at rest or at constant vel...,correct(1)|correct_but_incomplete(0)|contradic...,\n1: An object at rest will stay at rest and ...,1,[ An object at rest will stay at rest and at ...


In [8]:
# Raw reference-answer text of the first row, before it is split into a list.
df_all_data['ReferenceAnswers'].iloc[0]

'\n1:  Since the windshield exerts a force on the mosquito, which we can call action, the mosquito exerts an equal and opposite force on the windshield, called the reaction.\n2:  The action  is the windshield squashing the mosquito, and the equal and opposite reaction is the mosquito hitting the windshield.\n3:  The force exerted by the windshield on the mosquito and the force exerted by the mosquito on the windshield are a third-law pair of action and reaction.\n4:  The force exerted by the windshield on the mosquito and the force exerted by the mosquito on the windshield are an action-reaction pair\n'

In [16]:
# Two-row toy frame used to try out the TF-IDF helpers on a tiny input.
df_for_test = pd.DataFrame(
    dict(
        Answer=[
            "An equal force always balancing",
            "an object will stay at rest or at constant",
        ],
        ReferenceAnswers=[
            "\n1:force balancing \n2:an object wil \n",
            "\n1:hi\n2:there\n",
        ],
    )
)

df_for_test

Unnamed: 0,Answer,ReferenceAnswers
0,An equal force always balancing,\n1:force balancing \n2:an object wil \n
1,an object will stay at rest or at constant,\n1:hi\n2:there\n


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-grams, fitted on the toy reference answers. Only the
# fitted vectorizer is needed here, so no document-term matrix is produced.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vectorizer.fit(df_for_test['ReferenceAnswers'].tolist())
def text_to_tfidf_Features(text:str):
    """Return the TF-IDF weights of ``text`` as a flat 1-D NumPy array.

    The text is transformed with the notebook-level ``vectorizer``; the
    resulting single-row sparse matrix is made dense and flattened.
    """
    sparse_row = vectorizer.transform([text])
    # Flattening a single row gives the same vector as flattening its transpose.
    dense_row = np.asarray(sparse_row.todense())
    return dense_row.reshape(-1)
# Smoke check: vectorise a word that occurs in the fitted reference answers
# and confirm the helper returns a plain NumPy array.
features_vector=text_to_tfidf_Features("hi")
type(features_vector)

numpy.ndarray

In [3]:
# Stage 1: parse the XML and derive the label and reference-answer columns.
root = read_xml()
df_all_data = (
    convert_xml_to_dataframe(root)
    .pipe(add_label_column_to_dataframe)
    .pipe(add_list_of_answers_as_new_columns)
)

# Stage 2: load DistilBERT. The feature helpers read these two module-level
# names, so they must exist before any embedding is computed.
Bert_tokenizer, Bert_model = import_Bert_model()

# Stage 3: embed answers and references, pick the closest reference for each
# answer, and join both vectors into one feature vector per row.
df_all_data = (
    df_all_data
    .pipe(new_column_for_Answers_BERT_features)
    .pipe(new_column_for_ReferenceAnswers_BERT_features)
    .pipe(add_new_column_contain_highest_similarity)
    .pipe(concatenate_all_features)
)


In [6]:
# One fixed seed for both the split and the forest, so the reported score can
# be reproduced. Without it every run draws a different split and a different
# forest, and therefore prints a different F1 value.
RANDOM_SEED = 42

# Hold out 20% of the rows for evaluation.
train_features, test_features, train_labels, test_labels = train_test_split(
    list(df_all_data.all_features),
    list(df_all_data.Label),
    test_size=0.2,
    random_state=RANDOM_SEED,
)

classifier = RandomForestClassifier(random_state=RANDOM_SEED)
classifier.fit(train_features, train_labels)
predictions = classifier.predict(test_features)

# Macro averaging weights every label class equally, regardless of its size.
score = f1_score(test_labels, predictions, average='macro')
print("Random Forest Classifier F1 Score=",score)


Random Forest Classifier F1 Score= 0.9953668792293509
