In [10]:
! pip install jiwer



In [11]:
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu
from jiwer import wer

# Cosine Similarity

1. Cosine similarity is a metric used to determine how similar the documents are irrespective of their size

2. It measures the cosine of the angle between two vectors projected in a multi-dimensional space
3.The two vectors are arrays containing the word counts of two documents.
4. Lower the angle means higher the similarity




In [12]:
def cosine(answer, prediction):
  '''
  input : actual answer, and predicted answer
  output: cosine similarity value

  '''
  cv = CountVectorizer(max_features=500)
  answers = [answer, prediction]
  vectors = cv.fit_transform(answers).toarray()
  similarity = cosine_similarity(vectors)
  return similarity[1][0]

In [13]:
answer1 = 'specification for Ultimate Strength'
prediction1 = 'sizing calculation structural'
cosine(answer1, prediction1)

0.0

In [14]:
answer2 = 'AIC318M 08'
prediction2 = ' AIC318M 08'
cosine(answer2, prediction2)

0.9999999999999998

In [15]:
answer3 = '30 years'
prediction3 = 'Design life of 30 years 6'
cosine(answer3, prediction3)

0.6324555320336758

In [16]:
answer4 = '1Kzt'
prediction4 = 'Kzt 1 0'
cosine(answer4, prediction4)

0.0

# Sequence Matcher 

1.  It is  used for comparing pairs of input sequences.
2. The basic idea is to find the longest contiguous matching subsequence.
3. SequenceMatcher tries to find out the output which is more human-friendly.
4. Working : Given two input strings a and b,
> a. ratio( ) returns the similarity score ( float in [0,1] ) between input strings

>>   b. It sums the sizes of all matched sequences

>>   c. calculates the ratio as: ratio = 2.0*M / T 

>>  d. where M = matches , T = total number of elements in both sequences

In [17]:
def sequence_matcher(answer, prediction):
  
  ratio = SequenceMatcher(None, answer, prediction).ratio()
  return ratio
  

In [18]:
answer1 = 'specification for Ultimate Strength'
prediction1 = 'sizing calculation structural'
sequence_matcher(answer1, prediction1)

0.375

In [19]:
answer2 = 'AIC318M 08'
prediction2 = ' AIC318M 08'
sequence_matcher(answer2, prediction2)

0.9523809523809523

In [20]:
answer3 = '30 years'
prediction3 = 'Design life of 30 years 6'
sequence_matcher(answer3, prediction3)

0.48484848484848486

In [21]:
answer4 = '1Kzt'
prediction4 = 'Kzt 1 0'
sequence_matcher(answer4, prediction4)

0.5454545454545454

# F1 Score

1. **F1-score** is primarily used to compare the performance of two classifiers.
2. F1-score calculated as follows:
>>                      2(P*R)/P+R


>> P = the precision

>> R = the recall 

In [98]:
def f1_score(answer, prediction):

  '''
    input: actuall answer, predicted answer'
    output: F1 Score
  '''

  
  answer_tokens = answer.split()
  prediction_tokens = prediction.split()
  if len(prediction_tokens) == 0 or len(answer_tokens)==0:
    fi_pred = (int(prediction_tokens == answer_tokens))
    
  common_tokens = set(prediction_tokens) & set(answer_tokens)
  if len(common_tokens) == 0:
    fi_pred = 0 
    
  else:
    precision = len(common_tokens) / len(prediction_tokens)
    recall = len(common_tokens) / len(answer_tokens)
    fi_pred = (2 * (precision * recall) / (precision + recall))
  return fi_pred


In [99]:
answer1 = 'specification for Ultimate Strength'
prediction1 = 'sizing calculation structural'
f1_score(answer1, prediction2)


0

In [100]:
answer2 = 'AIC318M 08'
prediction2 = ' AIC318M 08'
f1_score(answer2, prediction2)

1.0

In [101]:
answer3 = '30 years'
prediction3 = 'Design life of 30 years 6'
f1_score(answer3, prediction3)

0.5

In [102]:
answer4 = '1Kzt'
prediction4 = 'Kzt 1 0'
f1_score(answer4, prediction4)

0

# Word Error Rate

1. WER is the number of errors divided by the total words.
2. To get the WER, start by adding up the substitutions, insertions, and deletions that occur in a sequence of recognized words. Divide that number by the total number of words in original document. The result is the WER
3. Word Error Rate = (Substitutions + Insertions + Deletions) / Number of Words Spoken
>>**Substitution**: occurs when a word gets replaced (for example, “noose” is transcribed as “moose”)

>>>   **Insertion**:  when a word is added that wasn’t in actual document (for example, “SAT” becomes “essay tea”)

>>> **Deletion** happens when a word is left out from the predicted context completely (for example, “turn it around” becomes “turn around”)
4. Lower WER often indicates that prediction is more accurate.

In [27]:
def word_error_rate(answer, prediction):

  '''
  input : actual answer, predicted answer
  output : 
  
  '''
  
  similarity = 1 - wer(answer, prediction)
  return similarity

In [28]:
answer1 = 'specification for Ultimate Strength'
prediction1 = 'sizing calculation structural'
word_error_rate(answer1, prediction1)

0.0

In [29]:
answer2 = 'AIC318M 08'
prediction2 = ' AIC318M 08'
word_error_rate(answer2, prediction2)

1.0

In [30]:
answer3 = '30 years'
prediction3 = 'Design life of 30 years 6'
word_error_rate(answer3, prediction3)

-1.0

In [31]:
answer4 = '1Kzt'
prediction4 = 'Kzt 1 0'
word_error_rate(answer4, prediction4)

-2.0

# BLEU Score

1. The approach works by counting matching n-grams in the candidate translation to n-grams in the reference text
2.  where 1-gram or unigram would be each token and a bigram comparison would be each word pair. The comparison is made regardless of word order.
3. Indicates how similar the candidate text is to the reference texts, with values closer to one representing more similar texts.
4. BLEU compares the n-gram of the candidate translation with n-gram of the reference translation to count the number of matches

In [80]:
def bleuScore(answer, prediction):

  score = sentence_bleu(answer, prediction, weights = (0.5, 0.5))
  return score



In [81]:
answer1 = 'specification for Ultimate Strength'
prediction1 = 'sizing calculation structural'
bleuScore(answer1, prediction1)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.6158817620514396

In [82]:
answer2 = 'AIC318M 08'
prediction2 = 'AIC318M 08'
bleuScore(answer2, prediction2)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.9486832980505138

In [83]:
answer3 = '30 years'
prediction3 = 'Design life of 30 years 6'
bleuScore(answer3, prediction3)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.565685424949238

In [84]:
answer4 = '1Kzt'
prediction4 = 'Kzt 1 0'
bleuScore(answer4, prediction4)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.7559289460184544

# Comparison over results

In [89]:
answer_list = ['specification for Ultimate Strength','AIC318M 08','30 years', '1Kzt']
predicted_list = ['sizing calculation structural' , ' AIC318M 08', 'Design life of 30 years 6', 'Kzt 1 0 ']
Consine_similarity = [0.0,0.99, 0.63, 0.0]
Sequence_matcher = [0.37, 0.95, 0.48, 0.54]
F1_Score = [0.0, 1.0, 0.5, 0.0]
wer_score = [0.0, 1.0, -1.0, -2.0]
bleu_score = [0.61, 0.94, 0.56, 0.75]

In [90]:
import pandas as pd

In [96]:
df = pd.DataFrame(list(zip(answer_list, predicted_list, Consine_similarity, Sequence_matcher, F1_Score, wer_score, bleu_score )),
               columns =['actual_answer', 'predicted_answer', 'consine_similarity', 'sequence_matcher', 'F1_score', "wer_score", 'bleu_score'])


In [97]:
df.head()


Unnamed: 0,actual_answer,predicted_answer,consine_similarity,sequence_matcher,F1_score,wer_score,bleu_score
0,specification for Ultimate Strength,sizing calculation structural,0.0,0.37,0.0,0.0,0.61
1,AIC318M 08,AIC318M 08,0.99,0.95,1.0,1.0,0.94
2,30 years,Design life of 30 years 6,0.63,0.48,0.5,-1.0,0.56
3,1Kzt,Kzt 1 0,0.0,0.54,0.0,-2.0,0.75
