In [1]:
! pip install jiwer



In [2]:
import pandas as pd
import ast
import csv

In [3]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from jiwer import wer


In [4]:

data = pd.read_csv('/content/qna_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,User Query,FAISS Search,ANN Search,Bert Similarity,original_answers
0,0,Design load calculation method,[['load combination Engineering judgment shall...,[['gas duct sizing calculation Critical pipe s...,"['load combination', 'load combination', 'stru...",ASCE/SEI 7-05
1,1,Structural design software and version,[['DIPWP PROJECT Document Title DESIGN CRITERI...,[['DIPWP PROJECT Document Title DESIGN CRITERI...,"['Ultimate Strength Design', 'BS 8102 Code', '...",STAAD Pro V8
2,2,Structural design life,[['Title DESIGN CRITERIA DOCUMENT CIVIL STRUCT...,[['Celsius C 5 DESIGN LIFE All civil structura...,"['foundations', 'Ultimate Strength Design', 'D...",30 years
3,3,Basic wind speed,[['determined by the basic equation P qzGCP qi...,[['than 60 of operating speed For pump unit ha...,"['Non Building Structures', 'Non Building Stru...",55ms
4,4,period of basic wind speed,[['7 05 V Basic Wind Speed m/s Wind Force Buil...,[['times and loading rates of power units and ...,"['m/s', 'm/s', 'Full load rejection tests to m...",3 sec gust


In [5]:
new_df = data[['User Query','Bert Similarity','original_answers']]

In [6]:
new_df.head(3)

Unnamed: 0,User Query,Bert Similarity,original_answers
0,Design load calculation method,"['load combination', 'load combination', 'stru...",ASCE/SEI 7-05
1,Structural design software and version,"['Ultimate Strength Design', 'BS 8102 Code', '...",STAAD Pro V8
2,Structural design life,"['foundations', 'Ultimate Strength Design', 'D...",30 years


In [7]:

for i in range(len(new_df['Bert Similarity'])):
  new_df['Bert Similarity'][i] = ast.literal_eval(new_df['Bert Similarity'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
new_df['Bert Similarity'][10]

['Flexible coupling between blower and motor',
 'slide rails',
 'Carbon Steel Bolts',
 'Structural Bolts']

In [9]:
final_df = new_df.explode('Bert Similarity')

In [10]:
final_df.head(10)

Unnamed: 0,User Query,Bert Similarity,original_answers
0,Design load calculation method,load combination,ASCE/SEI 7-05
0,Design load calculation method,load combination,ASCE/SEI 7-05
0,Design load calculation method,structural calculation with seismic load calcu...,ASCE/SEI 7-05
0,Design load calculation method,strength and stresses,ASCE/SEI 7-05
1,Structural design software and version,Ultimate Strength Design,STAAD Pro V8
1,Structural design software and version,BS 8102 Code,STAAD Pro V8
1,Structural design software and version,Ultimate Strength Design,STAAD Pro V8
1,Structural design software and version,in house developed worksheets,STAAD Pro V8
2,Structural design life,foundations,30 years
2,Structural design life,Ultimate Strength Design,30 years


In [11]:
final_df.reset_index(inplace=True)

In [12]:
final_df = final_df.drop("index", axis = 1)

In [13]:
final_df.head(10)

Unnamed: 0,User Query,Bert Similarity,original_answers
0,Design load calculation method,load combination,ASCE/SEI 7-05
1,Design load calculation method,load combination,ASCE/SEI 7-05
2,Design load calculation method,structural calculation with seismic load calcu...,ASCE/SEI 7-05
3,Design load calculation method,strength and stresses,ASCE/SEI 7-05
4,Structural design software and version,Ultimate Strength Design,STAAD Pro V8
5,Structural design software and version,BS 8102 Code,STAAD Pro V8
6,Structural design software and version,Ultimate Strength Design,STAAD Pro V8
7,Structural design software and version,in house developed worksheets,STAAD Pro V8
8,Structural design life,foundations,30 years
9,Structural design life,Ultimate Strength Design,30 years


In [14]:
final_df.shape[0]

80

# Consine prediction

In [15]:
def cosine_metric(dataFrame):
  cosine_score = []
  cv = CountVectorizer(max_features=500)
  for i in range(dataFrame.shape[0]):
    answers = [dataFrame.iloc[i,1], dataFrame.iloc[i,2]]
  
    vectors = cv.fit_transform(answers).toarray()
    similarity = cosine_similarity(vectors)
    cosine_score.append(similarity[1][0])
  return cosine_score





In [16]:
len(cosine_metric(final_df))

80

# WER prediction

In [17]:
def wer_metric(dataFrame):
  from jiwer import wer
  wer_score = []
  for i in range(dataFrame.shape[0]):
    predicted = dataFrame.iloc[i,1]
    original = dataFrame.iloc[i,2]
    similarity = 1 - wer(original, predicted)
    wer_score.append(similarity)
  return wer_score

In [18]:
len(wer_metric(final_df))

80

# F1 Score prediction

In [19]:
def F1_metric(dataFrame):
  F1score = []
  for i in range(dataFrame.shape[0]):
    pred_tokens = dataFrame.iloc[i,1].split()
    truth_tokens = dataFrame.iloc[i,2].split()
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
      F1score.append(int(pred_tokens == truth_tokens))
    common_tokens = set(pred_tokens) & set(truth_tokens)
    if len(common_tokens) == 0:
        F1score.append(0)
    else:
      prec = len(common_tokens) / len(pred_tokens)
      rec = len(common_tokens) / len(truth_tokens)
      F1score.append(2 * (prec * rec) / (prec + rec))
  return F1score

In [20]:
len(F1_metric(final_df))

80

# Metric Evaluation

In [25]:
def get_evaluation_results(dataFrame, metric_names):


  
   
  


  for metric_name in metric_names:
    if metric_name == 'cosine':
      result = cosine_metric(dataFrame)
      dataFrame['cosine_result'] = result

    if metric_name == 'wer':
      result =  wer_metric(dataFrame)
      dataFrame['wer_result'] = result

    if metric_name == "F1Score":
      result =  F1_metric(dataFrame)

      dataFrame['F1Score_result'] = result
   

  return dataFrame

  

  
  
  

In [26]:
dataFrame = final_df
metric_names = ['cosine', 'wer', 'F1Score']

In [27]:
get_evaluation_results(dataFrame, metric_names)

Unnamed: 0,User Query,Bert Similarity,original_answers,cosine_result,wer_result,F1Score_result
0,Design load calculation method,load combination,ASCE/SEI 7-05,0.000000,0.000000,0.00
1,Design load calculation method,load combination,ASCE/SEI 7-05,0.000000,0.000000,0.00
2,Design load calculation method,structural calculation with seismic load calcu...,ASCE/SEI 7-05,0.000000,-2.000000,0.00
3,Design load calculation method,strength and stresses,ASCE/SEI 7-05,0.000000,-0.500000,0.00
4,Structural design software and version,Ultimate Strength Design,STAAD Pro V8,0.000000,0.000000,0.00
...,...,...,...,...,...,...
75,"Non headed Anchor Bolts, Studs orThreaded Bars",all anchor bolts nuts and washers shall be Hot...,AISC sections conforming to ASTM A36 or ASTM A...,0.000000,0.000000,0.00
76,Galvanizing standard,ASTM A234 WPB / IS 1239,galvanized in accordance with BSI BS EN ISO 14...,0.091287,0.052632,0.08
77,Galvanizing standard,EPDM/ Teflon Butt welded 8,galvanized in accordance with BSI BS EN ISO 14...,0.000000,0.000000,0.00
78,Galvanizing standard,High Temperature or High Pressure Service,galvanized in accordance with BSI BS EN ISO 14...,0.144338,0.052632,0.08


In [34]:
repeat_mul = len(final_df)


80

In [33]:
index = [1,2,3,4] * 20

In [35]:
final_df['predicted_ans_index'] = index

In [36]:
final_df.head(20)

Unnamed: 0,User Query,Bert Similarity,original_answers,cosine_result,wer_result,F1Score_result,predicted_ans_index
0,Design load calculation method,load combination,ASCE/SEI 7-05,0.0,0.0,0.0,1
1,Design load calculation method,load combination,ASCE/SEI 7-05,0.0,0.0,0.0,2
2,Design load calculation method,structural calculation with seismic load calcu...,ASCE/SEI 7-05,0.0,-2.0,0.0,3
3,Design load calculation method,strength and stresses,ASCE/SEI 7-05,0.0,-0.5,0.0,4
4,Structural design software and version,Ultimate Strength Design,STAAD Pro V8,0.0,0.0,0.0,1
5,Structural design software and version,BS 8102 Code,STAAD Pro V8,0.0,0.0,0.0,2
6,Structural design software and version,Ultimate Strength Design,STAAD Pro V8,0.0,0.0,0.0,3
7,Structural design software and version,in house developed worksheets,STAAD Pro V8,0.0,-0.333333,0.0,4
8,Structural design life,foundations,30 years,0.0,0.0,0.0,1
9,Structural design life,Ultimate Strength Design,30 years,0.0,-0.5,0.0,2


In [37]:
final_df = final_df[['User Query','Bert Similarity','Bert Similarity','original_answers','predicted_ans_index','cosine_result','wer_result','F1Score_result']]

In [38]:
final_df.head(10)

Unnamed: 0,User Query,Bert Similarity,Bert Similarity.1,original_answers,predicted_ans_index,cosine_result,wer_result,F1Score_result
0,Design load calculation method,load combination,load combination,ASCE/SEI 7-05,1,0.0,0.0,0.0
1,Design load calculation method,load combination,load combination,ASCE/SEI 7-05,2,0.0,0.0,0.0
2,Design load calculation method,structural calculation with seismic load calcu...,structural calculation with seismic load calcu...,ASCE/SEI 7-05,3,0.0,-2.0,0.0
3,Design load calculation method,strength and stresses,strength and stresses,ASCE/SEI 7-05,4,0.0,-0.5,0.0
4,Structural design software and version,Ultimate Strength Design,Ultimate Strength Design,STAAD Pro V8,1,0.0,0.0,0.0
5,Structural design software and version,BS 8102 Code,BS 8102 Code,STAAD Pro V8,2,0.0,0.0,0.0
6,Structural design software and version,Ultimate Strength Design,Ultimate Strength Design,STAAD Pro V8,3,0.0,0.0,0.0
7,Structural design software and version,in house developed worksheets,in house developed worksheets,STAAD Pro V8,4,0.0,-0.333333,0.0
8,Structural design life,foundations,foundations,30 years,1,0.0,0.0,0.0
9,Structural design life,Ultimate Strength Design,Ultimate Strength Design,30 years,2,0.0,-0.5,0.0
