In [39]:
import pandas as pd
import scipy
from scipy import stats

import sys
import sentence_transformers
from sentence_transformers import SentenceTransformer

import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize

from unidecode import unidecode

from sentence_transformers import SentenceTransformer, util

In [36]:
# Load the training data from the working directory and preview the first rows.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.head(n=5)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [None]:
# Keep every column except the last one, then preview the result.
x_train = train_df.iloc[:, :train_df.shape[1] - 1]
x_train.head(n=5)

In [41]:
# Pre-process data
# Lazily-built cache of the tokens to drop; filled on first use by _get_stopset().
_STOPSET_CACHE = None


def _get_stopset():
    """Return the English stop words plus punctuation characters, built only once."""
    global _STOPSET_CACHE
    if _STOPSET_CACHE is None:
        # A frozenset gives constant-time membership checks; the contents are the
        # same as stopwords.words('english') + list(string.punctuation).
        _STOPSET_CACHE = frozenset(stopwords.words('english')) | frozenset(string.punctuation)
    return _STOPSET_CACHE


def pre_process(corpus):
    """Normalise one phrase before it is embedded.

    Steps, in order:
      1. lower-case the text,
      2. tokenize it with ``word_tokenize`` and drop every token that is an
         English stop word or a single punctuation character,
      3. re-join the remaining tokens with single spaces,
      4. pass the result through ``unidecode`` to transliterate non-ASCII
         characters.

    Args:
        corpus: the input text (a ``str``).

    Returns:
        The cleaned text as a ``str``.

    The stop-word collection is built once and reused, instead of being rebuilt
    as a list on every call; this function is applied row by row to whole
    DataFrame columns, so the per-call rebuild was pure overhead.
    """
    stopset = _get_stopset()
    tokens = word_tokenize(corpus.lower())
    kept = " ".join(token for token in tokens if token not in stopset)
    return unidecode(kept)

# Clean both text columns in place, in the same order as before (target, then anchor).
for column_name in ('target', 'anchor'):
    train_df[column_name] = train_df[column_name].apply(pre_process)

In [42]:
# Grab the two columns we care about

# Plain Python lists of the cleaned phrases, passed to the model's encode() later.
input_anchor = train_df['anchor'].tolist()
input_target = train_df['target'].tolist()

In [44]:
# Compare model performance

def _paired_scores(score_fn, anchor_vec, target_vec):
    """Apply ``score_fn`` to each (anchor, target) embedding pair; return a list of floats."""
    # score_fn's result for one pair is indexed as [0][0] and unwrapped with
    # .item(), exactly as the per-row loops did before.
    return [
        score_fn(anchor, target)[0][0].item()
        for anchor, target in zip(anchor_vec, target_vec)
    ]


def use_model(model_name, input_anchor, input_target, scores=None):
    """Embed anchor/target phrases with one model and score it against reference labels.

    Args:
        model_name: name passed to ``SentenceTransformer`` to load the model.
        input_anchor: anchor phrases to encode.
        input_target: target phrases to encode; paired position by position
            with ``input_anchor``.
        scores: reference similarity scores, one per pair. When omitted, the
            notebook-level ``train_df.score`` column is used, which is what
            this function always read before the parameter existed.

    Returns:
        A 4-tuple ``(cos_sim, cos_sim_model_score, dot_sim, dot_sim_model_score)``:
        the per-pair cosine similarities, their Pearson correlation with
        ``scores``, the per-pair dot-product scores, and their Pearson
        correlation with ``scores``.

    Raises:
        ValueError: if the two inputs encode to a different number of vectors.
    """
    if scores is None:
        # Backward-compatible default: fall back to the global training labels.
        scores = train_df.score

    model = SentenceTransformer(model_name)
    anchor_vec = model.encode(input_anchor)
    target_vec = model.encode(input_target)
    if len(anchor_vec) != len(target_vec):
        raise ValueError(
            "input_anchor and input_target must contain the same number of phrases "
            f"(got {len(anchor_vec)} and {len(target_vec)})"
        )

    cos_sim = _paired_scores(util.cos_sim, anchor_vec, target_vec)
    dot_sim = _paired_scores(util.dot_score, anchor_vec, target_vec)

    # pearsonr also returns a p-value; only the correlation coefficient is reported.
    cos_sim_model_score, _cos_p_value = scipy.stats.pearsonr(scores, cos_sim)
    dot_sim_model_score, _dot_p_value = scipy.stats.pearsonr(scores, dot_sim)
    return cos_sim, cos_sim_model_score, dot_sim, dot_sim_model_score

In [45]:
# create an empty dataframe to hold model names and scores
model_comparisons = pd.DataFrame(
    columns=[
        'Model',                # model identifier
        'Cos_Sim_Model_Score',  # score derived from cosine similarity
        'Dot_Model_Score',      # score derived from dot-product similarity
    ]
)

In [46]:
# First model we're testing:
model_name = 'all-mpnet-base-v2'
(
    cos_sim_function_results1,
    cos_sim_function_score,
    dot_sim_results1,
    dot_sim_model_score,
) = use_model('sentence-transformers/' + model_name, input_anchor, input_target)

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with a one-row frame is the supported way to add the result row.
model_comparisons = pd.concat(
    [
        model_comparisons,
        pd.DataFrame([{
            'Model': model_name,
            'Cos_Sim_Model_Score': cos_sim_function_score,
            "Dot_Model_Score": dot_sim_model_score,
        }]),
    ],
    ignore_index=True,
)

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 588kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 62.2kB/s]
Downloading: 100%|██████████| 10.1k/10.1k [00:00<00:00, 3.38MB/s]
Downloading: 100%|██████████| 571/571 [00:00<00:00, 114kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 12.9kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 278kB/s]
Downloading: 100%|██████████| 349/349 [00:00<00:00, 117kB/s]
Downloading: 100%|██████████| 438M/438M [02:54<00:00, 2.52MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 8.85kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 79.6kB/s]
Downloading: 100%|██████████| 466k/466k [00:01<00:00, 371kB/s]  
Downloading: 100%|██████████| 363/363 [00:00<00:00, 181kB/s]
Downloading: 100%|██████████| 13.1k/13.1k [00:00<00:00, 93.0kB/s]
Downloading: 100%|██████████| 232k/232k [00:01<00:00, 230kB/s]  
loading configuration file C:\Users\I514609/.cache\torch\sentence_transformers\sentence-transformers_all-mpnet

KeyboardInterrupt: 