In [28]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is present locally before it is loaded.
nltk.download('stopwords')
# English stop words; later cells use this to drop function words before
# comparing phrases.
en_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/darshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Relative path to the CSV that lists MAGPIE idioms alongside their tokens.
magpie_token_file = "../data/token_files/option1_idioms.csv"

In [5]:
# Read the idiom/token table into a DataFrame and preview its first five rows.
df_magpie_tokens = pd.read_csv(filepath_or_buffer=magpie_token_file)
df_magpie_tokens.head(n=5)

Unnamed: 0,idiom,idiom_token
0,off the beaten track,IDoffthebeatentrackID
1,in the running,IDintherunningID
2,give someone the creeps,IDgivesomeonethecreepsID
3,do someone proud,IDdosomeoneproudID
4,take root,IDtakerootID


In [6]:
# The NCTTI data is tab-separated, so the separator is passed explicitly.
nctti_file = "../data/nctti/data_en.tsv"
df_nctti = pd.read_csv(filepath_or_buffer=nctti_file, sep="\t")
df_nctti.head(n=5)

Unnamed: 0,compound,CompScale,CompType,MeanS1,MeanS2,MeanS3,Synonyms,SynonymsS1,SynonymsS2,SynonymsS3
0,car park,PC,4.2,2.8,2.55,2.9,parking lot;parking lot;parking garage;vehicle...,garage,,
1,dream ticket,NC,1.32,1.7,1.8,1.9,perfect combination;golden ticket,,ideal,opportunity;chance
2,case study,C,3.7,3.6,4.0,3.08,example;specific example;medical trial;analysis,history;documentation,,
3,dutch courage,PC,1.0,0.8,0.8,0.7,alcohol;liquid courage;liquid courage,,hitting the bottle,
4,cash cow,NC,1.56,0.8,0.2,0.55,gold mine;money maker;moneymaker;moneymaker;st...,income,,


In [11]:
# Find common PIEs with the lowest word-level edit distances

In [10]:
# Pull the two phrase columns out as arrays: MAGPIE idioms and NCTTI compounds.
magpie_pie_list, nctti_compounds = (
    df_magpie_tokens['idiom'].values,
    df_nctti['compound'].values,
)

In [38]:
# Pair every MAGPIE idiom with every NCTTI compound and keep the pairs whose
# content words (stop words removed) are close under word-level edit distance.
SIM_THRESHOLD = 0.55   # a pair matches when its similarity is strictly above this
LEN_SMOOTHING = 0.01   # added to the idiom length in the denominator; kept so
                       # the reported scores stay identical to earlier runs

# Set membership is O(1); the NLTK stop-word list would be scanned linearly.
stopword_set = set(en_stopwords)


def _content_words(phrase):
    """Split `phrase` on whitespace and drop English stop words."""
    return [word for word in phrase.split() if word not in stopword_set]


# Tokenise each compound once up front instead of once per idiom.
nctti_tokenised = [(nc_comp, _content_words(nc_comp)) for nc_comp in nctti_compounds]

magpie_nctti_common_pairs = []
for mpie in magpie_pie_list:
    # Loop-invariant for the inner loop, so compute it once per idiom.
    mpie_words = _content_words(mpie)
    if not mpie_words:
        # An idiom made only of stop words has nothing to compare. Without this
        # guard it would score 1.0 against a compound that is also empty after
        # filtering, and could never pass the threshold against any other.
        continue

    for nc_comp, nc_comp_words in nctti_tokenised:
        # Edit distance over word lists (insert/delete/substitute whole words).
        dist_score = nltk.edit_distance(mpie_words, nc_comp_words)
        # Similarity as a ratio, normalised by the idiom's content-word count.
        sim_score = 1 - (dist_score / (len(mpie_words) + LEN_SMOOTHING))

        if sim_score > SIM_THRESHOLD:
            print(sim_score, mpie, ',', nc_comp)
            # Consider this as matching idiom
            magpie_nctti_common_pairs.append((mpie, nc_comp))

# The count is simply the number of collected pairs.
cnt = len(magpie_nctti_common_pairs)

print('-'*30)
print(f"Found {cnt} common idioms")

0.6677740863787376 keep a low profile , low profile
1.0 pecking order , pecking order
1.0 old hat , old hat
1.0 close call , close call
1.0 rock bottom , rock bottom
1.0 basket case , basket case
1.0 on cloud nine , cloud nine
0.6677740863787376 get in on the ground floor , ground floor
1.0 couch potato , couch potato
1.0 shrinking violet , shrinking violet
1.0 sitting duck , sitting duck
1.0 an old flame , old flame
1.0 banana republic , banana republic
------------------------------
Found 13 common idioms


In [47]:
# Obtain all paraphrases (annotated synonyms) for these common idioms.
# Despite its name, `list_of_paraphrases` is a set, so repeated synonyms
# collapse into a single entry.
list_of_paraphrases = set()
for mppie, nctti in magpie_nctti_common_pairs:
    # First row whose compound matches; `nctti` was taken from this same frame.
    raw_synonyms = df_nctti.loc[df_nctti['compound'] == nctti, 'Synonyms'].iloc[0]
    # An empty cell is read by pandas as NaN (a float), which has no .split();
    # treat it as "no synonyms" instead of raising AttributeError.
    synonyms = [] if pd.isna(raw_synonyms) else raw_synonyms.split(';')
    print(mppie, ':', synonyms)
    list_of_paraphrases.update(synonyms)

keep a low profile : ['low key', 'inconspicuous', 'inconspicuous', 'down low']
pecking order : ['hierarchy', 'hierarchy', 'food chain']
old hat : ['old fashioned', 'old fashioned', 'dated', 'old-fashioned', 'old-fashioned', 'out of date', 'old news', 'old news', 'uninteresting', 'sweet']
close call : ['near miss', 'close shave', 'cliffhanger', 'close one']
rock bottom : ['all time low', 'cheapest', 'lowest point', 'lowest point', 'absolute lowest']
basket case : ['crazy', 'crazy', 'nervous wreck', 'defenseless', 'lost cause']
on cloud nine : ['bliss', 'bliss', 'heaven', 'heaven', 'in heaven', 'top of the world', 'euphoria']
get in on the ground floor : ['first floor', 'first floor', 'ground level', 'bottom floor', 'bottom floor', 'basement', 'first story', 'sweet']
couch potato : ['lazy', 'lazy', 'lazy person', 'lazy person', 'inactive person', 'inactive person', 'sedentary individual']
shrinking violet : ['shy person', 'shy person', 'shy', 'shy', 'wallflower', 'wuss', 'wuss', 'sweet']

In [48]:
# Display the de-duplicated set of paraphrases gathered for the matched idioms.
list_of_paraphrases

{'absolute lowest',
 'all time low',
 'basement',
 'bliss',
 'bottom floor',
 'cheapest',
 'cliffhanger',
 'close one',
 'close shave',
 'crazy',
 'dated',
 'defenseless',
 'down low',
 'easy prey',
 'easy target',
 'euphoria',
 'ex girlfriend',
 'exposed',
 'first floor',
 'first story',
 'food chain',
 'former lover',
 'ground level',
 'heaven',
 'helpless',
 'hierarchy',
 'in heaven',
 'inactive person',
 'inconspicuous',
 'land',
 'lazy',
 'lazy person',
 'lost cause',
 'low key',
 'lowest point',
 'near miss',
 'nervous wreck',
 'old fashioned',
 'old love',
 'old lover',
 'old news',
 'old-fashioned',
 'out of date',
 'past love',
 'politically unstable',
 'pushover',
 'sedentary individual',
 'shy',
 'shy person',
 'small nation',
 'sweet',
 'third world country',
 'top of the world',
 'uninteresting',
 'wallflower',
 'wuss'}