In [3]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict
import json

## Prepare the list of PIEs with different categories

* PIEs with high and low 'Degree of Idiomaticity'
 - 2 idiomatic, 2 literal

* Frequent and very rare PIEs (CCNews-based rarity)
 - 2 idiomatic, 2 literal

* PIEs with high morphology and no morphology
 - 2 idiomatic, 2 literal

In [4]:
# Relative path of the data folder.
data_dir = '../data'
# CSV with the per-PIE segregation table, located under the data folder.
segregation_file = f"{data_dir}/PIE_segregation/pie_segregation.csv"

In [5]:
# Read the PIE segregation CSV into a DataFrame and show it as the cell output.
df_segregation = pd.read_csv(filepath_or_buffer=segregation_file)
df_segregation

Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
0,IDonehorseraceID,idiom,4,4,1.000000,highly_idiomatic,ccnews_very_rare,no_morphology
1,IDonehorseraceID,literal,0,4,0.000000,highly_idiomatic,ccnews_very_rare,no_morphology
2,IDservesomeonerightID,idiom,68,69,0.985507,highly_idiomatic,ccnews_very_rare,moderate_morphology
3,IDservesomeonerightID,literal,1,69,0.014493,highly_idiomatic,ccnews_very_rare,moderate_morphology
4,IDallovertheplaceID,idiom,76,98,0.775510,ambiguous,ccnews_frequent,moderate_morphology
...,...,...,...,...,...,...,...,...
3471,IDcutthecrapID,literal,0,9,0.000000,highly_idiomatic,ccnews_moderately_rare,moderate_morphology
3472,IDpullafaceID,idiom,131,138,0.949275,highly_idiomatic,ccnews_very_rare,high_morphology
3473,IDpullafaceID,literal,7,138,0.050725,highly_idiomatic,ccnews_very_rare,high_morphology
3474,IDbleedingheartID,idiom,6,6,1.000000,highly_idiomatic,ccnews_moderately_rare,moderate_morphology


In [6]:
# Degree of Idiomaticity

In [38]:
# For visualizations: rows labelled 'idiom' that belong to highly idiomatic
# PIEs, ordered from the highest idiomaticity ratio downwards.
dfi = df_segregation[
    (df_segregation['degree_of_idiomaticity'] == 'highly_idiomatic')
    & (df_segregation['label'] == 'idiom')
].sort_values(by='idiomaticity_ratio', ascending=False)
display(dfi.head(20))

Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
0,IDonehorseraceID,idiom,4,4,1.0,highly_idiomatic,ccnews_very_rare,no_morphology
2182,IDwhenallissaidanddoneID,idiom,14,14,1.0,highly_idiomatic,ccnews_moderately_rare,moderate_morphology
2214,IDhighandmightyID,idiom,19,19,1.0,highly_idiomatic,ccnews_very_rare,moderate_morphology
2210,IDmanofgodID,idiom,22,22,1.0,highly_idiomatic,ccnews_frequent,moderate_morphology
2202,IDstaythecourseID,idiom,21,21,1.0,highly_idiomatic,ccnews_frequent,moderate_morphology
2196,IDgodownthewrongwayID,idiom,5,5,1.0,highly_idiomatic,ccnews_very_rare,moderate_morphology
2190,IDbreakranksID,idiom,35,35,1.0,highly_idiomatic,ccnews_moderately_rare,moderate_morphology
2188,IDsettleascoreID,idiom,16,16,1.0,highly_idiomatic,ccnews_moderately_rare,high_morphology
2184,IDdeadasadodoID,idiom,7,7,1.0,highly_idiomatic,ccnews_very_rare,moderate_morphology
2180,IDfromthecradletothegraveID,idiom,15,15,1.0,highly_idiomatic,ccnews_very_rare,moderate_morphology


In [39]:
# For visualizations: rows labelled 'literal' that belong to highly literal
# PIEs, ordered from the highest 'idiomaticity_ratio' value downwards.
dfi = df_segregation[
    (df_segregation['degree_of_idiomaticity'] == 'highly_literal')
    & (df_segregation['label'] == 'literal')
].sort_values(by='idiomaticity_ratio', ascending=False)
display(dfi.head(20))

Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
77,IDtakethefifthID,literal,2,2,1.0,highly_literal,ccnews_moderately_rare,moderate_morphology
815,IDbuythefarmID,literal,25,25,1.0,highly_literal,ccnews_very_rare,high_morphology
1063,IDbiteyourlipID,literal,1,1,1.0,highly_literal,ccnews_very_rare,no_morphology
1097,IDmybadID,literal,9,9,1.0,highly_literal,ccnews_moderately_rare,no_morphology
1505,IDbeattherapID,literal,1,1,1.0,highly_literal,ccnews_moderately_rare,moderate_morphology
1639,IDshowalegID,literal,23,23,1.0,highly_literal,ccnews_very_rare,moderate_morphology
1643,IDblowoffsteamID,literal,1,1,1.0,highly_literal,ccnews_moderately_rare,moderate_morphology
1859,IDseetheelephantID,literal,5,5,1.0,highly_literal,ccnews_very_rare,moderate_morphology
1889,IDgostraightID,literal,145,145,1.0,highly_literal,ccnews_frequent,moderate_morphology
1923,IDcirclethewagonsID,literal,1,1,1.0,highly_literal,ccnews_very_rare,no_morphology


In [40]:
# Hard-coded PIE selections for the two ends of the idiomaticity scale.
highly_idiomatic_PIEs = [
    "IDchaseyourtailID",
    "IDcastthefirststoneID",
    "IDstealtheshowID",
]
highly_literal_PIEs = [
    "IDmybadID",
    "IDgameonID",
    "IDshowalegID",
    "IDgostraightID",
]

In [9]:
# CommonCrawl News 'Rarity'

In [12]:
# Pre-computed JSON file holding one count per PIE token
# (presumably occurrence counts in CC-News, going by the file name — TODO confirm).
pie_counts_from_cc_news_file = '../experiments/exp3B_1/pretrain_data_split/cc_news_pie_counts.json'
# Pass the encoding explicitly: without it, open() decodes with the platform's
# locale default, which is not guaranteed to be UTF-8.
with open(pie_counts_from_cc_news_file, 'r', encoding='utf-8') as fin:
    pie_count_dict = json.load(fin)
print(f"Loaded the pre-computed dict file from {pie_counts_from_cc_news_file}")

Loaded the pre-computed dict file from ../experiments/exp3B_1/pretrain_data_split/cc_news_pie_counts.json


In [16]:
# Print every (PIE token, count) pair, ordered by ascending count.
print(
    sorted(
        pie_count_dict.items(),
        key=lambda pie_and_count: pie_and_count[1],
    )
)

[('IDbeatthedaylightsoutofID', 1), ('IDdaylightrobberyID', 1), ('IDclaimtofameID', 1), ('IDnotonyourlifeID', 1), ('IDsweatlikeapigID', 1), ('IDfromstemtosternID', 1), ('IDthanksfornothingID', 1), ('IDgoofftheboilID', 1), ('IDupagumtreeID', 1), ('IDlikeabearwithasoreheadID', 1), ('IDcakesandaleID', 1), ('IDlightattheendofthetunnelID', 1), ('IDonyoursoapboxID', 1), ('IDblowyourtopID', 1), ('IDpushingupthedaisiesID', 1), ('IDfootthebillID', 1), ('IDflyoffthehandleID', 1), ('IDneveryoumindID', 1), ('IDgetitintheneckID', 1), ('IDhaveascrewlooseID', 1), ('IDgodownlikealeadballoonID', 1), ('IDbarkupthewrongtreeID', 1), ('IDchuckitdownID', 1), ('IDgodownastormID', 1), ('IDputyourfootinitID', 1), ('IDputthewindupID', 1), ('IDventyourspleenID', 1), ('IDslipofthepenID', 1), ('IDhavekittensID', 1), ('IDpropupthebarID', 1), ('IDonthiniceID', 1), ('IDmakeamountainoutofamolehillID', 1), ('IDdownatheelID', 1), ('IDbytheskinofyourteethID', 1), ('IDgraspatstrawsID', 1), ('IDslowbutsureID', 1), ('IDhavet

In [24]:
# Candidate PIEs for the CC-News rarity comparison. The list is flat; the
# rarity groups are marked only by the comments below. It is used to filter
# the segregation table by 'idiom_token' membership.
both_freq_n_rare_PIEs = [
    # very rare PIEs
    "IDdaylightrobberyID", "IDputthewindupID", "IDonthiniceID", "IDontopoftheworldID",
    "IDactofgodID", "IDmoneytalksID",

    # also very rare: the segregation table marks both as ccnews_very_rare,
    # and they are later assigned to very_rare_literal
    "IDbiteyourlipID", "IDseetheelephantID",

    # frequent PIEs
    "IDfromscratchID", "IDontherunID", "IDatseaID", "IDgameonID", "IDturnablindeyeID",
    "IDatyourfingertipsID", "IDlendahandID",
]

In [25]:
# Keep only the rows of the candidate PIEs, then show one table per
# 'ccnews_rarity' value, each ordered by 'degree_of_idiomaticity'.
df_rare_freq_pies = df_segregation.loc[
    df_segregation['idiom_token'].isin(both_freq_n_rare_PIEs)
]
for rarity_label, rarity_rows in df_rare_freq_pies.groupby('ccnews_rarity'):
    print(rarity_label)
    display(rarity_rows.sort_values(by='degree_of_idiomaticity'))

ccnews_frequent


Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
2366,IDontherunID,idiom,59,104,0.567308,ambiguous,ccnews_frequent,high_morphology
2367,IDontherunID,literal,45,104,0.432692,ambiguous,ccnews_frequent,high_morphology
1126,IDfromscratchID,idiom,164,165,0.993939,highly_idiomatic,ccnews_frequent,moderate_morphology
1127,IDfromscratchID,literal,1,165,0.006061,highly_idiomatic,ccnews_frequent,moderate_morphology
1408,IDlendahandID,idiom,78,78,1.0,highly_idiomatic,ccnews_frequent,high_morphology
1409,IDlendahandID,literal,0,78,0.0,highly_idiomatic,ccnews_frequent,high_morphology
2110,IDatyourfingertipsID,idiom,18,18,1.0,highly_idiomatic,ccnews_frequent,no_morphology
2111,IDatyourfingertipsID,literal,0,18,0.0,highly_idiomatic,ccnews_frequent,no_morphology
3158,IDturnablindeyeID,idiom,119,119,1.0,highly_idiomatic,ccnews_frequent,high_morphology
3159,IDturnablindeyeID,literal,0,119,0.0,highly_idiomatic,ccnews_frequent,high_morphology


ccnews_very_rare


Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
694,IDdaylightrobberyID,idiom,2,2,1.0,highly_idiomatic,ccnews_very_rare,no_morphology
695,IDdaylightrobberyID,literal,0,2,0.0,highly_idiomatic,ccnews_very_rare,no_morphology
1580,IDputthewindupID,idiom,7,7,1.0,highly_idiomatic,ccnews_very_rare,no_morphology
1581,IDputthewindupID,literal,0,7,0.0,highly_idiomatic,ccnews_very_rare,no_morphology
1962,IDmoneytalksID,idiom,11,11,1.0,highly_idiomatic,ccnews_very_rare,moderate_morphology
1963,IDmoneytalksID,literal,0,11,0.0,highly_idiomatic,ccnews_very_rare,moderate_morphology
2060,IDonthiniceID,idiom,9,10,0.9,highly_idiomatic,ccnews_very_rare,moderate_morphology
2061,IDonthiniceID,literal,1,10,0.1,highly_idiomatic,ccnews_very_rare,moderate_morphology
2416,IDontopoftheworldID,idiom,29,30,0.966667,highly_idiomatic,ccnews_very_rare,moderate_morphology
2417,IDontopoftheworldID,literal,1,30,0.033333,highly_idiomatic,ccnews_very_rare,moderate_morphology


In [26]:
# Hard-coded PIE selections per CC-News rarity group, split into
# idiomatic and literal lists.
frequent_idiomatic = [
    "IDfromscratchID",
    "IDlendahandID",
    "IDatyourfingertipsID",
    "IDturnablindeyeID",
]
frequent_literal = [
    "IDgameonID",
    "IDatseaID",
]

very_rare_idiomatic = [
    "IDdaylightrobberyID",
    "IDactofgodID",
    "IDontopoftheworldID",
    "IDmoneytalksID",
    "IDonthiniceID",
]
very_rare_literal = [
    "IDbiteyourlipID",
    "IDseetheelephantID",
]

In [27]:
# Morphological variations

In [35]:
# For visualizations: for every morphology type other than
# 'moderate_morphology', drop the 'ambiguous' rows, order the rest by
# 'degree_of_idiomaticity', and show the first and last ten rows.
for morph_type, df_grp in df_segregation.groupby('morphology_type'):
    if morph_type == 'moderate_morphology':
        continue
    print(morph_type)
    df_grp = (
        df_grp[df_grp['degree_of_idiomaticity'] != 'ambiguous']
        .sort_values(['degree_of_idiomaticity'])
    )
    display(df_grp.head(10))
    display(df_grp.tail(10))
    print('------------')

high_morphology


Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
102,IDjumptoconclusionsID,idiom,97,97,1.0,highly_idiomatic,ccnews_moderately_rare,high_morphology
1869,IDrunthegauntletID,literal,0,33,0.0,highly_idiomatic,ccnews_moderately_rare,high_morphology
1886,IDbreaktheiceID,idiom,24,30,0.8,highly_idiomatic,ccnews_moderately_rare,high_morphology
3472,IDpullafaceID,idiom,131,138,0.949275,highly_idiomatic,ccnews_very_rare,high_morphology
2028,IDwhenthechipsaredownID,idiom,22,22,1.0,highly_idiomatic,ccnews_moderately_rare,high_morphology
2029,IDwhenthechipsaredownID,literal,0,22,0.0,highly_idiomatic,ccnews_moderately_rare,high_morphology
2056,IDdomeafavourID,idiom,34,41,0.829268,highly_idiomatic,ccnews_moderately_rare,high_morphology
2057,IDdomeafavourID,literal,7,41,0.170732,highly_idiomatic,ccnews_moderately_rare,high_morphology
2072,IDrocktheboatID,idiom,56,59,0.949153,highly_idiomatic,ccnews_frequent,high_morphology
1868,IDrunthegauntletID,idiom,33,33,1.0,highly_idiomatic,ccnews_moderately_rare,high_morphology


Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
815,IDbuythefarmID,literal,25,25,1.0,highly_literal,ccnews_very_rare,high_morphology
910,IDonthemoneyID,idiom,3,111,0.027027,highly_literal,ccnews_frequent,high_morphology
911,IDonthemoneyID,literal,108,111,0.972973,highly_literal,ccnews_frequent,high_morphology
2209,IDintheclubID,literal,134,135,0.992593,highly_literal,ccnews_frequent,high_morphology
2208,IDintheclubID,idiom,1,135,0.007407,highly_literal,ccnews_frequent,high_morphology
2165,IDrunamileID,literal,32,39,0.820513,highly_literal,ccnews_moderately_rare,high_morphology
2164,IDrunamileID,idiom,7,39,0.179487,highly_literal,ccnews_moderately_rare,high_morphology
2151,IDunderthetableID,literal,114,124,0.919355,highly_literal,ccnews_moderately_rare,high_morphology
2698,IDonthelevelID,idiom,1,142,0.007042,highly_literal,ccnews_frequent,high_morphology
656,IDhearthingsID,idiom,4,61,0.065574,highly_literal,ccnews_moderately_rare,high_morphology


------------
no_morphology


Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
0,IDonehorseraceID,idiom,4,4,1.0,highly_idiomatic,ccnews_very_rare,no_morphology
2309,IDcruisingforabruisingID,literal,0,1,0.0,highly_idiomatic,ccnews_very_rare,no_morphology
2310,IDholdyourtongueID,idiom,13,13,1.0,highly_idiomatic,ccnews_very_rare,no_morphology
2311,IDholdyourtongueID,literal,0,13,0.0,highly_idiomatic,ccnews_very_rare,no_morphology
2314,IDhighanddryID,idiom,38,38,1.0,highly_idiomatic,ccnews_very_rare,no_morphology
2315,IDhighanddryID,literal,0,38,0.0,highly_idiomatic,ccnews_very_rare,no_morphology
2318,IDfeedingfrenzyID,idiom,7,7,1.0,highly_idiomatic,ccnews_very_rare,no_morphology
2319,IDfeedingfrenzyID,literal,0,7,0.0,highly_idiomatic,ccnews_very_rare,no_morphology
2324,IDputthekiboshonID,idiom,3,3,1.0,highly_idiomatic,ccnews_moderately_rare,no_morphology
2325,IDputthekiboshonID,literal,0,3,0.0,highly_idiomatic,ccnews_moderately_rare,no_morphology


Unnamed: 0,idiom_token,label,num_of_examples,total,idiomaticity_ratio,degree_of_idiomaticity,ccnews_rarity,morphology_type
2700,IDindutchID,idiom,0,33,0.0,highly_literal,ccnews_frequent,no_morphology
1998,IDallwetID,idiom,1,24,0.041667,highly_literal,ccnews_moderately_rare,no_morphology
1096,IDmybadID,idiom,0,9,0.0,highly_literal,ccnews_moderately_rare,no_morphology
1097,IDmybadID,literal,9,9,1.0,highly_literal,ccnews_moderately_rare,no_morphology
1922,IDcirclethewagonsID,idiom,0,1,0.0,highly_literal,ccnews_very_rare,no_morphology
3126,IDinthedriversseatID,idiom,1,16,0.0625,highly_literal,ccnews_very_rare,no_morphology
3109,IDspitbloodID,literal,5,6,0.833333,highly_literal,ccnews_moderately_rare,no_morphology
3108,IDspitbloodID,idiom,1,6,0.166667,highly_literal,ccnews_moderately_rare,no_morphology
1999,IDallwetID,literal,23,24,0.958333,highly_literal,ccnews_moderately_rare,no_morphology
3127,IDinthedriversseatID,literal,15,16,0.9375,highly_literal,ccnews_very_rare,no_morphology


------------


In [37]:
# Hard-coded PIE selections per morphology type, split into idiomatic
# and literal lists.
high_morphology_idiomatic = [
    "IDbreaktheiceID",
    "IDjumptoconclusionsID",
    "IDdomeafavourID",
    "IDpullafaceID",
]
high_morphology_literal = [
    "IDrunamileID",
    "IDunderthetableID",
    "IDbuythefarmID",
]

no_morphology_idiomatic = [
    "IDonehorseraceID",
    "IDholdyourtongueID",
    "IDputthekiboshonID",
]
no_morphology_literal = [
    "IDindutchID",
    "IDinthedriversseatID",
    "IDspitbloodID",
    "IDallwetID",
]

In [47]:
# Final set of PIEs: print each category list as "<variable name>: <list>",
# in the order idiomaticity -> CC-News rarity -> morphology.
for _list_name, _pie_list in (
    ("highly_literal_PIEs", highly_literal_PIEs),
    ("highly_idiomatic_PIEs", highly_idiomatic_PIEs),
    ("frequent_idiomatic", frequent_idiomatic),
    ("frequent_literal", frequent_literal),
    ("very_rare_literal", very_rare_literal),
    ("very_rare_idiomatic", very_rare_idiomatic),
    ("high_morphology_idiomatic", high_morphology_idiomatic),
    ("high_morphology_literal", high_morphology_literal),
    ("no_morphology_idiomatic", no_morphology_idiomatic),
    ("no_morphology_literal", no_morphology_literal),
):
    print(f"{_list_name}: {_pie_list}")

highly_literal_PIEs: ['IDmybadID', 'IDgameonID', 'IDshowalegID', 'IDgostraightID']
highly_idiomatic_PIEs: ['IDchaseyourtailID', 'IDcastthefirststoneID', 'IDstealtheshowID']
frequent_idiomatic: ['IDfromscratchID', 'IDlendahandID', 'IDatyourfingertipsID', 'IDturnablindeyeID']
frequent_literal: ['IDgameonID', 'IDatseaID']
very_rare_literal: ['IDbiteyourlipID', 'IDseetheelephantID']
very_rare_idiomatic: ['IDdaylightrobberyID', 'IDactofgodID', 'IDontopoftheworldID', 'IDmoneytalksID', 'IDonthiniceID']
high_morphology_idiomatic: ['IDbreaktheiceID', 'IDjumptoconclusionsID', 'IDdomeafavourID', 'IDpullafaceID']
high_morphology_literal: ['IDrunamileID', 'IDunderthetableID', 'IDbuythefarmID']
no_morphology_idiomatic: ['IDonehorseraceID', 'IDholdyourtongueID', 'IDputthekiboshonID']
no_morphology_literal: ['IDindutchID', 'IDinthedriversseatID', 'IDspitbloodID', 'IDallwetID']


In [51]:
# First word-list set: the selected PIE tokens plus an empty "words" list.
# NOTE(review): the saved output of this cell also shows a 'paraphrases' key
# that this code does not create — the output looks stale; re-run to refresh.
wordlist_set1 = dict(
    PIE_list=[
        "IDmybadID",
        "IDgameonID",
        "IDchaseyourtailID",
        "IDstealtheshowID",
        "IDfromscratchID",
        "IDlendahandID",
        "IDatseaID",
        "IDbiteyourlipID",
        "IDdaylightrobberyID",
        "IDactofgodID",
        "IDbreaktheiceID",
        "IDrunamileID",
        "IDunderthetableID",
        "IDholdyourtongueID",
        "IDputthekiboshonID",
        "IDindutchID",
        "IDinthedriversseatID",
    ],
    words=[],
)
wordlist_set1

{'PIE_list': ['IDmybadID',
  'IDgameonID',
  'IDchaseyourtailID',
  'IDstealtheshowID',
  'IDfromscratchID',
  'IDlendahandID',
  'IDatseaID',
  'IDbiteyourlipID',
  'IDdaylightrobberyID',
  'IDactofgodID',
  'IDbreaktheiceID',
  'IDrunamileID',
  'IDunderthetableID',
  'IDholdyourtongueID',
  'IDputthekiboshonID',
  'IDindutchID',
  'IDinthedriversseatID'],
 'paraphrases': ['contestready for something',
  'rush around ineffectually',
  'very busy',
  'center of attention',
  'outshinefrom very beginning',
  'assist',
  'help',
  'confused',
  'puzzled',
  'repress an emotion',
  'unfair trade',
  'victim',
  'severe natural event',
  'relieve tension',
  'start a conversation',
  'reluctant',
  'extremely unwilling',
  'secretly or covertly',
  'very drunk',
  'remain silent',
  'put an end to',
  'check',
  'curb',
  'stop',
  'in trouble',
  'in disfavor',
  'be in control',
  "in the driver's seat",
  'make the decisions'],
 'words': []}