# Create idiom lexicon with scraped definitions

In [37]:
from bs4 import BeautifulSoup
import requests 
import re
import pandas as pd
from tqdm import tqdm

In [3]:
# Load the SLIDE idiom lexicon. The file is tab-separated, which is the
# default delimiter of pd.read_table.
idiom_lexicon = pd.read_table('idiomLexicon.tsv')
idiom_lexicon

Unnamed: 0,Idiom,WiktionaryURL,Pos,Neg,Neu,Inapprop.,Total,%Pos,%Neg,%Neu,Maj. Label,FilterOut(X)
0,American Dream,https://en.wiktionary.org/wiki/American_Dream,8,0,2,0,10,0.8,0.0,0.2,positive,
1,Catch-22,https://en.wiktionary.org/wiki/Catch-22,0,7,3,0,10,0.0,0.7,0.3,negative,
2,Christmas present,https://en.wiktionary.org/wiki/Christmas_present,6,0,4,0,10,0.6,0.0,0.4,positive,
3,Downing Street,https://en.wiktionary.org/wiki/Downing_Street,0,0,10,0,10,0.0,0.0,1.0,neutral,
4,Dutch courage,https://en.wiktionary.org/wiki/Dutch_courage,2,2,6,0,10,0.2,0.2,0.6,neutral,
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,your man,https://en.wiktionary.org/wiki/your_man,1,0,9,0,10,0.1,0.0,0.9,neutral,
4996,yours truly,https://en.wiktionary.org/wiki/yours_truly,1,0,9,0,10,0.1,0.0,0.9,neutral,
4997,zero in on,https://en.wiktionary.org/wiki/zero_in_on,1,0,9,0,10,0.1,0.0,0.9,neutral,
4998,zero-day,https://en.wiktionary.org/wiki/zero-day,0,4,6,0,10,0.0,0.4,0.6,neutral,


## Scrape definitions from Wiktionary

In [51]:
# Scrape the idiomatic definition(s) of every idiom from its Wiktionary page
# and store them in a new 'definition' column of idiom_lexicon.
#
# Approach: collect every <span class="ib-content"> element whose text
# contains "idiomatic", take the first line of text of its parent element,
# strip the "(... idiomatic ...) " label and join all hits with a space.
# Idioms without any such span end up with an empty definition.

REQUEST_TIMEOUT_S = 30   # abort a stalled request instead of hanging the whole run
CHECKPOINT_EVERY = 100   # rows between intermediate saves (crash protection)
SCRAPED_CSV = 'idiom_lexicon_scraped.csv'

# Match only the single parenthesised label that contains "idiomatic".
# [^()]* keeps the match inside one pair of parentheses, so a later
# parenthetical in the definition itself ("To leave (a place) quickly.")
# is no longer swallowed together with the text in front of it.
IDIOMATIC_LABEL = re.compile(r'\([^()]*idiomatic[^()]*\) ')

failed_urls = []  # pages that could not be fetched (network error or non-2xx status)

# One session for all requests so the connection to Wiktionary is reused.
with requests.Session() as session:
    progress = tqdm(idiom_lexicon.iterrows(), total=len(idiom_lexicon))
    for n_done, (i, row) in enumerate(progress, start=1):
        url = row['WiktionaryURL']
        definition = ''

        try:
            req = session.get(url, timeout=REQUEST_TIMEOUT_S)
        except requests.RequestException:
            req = None

        if req is None or not req.ok:
            # Remember the page so it can be retried; do not abort the run.
            failed_urls.append(url)
        else:
            soup = BeautifulSoup(req.text, 'html.parser')
            sections = soup.find_all('span', {'class': 'ib-content'})
            # keep only the labels that mention 'idiomatic'
            idiomatic_sections = [section for section in sections if 'idiomatic' in section.text]

            # text of the parent element of each label, first line only
            texts = [section.find_parent().text for section in idiomatic_sections]
            texts = [text.split('\n')[0] for text in texts]
            # remove the "(idiomatic) " label itself
            texts = [IDIOMATIC_LABEL.sub('', text) for text in texts]
            # concatenate all found definitions
            definition = ' '.join(texts)

        idiom_lexicon.at[i, 'definition'] = definition

        # Checkpoint periodically rather than rewriting the full CSV on every row.
        if n_done % CHECKPOINT_EVERY == 0:
            idiom_lexicon.to_csv(SCRAPED_CSV, index=False)

# final save
idiom_lexicon.to_csv(SCRAPED_CSV, index=False)
print(f'{len(failed_urls)} page(s) could not be fetched')

100%|██████████| 5000/5000 [46:06<00:00,  1.81it/s]  


In [102]:
# Idioms that have an idiom_id of at most 2029 but no scraped definition.
# NOTE(review): this relies on the 'idiom_id' column, which is only created in
# the "Adding idiom ids" section further down - run that section first.
lacks_definition = idiom_lexicon['definition'].isna()
has_low_idiom_id = idiom_lexicon['idiom_id'] <= 2029
idiom_lexicon[lacks_definition & has_low_idiom_id]

Unnamed: 0,Idiom,WiktionaryURL,Pos,Neg,Neu,Inapprop.,Total,%Pos,%Neg,%Neu,Maj. Label,FilterOut(X),definition,sentiment,idiom_id
2,Christmas present,https://en.wiktionary.org/wiki/Christmas_present,6,0,4,0,10,0.6,0.0,0.4,positive,,,positive,2
3,Downing Street,https://en.wiktionary.org/wiki/Downing_Street,0,0,10,0,10,0.0,0.0,1.0,neutral,,,other,3
12,Great Britain and Ireland,https://en.wiktionary.org/wiki/Great_Britain_a...,1,0,9,0,10,0.1,0.0,0.9,neutral,,,other,9
26,John Doe,https://en.wiktionary.org/wiki/John_Doe,0,0,10,0,10,0.0,0.0,1.0,neutral,,,other,20
29,John Thomas,https://en.wiktionary.org/wiki/John_Thomas,0,0,6,4,10,0.0,0.0,0.6,neutral,,,other,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2439,labour of love,https://en.wiktionary.org/wiki/labour_of_love,5,2,3,0,10,0.5,0.2,0.3,positive,,,positive,2010
2441,ladies and gentlemen,https://en.wiktionary.org/wiki/ladies_and_gent...,0,0,10,0,10,0.0,0.0,1.0,neutral,,,other,2012
2449,larger than life,https://en.wiktionary.org/wiki/larger_than_life,7,1,2,0,10,0.7,0.1,0.2,positive,,,positive,2018
2455,last trump,https://en.wiktionary.org/wiki/last_trump,0,0,10,0,10,0.0,0.0,1.0,neutral,,,other,2023


## Create sentiment column

In [32]:
# Derive a three-way 'sentiment' column from the majority label and save it
# back to the scraped lexicon file.
idiom_lexicon = pd.read_csv('idiom_lexicon_scraped.csv')

# 'neutral' is folded into 'other'; any label not listed here maps to NaN.
sentiment_map = {
    'positive': 'positive',
    'negative': 'negative',
    'neutral': 'other'
}

# Map from 'Maj. Label' into a separate column instead of renaming it.
# This cell overwrites the file it reads, so with the rename a second run
# found no 'Maj. Label' left and re-mapped the already mapped values,
# turning every 'other' into NaN. Keeping the source column makes the cell
# safe to re-run.
idiom_lexicon['sentiment'] = idiom_lexicon['Maj. Label'].map(sentiment_map)
print(idiom_lexicon['sentiment'].value_counts())
idiom_lexicon.to_csv('idiom_lexicon_scraped.csv', index=False)
idiom_lexicon

sentiment
other       2945
negative    1108
positive     946
Name: count, dtype: int64


## Adding idiom ids

In [88]:
# Reload the scraped lexicon with an empty 'idiom_id' column to be filled in.
idiom_lexicon = pd.read_csv('idiom_lexicon_scraped.csv').assign(idiom_id=pd.NA)

# Generated-sentence dataset, reduced to one row per distinct idiom
# (the first occurrence is kept).
dataset = (
    pd.read_csv('../Sentence Generation/Datasets/full_dataset.csv', index_col=0)
    .drop_duplicates(subset=['idiom'])
)
dataset

Unnamed: 0,idiom_id,idiom,sentence,emotion
0,0,American Dream,"Sally finally achieved the American Dream, liv...",Happiness
5,1,Catch-22,Stuck in this bureaucratic mess feels like a C...,Anxiety
10,2,Christmas present,"Every year, my aunt surprises us with unexpect...",Surprise
15,3,Downing Street,The new employee quickly found his way to Down...,Admiration
20,4,Dutch courage,"After a few shots of Dutch courage, he finally...",Excitement
...,...,...,...,...
10125,2025,last-ditch,"In a last-ditch effort to save their marriage,...",Hope
10130,2026,latch onto,She quickly latched onto the idea of a weekend...,Excitement
10135,2027,late bloomer,"Despite being a late bloomer, she eventually s...",Pride
10140,2028,late model,"I just bought a late model car, and I can't hi...",Happiness


In [90]:
# Copy each dataset idiom's id onto the first lexicon row with the same
# 'Idiom' string; dataset idioms that are absent from the lexicon are skipped.
idiom_id_pairs = zip(dataset['idiom'], dataset['idiom_id'])
for phrase, phrase_id in tqdm(idiom_id_pairs, total=len(dataset)):
    matching_rows = idiom_lexicon.index[idiom_lexicon['Idiom'] == phrase]
    if len(matching_rows) > 0:
        # only the first match receives the id
        idiom_lexicon.at[matching_rows[0], 'idiom_id'] = phrase_id

idiom_lexicon.to_csv('idiom_lexicon_scraped.csv', index=False)
idiom_lexicon

100%|██████████| 2030/2030 [00:00<00:00, 2600.09it/s]


Unnamed: 0,Idiom,WiktionaryURL,Pos,Neg,Neu,Inapprop.,Total,%Pos,%Neg,%Neu,Maj. Label,FilterOut(X),definition,sentiment,idiom_id
0,American Dream,https://en.wiktionary.org/wiki/American_Dream,8,0,2,0,10,0.8,0.0,0.2,positive,,A widespread determination by Americans to pro...,positive,0
1,Catch-22,https://en.wiktionary.org/wiki/Catch-22,0,7,3,0,10,0.0,0.7,0.3,negative,,A difficult situation from which there is no e...,negative,1
2,Christmas present,https://en.wiktionary.org/wiki/Christmas_present,6,0,4,0,10,0.6,0.0,0.4,positive,,,positive,2
3,Downing Street,https://en.wiktionary.org/wiki/Downing_Street,0,0,10,0,10,0.0,0.0,1.0,neutral,,,other,3
4,Dutch courage,https://en.wiktionary.org/wiki/Dutch_courage,2,2,6,0,10,0.2,0.2,0.6,neutral,,The courage or bravado induced by alcohol. An ...,other,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,your man,https://en.wiktionary.org/wiki/your_man,1,0,9,0,10,0.1,0.0,0.9,neutral,,,other,
4996,yours truly,https://en.wiktionary.org/wiki/yours_truly,1,0,9,0,10,0.1,0.0,0.9,neutral,,"Used to close a note or letter. I, me, or myself.",other,
4997,zero in on,https://en.wiktionary.org/wiki/zero_in_on,1,0,9,0,10,0.1,0.0,0.9,neutral,,To converge (on).,other,
4998,zero-day,https://en.wiktionary.org/wiki/zero-day,0,4,6,0,10,0.0,0.4,0.6,neutral,,"newly discovered, and therefore still not fixe...",other,


## (Scraping evaluation)

In [2]:
# Count the idioms for which no definition was scraped.
# pandas is already imported in the imports cell at the top of the notebook,
# so it is not imported again here.
idiom_lexicon = pd.read_csv('idiom_lexicon_scraped.csv')

# Empty definitions are read back from the CSV as NaN, so isna() counts them.
idiom_lexicon['definition'].isna().sum()

1168

In [7]:
# Total number of idioms (rows) in the lexicon, for comparison with the
# number of missing definitions.
idiom_lexicon.shape[0]

5000

In [65]:
# Spot-check the scraped definition of a single idiom.
idiom_lexicon.loc[idiom_lexicon['Idiom'].eq('jot down')]

Unnamed: 0,Idiom,WiktionaryURL,Pos,Neg,Neu,Inapprop.,Total,%Pos,%Neg,%Neu,Maj. Label,FilterOut(X),definition,sentiment
2325,jot down,https://en.wiktionary.org/wiki/jot_down,0,0,10,0,10,0.0,0.0,1.0,neutral,,to write down hurriedly; to make a note of,other
