# Lemmatize idioms from the SLIDE dataset

In [1]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
import nltk
# Make sure the WordNet corpus used for lemmatization is available locally;
# packages that are already installed are reported as up-to-date.
nltk.download('wordnet')
# NOTE(review): omw-1.4 is presumably required by the installed NLTK's
# WordNet reader -- confirm against the NLTK version in use.
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/alex/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Read the tab-separated SLIDE idiom lexicon and preview its first rows.
lexicon_path = '../SLIDE_dataset/idiomLexicon.tsv'
data = pd.read_csv(lexicon_path, sep='\t')
data.head()

Unnamed: 0,Idiom,WiktionaryURL,Pos,Neg,Neu,Inapprop.,Total,%Pos,%Neg,%Neu,Maj. Label,FilterOut(X)
0,American Dream,https://en.wiktionary.org/wiki/American_Dream,8,0,2,0,10,0.8,0.0,0.2,positive,
1,Catch-22,https://en.wiktionary.org/wiki/Catch-22,0,7,3,0,10,0.0,0.7,0.3,negative,
2,Christmas present,https://en.wiktionary.org/wiki/Christmas_present,6,0,4,0,10,0.6,0.0,0.4,positive,
3,Downing Street,https://en.wiktionary.org/wiki/Downing_Street,0,0,10,0,10,0.0,0.0,1.0,neutral,
4,Dutch courage,https://en.wiktionary.org/wiki/Dutch_courage,2,2,6,0,10,0.2,0.2,0.6,neutral,


In [3]:
def full_lemmatize(word):
    """Reduce a single token to an aggressively normalized, lowercase lemma.

    The token goes through these steps, in order:

    1. Every character outside ``0-9``, ``a-z``, ``A-Z`` and space is removed.
    2. Any run of letters ending in ``self`` is collapsed to ``self``
       (e.g. ``himself`` -> ``self``).
    3. The result is lowercased.
    4. The WordNet lemmatizer is applied once per part-of-speech tag
       (``v``, ``n``, ``a``, ``r``, ``s``), each pass feeding the next.
    5. One trailing ``s`` is dropped if present.

    Args:
        word: The raw token (a string without surrounding whitespace).

    Returns:
        The normalized token, or ``''`` when nothing is left.

    Note:
        Step 5 is unconditional, so words that legitimately end in ``s``
        are shortened as well (``christmas`` -> ``christma``). Anything
        compared against these lemmas must be normalized with this same
        function.
    """
    cleaned = re.sub('[^0-9a-zA-Z ]+', '', word)
    cleaned = re.sub('[a-zA-Z]*self', 'self', cleaned).lower()

    wordnet = WordNetLemmatizer()
    lemma = cleaned
    # Chain the passes: the output of one POS pass is the input of the next.
    for tag in ('v', 'n', 'a', 'r', 's'):
        lemma = wordnet.lemmatize(lemma, pos=tag)

    if not lemma:
        return ''

    # Crude plural stripping for forms the lemmatizer left untouched.
    return lemma[:-1] if lemma.endswith('s') else lemma

In [4]:
# Idioms whose lemmatized form is written by hand; these entries bypass the
# automatic lemmatization applied to every other idiom.
idiom_exceptions = {"also-ran": "also ran"}

In [6]:
# Placeholder words ("someone's", "someone", "one's") stand in for an arbitrary
# person and are dropped before lemmatizing. The word boundaries matter: a
# plain substring replace of "one's" would also truncate unrelated words such
# as "anyone's" to "any".
_PLACEHOLDER_RE = re.compile(r"\b(?:someone's|someone|one's)\b")


def _lemmatize_idiom(idiom):
    """Return the space-joined, lemmatized form of a single idiom string."""
    # Hand-written overrides take precedence over the automatic pipeline.
    if idiom in idiom_exceptions:
        return idiom_exceptions[idiom]

    # Hyphenated compounds are lemmatized word by word.
    stripped = _PLACEHOLDER_RE.sub('', idiom).replace('-', ' ')
    lemmas = (full_lemmatize(token) for token in stripped.split())
    # full_lemmatize returns '' for tokens with nothing left after cleaning
    # (e.g. pure punctuation); skipping them avoids doubled, leading or
    # trailing spaces in the joined result.
    return ' '.join(lemma for lemma in lemmas if lemma)


# One lemmatized string per row of data['Idiom'], in the same order.
lemm_idioms = [_lemmatize_idiom(idiom) for idiom in data['Idiom']]
    

In [8]:
# Attach the lemmatized forms to the lexicon, then keep only the original
# idiom next to its lemmatized counterpart and preview the result.
kept_columns = ['Idiom', 'lemmatized']
data['lemmatized'] = lemm_idioms
new_data = data[kept_columns]
new_data.head()

Unnamed: 0,Idiom,lemmatized
0,American Dream,american dream
1,Catch-22,catch 22
2,Christmas present,christma present
3,Downing Street,down street
4,Dutch courage,dutch courage


In [7]:
# Save the idiom / lemmatized pairs as a semicolon-separated file, without
# the DataFrame index column.
new_data.to_csv(
    'idiomLexicon_lemmatized.csv',
    sep=';',
    index=False,
)