A lexicon, or lexical resource, is a collection of words and/or phrases along with associated information, such as part of speech and sense definitions.

In [1]:
import nltk

In [14]:
def get_unusual_words(text, eng_vocab=None):
    """Return the sorted words of ``text`` that are absent from a reference vocabulary.

    Tokens that are not purely alphabetic (per ``str.isalpha``) are ignored,
    and the remaining tokens are lowercased before the comparison. No stemming
    or lemmatisation is applied, so an inflected form is reported unless that
    exact form occurs in the reference vocabulary.

    Args:
        text: Iterable of string tokens.
        eng_vocab: Optional iterable of reference words; each is lowercased
            before comparison. When omitted, the NLTK word list
            (``nltk.corpus.words.words()``) is loaded and lowercased on every
            call, so pass a vocabulary explicitly when calling repeatedly or
            when comparing against a different word list.

    Returns:
        list[str]: Alphabetically sorted, lowercased tokens of ``text`` that
        are not in the reference vocabulary.
    """
    if eng_vocab is None:
        eng_vocab = nltk.corpus.words.words()  # All English words in NLTK
    reference_vocab = {w.lower() for w in eng_vocab}
    text_vocab = {w.lower() for w in text if w.isalpha()}
    return sorted(text_vocab - reference_vocab)

In [15]:
# Tokens of the Gutenberg corpus file 'austen-sense.txt' that are missing from the NLTK word list
get_unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))

['abbeyland',
 'abhorred',
 'abilities',
 'abounded',
 'abridgement',
 'abused',
 'abuses',
 'accents',
 'accepting',
 'accommodations',
 'accompanied',
 'accounted',
 'accounts',
 'accustomary',
 'aches',
 'acknowledging',
 'acknowledgment',
 'acknowledgments',
 'acquaintances',
 'acquiesced',
 'acquitted',
 'acquitting',
 'acted',
 'actions',
 'adapted',
 'adding',
 'additions',
 'addressed',
 'addresses',
 'addressing',
 'adhering',
 'adieus',
 'adjusting',
 'administering',
 'admirers',
 'admires',
 'admitting',
 'adorned',
 'advances',
 'advantages',
 'affairs',
 'affections',
 'affects',
 'affixed',
 'afflictions',
 'afforded',
 'affording',
 'ages',
 'agitated',
 'agonies',
 'ailments',
 'aimed',
 'alarms',
 'alienated',
 'alighted',
 'alleged',
 'allenham',
 'allowances',
 'allowed',
 'allowing',
 'alluded',
 'alterations',
 'altered',
 'altering',
 'amended',
 'amounted',
 'amusements',
 'ankles',
 'annamaria',
 'annexed',
 'announced',
 'announcing',
 'annuities',
 'annum',
 

In [10]:
# The English stopword list that ships with NLTK
print(nltk.corpus.stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
def content_fraction(text, stopwords=None):
    """Return the fraction of tokens in ``text`` that are not stopwords.

    Each token is lowercased and looked up in the stopword collection; every
    token that is not found counts as content. No other filtering is done, so
    non-alphabetic tokens (punctuation, numbers) also count as content.

    Args:
        text: Sized sequence of string tokens (``len(text)`` is the
            denominator).
        stopwords: Optional iterable of stopwords, expected in lowercase
            because tokens are lowercased before the lookup. Defaults to
            NLTK's English stopword list.

    Returns:
        float: Number of non-stopword tokens divided by the total number of
        tokens, or ``0.0`` when ``text`` is empty.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    # A set gives constant-time membership tests; scanning the original list
    # for every token made the function needlessly slow on large corpora.
    stopword_set = set(stopwords)
    total = len(text)
    if total == 0:
        # Avoid ZeroDivisionError: an empty text has no content at all.
        return 0.0
    content_count = sum(1 for w in text if w.lower() not in stopword_set)
    return content_count / total

In [13]:
# Only about 59% of the tokens in 'bible-kjv.txt' are content (non-stopword) tokens
content_fraction(nltk.corpus.gutenberg.words('bible-kjv.txt'))

0.5874977984552577

In [16]:
# Load the entries of the CMU Pronouncing Dictionary for US English,
# which was designed for use by speech synthesizers.
entries = nltk.corpus.cmudict.entries()

In [18]:
# Each entry is a (word, list of phonetic codes) pair; a word with several
# pronunciations appears once per pronunciation (see 'a' in the output).
for e in entries[:10]:
    print(e)

('a', ['AH0'])
('a.', ['EY1'])
('a', ['EY1'])
('a42128', ['EY1', 'F', 'AO1', 'R', 'T', 'UW1', 'W', 'AH1', 'N', 'T', 'UW1', 'EY1', 'T'])
('aaa', ['T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1'])
('aaberg', ['AA1', 'B', 'ER0', 'G'])
('aachen', ['AA1', 'K', 'AH0', 'N'])
('aachener', ['AA1', 'K', 'AH0', 'N', 'ER0'])
('aaker', ['AA1', 'K', 'ER0'])
('aalseth', ['AA1', 'L', 'S', 'EH0', 'TH'])


In [19]:
# Comparative wordlist (Swadesh corpus); each file id is counted as one language
from nltk.corpus import swadesh
print(f'There are {len(swadesh.fileids())} languages in the corpus')

There are 24 languages in the corpus


In [23]:
# Use the Swadesh wordlist for simple word-level translation:
# pair up the French and English entries, then build a French -> English lookup.
fr2en = swadesh.entries(['fr', 'en'])
# dict() keeps one value per key, so a repeated French entry would retain only its last pairing.
trans = dict(fr2en)
trans

{'je': 'I',
 'tu, vous': 'you (singular), thou',
 'il': 'he',
 'nous': 'we',
 'vous': 'you (plural)',
 'ils, elles': 'they',
 'ceci': 'this',
 'cela': 'that',
 'ici': 'here',
 'là': 'there',
 'qui': 'who',
 'quoi': 'what',
 'où': 'where',
 'quand': 'when',
 'comment': 'how',
 'ne...pas': 'not',
 'tout': 'all',
 'plusieurs': 'many',
 'quelques': 'some',
 'peu': 'few',
 'autre': 'other',
 'un': 'one',
 'deux': 'two',
 'trois': 'three',
 'quatre': 'four',
 'cinq': 'five',
 'grand': 'big',
 'long': 'long',
 'large': 'wide',
 'épais': 'thick',
 'lourd': 'heavy',
 'petit': 'small',
 'court': 'short',
 'étroit': 'narrow',
 'mince': 'thin',
 'femme': 'woman',
 'homme': 'man (human being)',
 'enfant': 'child',
 'femme, épouse': 'wife',
 'mari, époux': 'husband',
 'mère': 'mother',
 'père': 'father',
 'animal': 'animal',
 'poisson': 'fish',
 'oiseau': 'bird',
 'chien': 'dog',
 'pou': 'louse',
 'serpent': 'snake',
 'ver': 'worm',
 'arbre': 'tree',
 'forêt': 'forest',
 'bâton': 'stick',
 'fruit': 

In [24]:
# Look up the English counterpart of the French entry 'chien'
trans['chien']

'dog'