In [2]:
import nltk
from nltk.corpus import brown, inaugural, reuters, udhr
from nltk import FreqDist, ConditionalFreqDist, pos_tag, word_tokenize
from nltk.tag import DefaultTagger, UnigramTagger
from nltk.corpus import PlaintextCorpusReader

In [3]:
# Download required datasets
nltk.download('brown')
nltk.download('inaugural')
nltk.download('reuters')
nltk.download('udhr')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package brown to C:\Users\Durgesh
[nltk_data]     Babu\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package inaugural to C:\Users\Durgesh
[nltk_data]     Babu\AppData\Roaming\nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package reuters to C:\Users\Durgesh
[nltk_data]     Babu\AppData\Roaming\nltk_data...
[nltk_data] Downloading package udhr to C:\Users\Durgesh
[nltk_data]     Babu\AppData\Roaming\nltk_data...
[nltk_data]   Package udhr is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Durgesh Babu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to C:\Users\Durgesh
[nltk_data]     Babu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Study Various Corpora
def study_corpus():
    print("Brown Corpus Categories:", brown.categories())
    print("First 100 words of Inaugural Corpus:", inaugural.words()[:100])
    print("First 100 words of Reuters Corpus:", reuters.words()[:100])
    print("First 100 words of UDHR Corpus:", udhr.words('English-Latin1')[:100])

In [7]:
# Create and Use Custom Corpora
corpus_root = 'custom_corpus/'  # Ensure this folder exists with text files
custom_corpus = PlaintextCorpusReader(corpus_root, '.*')

In [8]:
# Study Conditional Frequency Distributions
def study_cfd():
    cfd = ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre)
    )
    print("Most common words in 'news' category:", cfd['news'].most_common(10))

In [9]:
# Study Tagged Corpora
def study_tagged_corpora():
    print("First 10 Tagged Sentences from Brown:", brown.tagged_sents()[:10])
    print("First 10 Tagged Words from Brown:", brown.tagged_words()[:10])

In [10]:
# Find Most Frequent Noun Tags
def most_frequent_nouns(text):
    tokens = word_tokenize(text)
    tagged_words = pos_tag(tokens)
    fdist = FreqDist(tag for word, tag in tagged_words if tag.startswith('NN'))
    return fdist.most_common(10)

In [11]:
# Map Words to Properties Using Python Dictionaries
word_properties = {
    'run': {'POS': 'verb', 'meaning': 'move swiftly'},
    'book': {'POS': 'noun', 'meaning': 'collection of pages'}
}


In [12]:
# Study Rule-Based Tagger and Unigram Tagger
def study_taggers():
    default_tagger = DefaultTagger('NN')
    unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
    sample_text = word_tokenize("The quick brown fox jumps over the lazy dog")
    print("Default Tagger Output:", default_tagger.tag(sample_text))
    print("Unigram Tagger Output:", unigram_tagger.tag(sample_text))

In [13]:
# Function to find words from a given text without spaces
def split_text_to_words(text, corpus_words):
    found_words = []
    i = 0
    while i < len(text):
        for j in range(i + 1, len(text) + 1):
            if text[i:j] in corpus_words:
                found_words.append(text[i:j])
                i = j - 1
                break
        i += 1
    return found_words, len(found_words)


In [14]:
# Example Usage
study_corpus()
study_cfd()
study_tagged_corpora()
study_taggers()

text = "runningbookfastcar"
corpus_words = set(brown.words())
found_words, score = split_text_to_words(text, corpus_words)
print("Extracted Words:", found_words)
print("Score:", score)

Brown Corpus Categories: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
First 100 words of Inaugural Corpus: ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ...]
First 100 words of Reuters Corpus: ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...]
First 100 words of UDHR Corpus: ['Universal', 'Declaration', 'of', 'Human', 'Rights', 'Preamble', 'Whereas', 'recognition', 'of', 'the', 'inherent', 'dignity', 'and', 'of', 'the', 'equal', 'and', 'inalienable', 'rights', 'of', 'all', 'members', 'of', 'the', 'human', 'family', 'is', 'the', 'foundation', 'of', 'freedom', ',', 'justice', 'and', 'peace', 'in', 'the', 'world', ',', 'Whereas', 'disregard', 'and', 'contempt', 'for', 'human', 'rights', 'have', 'resulted', 'in', 'barbarous', 'acts', 'which', 'have', 'outraged', 'the', 'conscience', 'of', 'mankind', ',', 'and', 'the', 'advent', 'of', 'a', 'wor