<a href="https://colab.research.google.com/github/ArvindSinghRawat/Spellcast-Bot/blob/feature%2Fv1%2Farvind%2Fscraping-logic/docs/scripts/NLTK_logic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logic to create a huge corpus of words using NLTK

## Setup

### Setting dependencies

In [None]:
# Install or upgrade NLTK in the per-user site-packages (IPython shell escape).
!pip3 install --user -U nltk

### General imports

In [None]:
import json
import re
from enum import Enum
from typing import List, Dict, Any, Optional

import nltk
from nltk.corpus import nombank, propbank, stopwords, brown

### Downloading the different corpora used to populate the data

In [None]:
# Fetch the NLTK data packages one after another, in a fixed order.
for package_name in ("brown", "nombank.1.0", "propbank", "stopwords"):
  nltk.download(package_name)

In [None]:
# Make sure every corpus is actually loaded before any cell reads from it.
for corpus in (brown, nombank, propbank, stopwords):
  corpus.ensure_loaded()

## Preprocessing

### Preparing types of words expected

In [None]:
# Key names for the per-tag metadata dictionaries.
DESCRIPTION = 'description'
WORD_LIST = 'word_list'

# Tag -> list of cleaned words collected for that tag.
word_dict = {}
# Tag -> metadata dictionary for every tag that should be kept.
known_types = {}

In [None]:
# Preparing types

# Tag -> human-readable description, in the order the tags are registered.
tag_descriptions = {
    'CC': 'coordinating conjunction (and, or)',
    'CD': 'cardinal numeral (one, two, 2, etc.)',
    'CS': 'subordinating conjunction (if, although)',
    'EX': 'existential there',
    'IN': 'preposition (in, at, on)',
    'JJ': 'adjective',
    'JJA': 'adjective + Auxiliary',
    'JJC': 'adjective, Comparative',
    'JJCC': 'Adjective + Conjunction',
    'JJS': 'semantically superlative adjective (chief, top)',
    'JJF': 'Adjective + Female',
    'JJM': 'Adjective + Male',
    'NN': 'singular or mass noun',
    'NNA': 'Noun + Auxiliary',
    'NNC': 'Noun + Conjunction',
    'NNS': 'plural noun',
    'NNP': 'proper noun or part of name phrase',
    'NNPC': 'proper noun + Conjunction',
    'PRP': 'personal pronoun, singular',
    'PRPS': 'personal pronoun, plural',
    'PRP$': 'Possessive pronoun',
    'RB': 'adverb',
    'RBR': 'comparative adverb',
    'RBS': 'superlative adverb',
    'STP': 'stopwords',
    'VB': 'verb, base form',
    'VBA': 'verb + Auxiliary, singular, present',
    'VBD': 'verb, past tense',
    'VBG': 'verb, present participle/gerund',
    'VBN': 'verb, past participle',
    'VBZ': 'verb, 3rd. singular present',
}

# Every tag gets its own fresh metadata dictionary holding the description.
known_types.update(
    {tag: {DESCRIPTION: text} for (tag, text) in tag_descriptions.items()}
)

### Methods to pre-process text or filter out unwanted words

In [None]:
def process_raw_text(line: str) -> Optional[str]:
  """Normalise one raw token into a lowercase word, or reject it.

  A hyphen followed by whitespace is removed, joining the text on either
  side, and surrounding whitespace is stripped before validation.

  Args:
    line: Raw token text.

  Returns:
    The lowercased word, or ``None`` when the token is rejected because it
    is shorter than two characters, contains non-alphabetic characters, is
    written entirely in upper case, or consists of one repeated character.
  """
  # Raw string: "\s" inside a plain literal is an invalid escape sequence.
  word = re.sub(r"-\s+", "", line).strip()
  if len(word) < 2 or not word.isalpha():
    return None
  # All-caps tokens are dropped.
  if word.isupper():
    return None
  # Runs of a single repeated character (e.g. "aa", "xxx") are dropped.
  if len(set(word)) == 1:
    return None
  return word.lower()

### Preparing map of words tagged with proper category

In [None]:
def find_total_count(word_dict):
  """Return the combined length of all values stored in ``word_dict``."""
  return sum(len(words) for words in word_dict.values())

In [9]:
# Brown corpus data processing

# Per-tag sets mirroring the lists in word_dict, so the duplicate check is a
# set lookup instead of a scan of an ever-growing list. Seeded from the
# current contents so re-running the cell still avoids duplicates.
seen_by_tag = {tag: set(words) for (tag, words) in word_dict.items()}

for (word, tag) in brown.tagged_words():
  if tag not in known_types:
    continue
  # Register the tag on first sight, even if this token ends up rejected.
  ls = word_dict.setdefault(tag, [])
  seen = seen_by_tag.setdefault(tag, set())
  word = process_raw_text(word)
  if word is not None and word not in seen:
    seen.add(word)
    # The list keeps first-seen order; the set only answers membership.
    ls.append(word)

find_total_count(word_dict)

35972

In [10]:
# # Stop words data processing

# stopword_list = word_dict.get('STP', None)
# if stopword_list is None:
#   stopword_list = list()
#   word_dict['STP'] = stopword_list
# for word in stopwords.words('english'):
#   word = process_raw_text(word)
#   if word is not None and word not in stopword_list:
#     stopword_list.append(word)
# stopword_list = None

# find_total_count(word_dict)

In [11]:
# Nombank data processing

# Make sure the 'NN' bucket exists, then extend it with unseen Nombank nouns.
if word_dict.get('NN', None) is None:
  word_dict['NN'] = list()
noun_list = word_dict['NN']
for raw_noun in nombank.nouns():
  word = process_raw_text(raw_noun)
  if word is None or word in noun_list:
    continue
  noun_list.append(word)
# Drop the temporary alias; the list itself stays in word_dict.
noun_list = None

find_total_count(word_dict)

36926

In [12]:
# Propbank data processing

# Make sure the 'VB' bucket exists, then extend it with unseen Propbank verbs.
if word_dict.get('VB', None) is None:
  word_dict['VB'] = list()
verb_list = word_dict['VB']
for raw_verb in propbank.verbs():
  word = process_raw_text(raw_verb)
  if word is None or word in verb_list:
    continue
  verb_list.append(word)
# Drop the temporary alias; the list itself stays in word_dict.
verb_list = None

find_total_count(word_dict)

38139

## Util classes and methods

In [13]:
# Defining constants
class WordType(Enum):
  """Categories a word can be classified under.

  Each member's value is a short, dot-terminated abbreviation string for
  the category.
  """
  VERB = 'v.'
  NOUN = 'n.'
  ABBREVIATION = 'abbr.'
  ADJECTIVE = 'adj.'
  ADVERB = 'adv.'
  NUMBER = 'num.'
  PRONOUN = 'p.'
  CONJUNCTION = 'conj.'
  PREPOSITION = 'pre.'
  INTERJECTION = 'int.'


In [14]:
def map_type(input: str) -> WordType:
  """Translate a short type label or a corpus tag into a ``WordType``.

  The input is stripped of surrounding whitespace and then matched first
  against a set of exact labels and, failing that, against tag prefixes.

  Returns:
    The matching ``WordType``, or ``None`` when the input is ``None``,
    blank, or not recognised.
  """
  if input is None:
    return None
  label = input.strip()
  if not label:
    return None

  # Labels that must match in full.
  exact_matches = {
      'v': WordType.VERB,
      'adj': WordType.ADJECTIVE,
      'n': WordType.NOUN,
      'abbr': WordType.ABBREVIATION,
      'adv': WordType.ADVERB,
      'CC': WordType.CONJUNCTION,
      'CS': WordType.CONJUNCTION,
      'CD': WordType.NUMBER,
      'EX': WordType.PRONOUN,
      'IN': WordType.PREPOSITION,
      'STP': WordType.PREPOSITION,
  }
  if label in exact_matches:
    return exact_matches[label]

  # Tag families: any tag starting with the prefix belongs to the category.
  prefix_matches = (
      ('VB', WordType.VERB),
      ('JJ', WordType.ADJECTIVE),
      ('NN', WordType.NOUN),
      ('RB', WordType.ADVERB),
      ('PR', WordType.PRONOUN),
  )
  for (prefix, word_type) in prefix_matches:
    if label.startswith(prefix):
      return word_type
  return None

## Actual processing

### Prepare standard Json
Like
```json
{
   "<ACTUAL_WORD>" : {
     "word": "<ACTUAL_WORD>",
     "meaning": [
      {
         "value" : "<ACTUAL_MEANING>",
         "type" : "NOUN|VERB|ADJ|ABBR",
         "index" : 0
      },
      {
         ... // Different meaning or type
      }
      ...  // Other meanings or types
   ]
   },
   ... // More Words
}
```

Here, the JSON object stored against the outermost root key, i.e. `<ACTUAL_WORD>`, is nullable; a null value means there is no context for the word.

#### Context and expectation
Here, none of the words had any meaning attached to them. So, the only things that need to be saved are the type and the word. Finally, the output would be:
```json
{
  "<ACTUAL_WORD>": {
    "meaning": [
      {
        "type": "VERB|NOUN|ADJECTIVE|ADVERB|NUMBER|PRONOUN|CONJUNCTION|PREPOSITION",
        "index": 0
      },
      {
        ...
      }
      ... // More types, maybe
    ]
  },
  ... // More words
}
```

In [15]:
# Build word -> {'meaning': [{'type': ..., 'index': ...}, ...]} from the
# per-tag word lists, recording each word type at most once per word.
standard_dict = dict()
for (tag, word_list) in word_dict.items():
  word_type = map_type(tag)
  if word_type is None:
    # map_type has no category for this tag, so there is nothing to record;
    # reading .name on None would raise AttributeError.
    continue
  for word in word_list:
    details = standard_dict.setdefault(word, dict())
    ls = details.setdefault('meaning', list())
    # Several tags map to the same type (e.g. every 'VB*' tag is a verb).
    if any(meaning.get('type', None) == word_type.name for meaning in ls):
      continue
    ls.append({
        'type': word_type.name,
        # Position of this entry within the word's meaning list.
        'index': len(ls)
    })

In [17]:
# Serializing json (compact separators, no extra whitespace)
import os

json_object = json.dumps(standard_dict, separators=(',', ':'))

# Writing to output/dictionary-nltk.json
output_path = "output/dictionary-nltk.json"
# Create the target directory first: open() raises FileNotFoundError when
# the directory does not exist.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as outfile:
    outfile.write(json_object)