In [1]:
import os
from random import choice

# load random story
def random_story(folder=None):
    if folder is None:
        folder = choice(os.listdir('ESL-stuff'))
        
    story = choice(os.listdir('ESL-stuff/' + folder))
    print(folder + '/' + story)
    with open('ESL-stuff/' + folder + '/' + story, 'r') as f:
        story = f.readlines()
        # get rid of empty lines
        story = [line for line in story if line != '\n']
    return story

In [2]:
def get_random_sentence(story):
    """Get a random sentence from a story."""
    paragraph = choice(story).split('. ')
    sentence = choice(paragraph)
    if len(sentence) < 4: return get_random_sentence(story)

    if sentence[0].islower(): return get_random_sentence(story)
    
    return sentence + "."

In [3]:
story = random_story("level")
story

level/file-3-24.txt


['Kenneth Baker woke up early Sunday morning. A terrible thing had happened. His cat was missing. "He must have run away again," thought Kenneth. He went around asking neighbors for information. "That\'s odd," said his neighbor Sandra. "My Chihuahua went missing last night, too!" Kenneth kept walking around the neighborhood. He hoped to find his cat roaming the streets. Instead, Kenneth found something else. There were signs everywhere. They all said, "Missing cat" or "Missing Dog". "I don\'t think all these animals are running away," thought Kenneth. Then, Kenneth saw something that broke his heart. There was a trail of blood on the floor. It led to the mountain side. A coyote must be taking all the small pets in the night. ']

In [4]:
get_random_sentence(story)

'"He must have run away again," thought Kenneth.'

In [5]:
# find verb in sentence
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

def find_verb(sentence):
    verbs = []
    pos_tags = pos_tag(word_tokenize(sentence))
    print(pos_tags)
    for word, tag in pos_tags:
        if tag.startswith('VBD') or tag.startswith('VBP'):
            verbs.append(word)

    if len(verbs) == 0:
        for word, tag in pos_tags:
            if tag.startswith('VBZ') or tag.startswith('VB'):
                verbs.append(word)
    return verbs

In [6]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# nltk.download('brown')
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())

from nltk.corpus import wordnet

def create_similar_word_bank(words, num=4):
    bank = set()

    for word in words:
        # similar_words = wordnet.similar_tos('dog')
        for syn in wordnet.synsets(word):
            bank.add(syn.lemmas()[0].name())

        """ for word in similar_words:
            bank.append(word) """
        # bank.append(text.similar(word, num=num))

    return bank


In [8]:
sentence = get_random_sentence(random_story())
verbs = find_verb(sentence)
print("Similar Verbs", create_similar_word_bank(verbs))
new_sentence = [sentence.replace(verb, "_"*len(verb), 1) for verb in verbs]
print(new_sentence)

level/file-6-72.txt
[('Laura', 'NNP'), ('put', 'VBD'), ('the', 'DT'), ('book', 'NN'), ('back', 'RB'), ('in', 'IN'), ('her', 'PRP$'), ('bookshelf', 'NN'), ('and', 'CC'), ('never', 'RB'), ('touched', 'VBD'), ('it', 'PRP'), ('again', 'RB'), ('.', '.')]
Similar Verbs {'put_option', 'refer', 'place', 'touch', 'affect', 'reach', 'equal', 'tint', 'arrange', 'touched', 'fey', 'put', 'partake', 'invest', 'allude', 'moved', 'frame'}
['Laura ___ the book back in her bookshelf and never touched it again.', 'Laura put the book back in her bookshelf and never _______ it again.']


CC - coordinating conjunction

CD - cardinal number

DT - determiner

E - existential there (e.g., "there is")

FW - foreign word

IN - preposition or subordinating conjunction

JJ - adjective

JJR - comparative adjective

JJS - superlative adjective

LS - list item marker

MD - modal verb

NN - noun, singular or mass

NNS - noun, plural

NNP - proper noun, singular

NNPS - proper noun, plural

PDT - predeterminer

POS - possessive pronoun

PRP - personal pronoun

PRP$ - possessive pronoun

RB - adverb

RBR - comparative adverb

RBS - superlative adverb

詞 - Japanese particle

SYM - Chinese character

TO - infinitive marker

UH - interjection

VB - infinitive marker

VBD - past tense verb

VBG - past participle verb

VBN - past participle verb

VBP - present tense, perfect, and past participle verb

VBZ - present tense, present participle verb


In [9]:
from nltk.stem import WordNetLemmatizer

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def get_infinitive_form(word):
    return "to " + lemmatizer.lemmatize(word, pos='v')


In [10]:
sentences = []
problems = int(input("Enter the number of questions"))
while len(sentences) < problems:
    random_sentence = get_random_sentence(random_story())
    verbs = find_verb(random_sentence)

    if len(verbs) > 1 or len(verbs) == 0: continue

    new = random_sentence.replace(verbs[0], "_"*len(verbs[0]), 1)
    print(verbs[0])

    if new not in sentences:
        sentences.append([new + f" ({get_infinitive_form(verbs[0])})", verbs[0]])

essays/file-60.txt
[('Chicago', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('home', 'NN'), ('of', 'IN'), ('other', 'JJ'), ('famous', 'JJ'), ('firsts', 'NNS'), ('.', '.')]
is
level/file-3-86.txt
[('He', 'PRP'), ('needed', 'VBD'), ('to', 'TO'), ('watch', 'VB'), ('the', 'DT'), ('Baseball', 'NNP'), ('final', 'JJ'), ('.', '.')]
needed
essays/file-33.txt
[('If', 'IN'), ('you', 'PRP'), ('pass', 'VBP'), ('this', 'DT'), ('test', 'NN'), (',', ','), ('you', 'PRP'), ('can', 'MD'), ('practice', 'NN'), ('driving', 'VBG'), ('so', 'IN'), ('you', 'PRP'), ('can', 'MD'), ('pass', 'VB'), ('a', 'DT'), ('road', 'NN'), ('test', 'NN'), ('and', 'CC'), ('get', 'VB'), ('your', 'PRP$'), ('license', 'NN'), ('.', '.')]
pass
couple/file-9.txt
[('He', 'PRP'), ('stumbled', 'VBD'), ('upon', 'RP'), ('a', 'DT'), ('steakhouse', 'NN'), ('located', 'VBN'), ('a', 'DT'), ('few', 'JJ'), ('miles', 'NNS'), ('north', 'RB'), ('of', 'IN'), ('where', 'WRB'), ('they', 'PRP'), ('lived', 'VBD'), ('.', '.')]
essays/file-26.txt
[('begin', 'VB

In [11]:
worksheet = "\tFill in the Verb\n\n"

for index, sentence in enumerate(sentences): 
    worksheet += f"{index + 1}. {sentence[0]}\n"

worksheet += "\n\n\tAnswer Key\n\n"

for index, sentence in enumerate(sentences):
    worksheet += f"{index + 1}. {sentence[1]}\n"

print(worksheet)

	Fill in the Verb

1. Chicago __ the home of other famous firsts. (to be)
2. He ______ to watch the Baseball final. (to need)
3. If you ____ this test, you can practice driving so you can pass a road test and get your license. (to pass)
4.  begin school at age 5, when they __ to kindergarten. (to go)
5. Both the New York Philharmonic Orchestra and the Metropolitan Opera also ____ free performances in the park in the spring and summer. (to host)
6. She would make sure no one ___ looking at her. (to be)
7. He _______ two slices. (to grab)
8. Eastwood ___ born in 1930 in San Francisco before coming to Los Angeles at an early age. (to be)
9. She ____ into her car. (to get)
10. New Year's Day, January 1, __ a national holiday in the United States. (to be)


	Answer Key

1. is
2. needed
3. pass
4. go
5. host
6. was
7. grabbed
8. was
9. gets
10. is



In [12]:
from nltk.corpus import wordnet 

In [24]:
word = "American"

print("Alternate Definitions")
for syn in wordnet.synsets(word):
    print(syn.definition())

Alternate Definitions
a native or inhabitant of the United States
the English language as used in the United States
a native or inhabitant of a North American or Central American or South American country
of or relating to the United States of America or its people or language or culture
of or relating to or characteristic of the continents and islands of the Americas


In [14]:
# get all complex nouns in sentence
# input: sentence
# output: list of complex nouns
def get_complex_nouns(sentence):
    nouns = []
    for chunk in sentence.noun_chunks:
        if len(chunk.text.split()) > 1:
            nouns.append(chunk.text)
    return nouns

In [32]:
import nltk
from nltk.corpus import wordnet
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')

# Sample sentence
# sentence = "This is a complicated sentence with various intricate words and powerful emotions."
story = random_story()
sentence = ""

for sent in story:
    sentence += sent + " "

print(sentence)

# Tokenize the sentence
words = word_tokenize(sentence)

# Calculate word frequency using NLTK's FreqDist
word_freq = FreqDist(words)

corpus = nltk.corpus.brown.words()  # Replace with your own corpus if available
corpus_freq = FreqDist(corpus)

# Function to calculate lexical score based on word frequency
def calculate_lexical_score(word):
    word = word.lower()
    synsets = wordnet.synsets(word)
    if synsets:
        # Consider the number of synsets (senses) for the word
        return len(synsets) * 10 * len(word) / (corpus_freq[word] + 1)
    else:
        # Assign a low score to words not found in WordNet
        return 0  # You can adjust this as needed

# Calculate lexical scores for each word in the sentence
lexical_scores = {word: calculate_lexical_score(word) for word in words}

# Print the lexical scores
for word, score in lexical_scores.items():
    print(f"{word}: {score:.2f}")

# Print top 5 words with highest lexical scores
print("\nTop 5 words with highest lexical scores:")
for word, score in sorted(lexical_scores.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"{word}: {score:.2f}")

# function that returns sorted lexical scores
def get_lexical_scores(story):
    sentence = ""

    for sent in story:
        sentence += sent + " "

    words = word_tokenize(sentence.lower())
    # remove duplicate words in sentence
    words = list(dict.fromkeys(words))

    lexical_scores = {word: calculate_lexical_score(word) for word in words}
    return sorted(lexical_scores.items(), key=lambda x: x[1], reverse=True)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


customs/file-17.txt
Veterans Day is an official United States holiday observed on November 11th every year. While Memorial Day remembers those who have died while serving in any of the branches of the U.S. Armed Forces -- Army, Navy, Marines, and Coast Guard, Veterans Day honors all members of the armed forces, especially the living. As a U.S. federal holiday, government offices and schools are closed, and most people have the day off from work. Banks are also closed. If November 11th falls on a Sunday, then the holiday is observed on the following Monday. The holiday was first observed after World War I, which ended On November 11 1918. It was called Armistice Day then and was more about celebrating the absence of war and honoring those that served in that one war. It didn't become known as Veterans Day until 1954 after World War II and the Korean War, when there were many more members of the armed forces. Many restaurants offer free meals to veterans on the date. There is always a sp

In [16]:
import nltk
from nltk.tokenize import SyllableTokenizer

# Initialize the SyllableTokenizer
tokenizer = SyllableTokenizer()

def get_syllables(word):
    # Use the SyllableTokenizer to break the word into syllables
    syllables = tokenizer.tokenize(word)

    # Join the syllables with a dot (·)
    syllables_with_dots = ' · '.join(syllables)

    # Print the word with syllables separated by dots
    return syllables_with_dots


In [17]:
from nltk.corpus import cmudict
nltk.download('cmudict')

def get_sounds(text):
    d = cmudict.dict()

    phonetics = d[text.lower()]

    sounds = [sound[:-1] if sound[-1] in "0123" else sound for sound in phonetics[0]]
    line = " ".join(sounds)
    return line

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [18]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Sentence containing verbs with different tenses
story = random_story()
sentence = " ".join(story)
print(sentence)

# Tokenize the sentence
words = word_tokenize(sentence)

# Perform part-of-speech tagging
pos_tags = pos_tag(words)

# Initialize lists to categorize verbs by tense
infinitive_verbs = []
past_tense_verbs = []
present_participle_verbs = []
past_participle_verbs = []
present_simple_verbs = []

# Iterate through the tagged words to categorize verbs by tense
for word, tag in pos_tags:
    if tag == "VB":
        infinitive_verbs.append(word)
    elif tag == "VBD":
        past_tense_verbs.append(word)
    elif tag == "VBG":
        present_participle_verbs.append(word)
    elif tag == "VBN":
        past_participle_verbs.append(word)
    elif tag == "VBP" or tag == "VBZ":
        present_simple_verbs.append(word)

# Print the categorized verbs
print("Infinitive Verbs:", infinitive_verbs)
print("Past Tense Verbs:", past_tense_verbs)
print("Present Participle Verbs (Gerunds):", present_participle_verbs)
print("Past Participle Verbs:", past_participle_verbs)
print("Present Simple Verbs (3rd person singular and present participle):", present_simple_verbs)


essays/file-64.txt
Fires are destructive. They also create changes. In 1871 a fire burned most of Chicago to the ground. Many people think the fire was started when a cow knocked over a gas lamp. Most of the city at the time was built of wood. This, combined with the famous Chicago winds and a drought, made the city burn down quickly. The fire lasted for three days. 100,000 people were left homeless, and at least 300 were killed. What was amazing was how quickly the city was rebuilt, eventually becoming the third most populous city in the United States. The Great Boston Fire of 1872 created a property damage of $73.5 million,  more than any other fire in the U.S. history. Most of Downtown Boston and the financial district burned down in the fire that began in a warehouse basement. 30 people died, and thousands lost their jobs and their homes. However, the city was rebuilt in two years. It began enforcing building regulations because of the fire. The 1911 Triangle Shirtwaist Factory fir

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [50]:
from random import shuffle

# get a random story and create a function to get the 10 hardest words in it and create a worksheet with those words and their definitions
story = random_story()
hardest_words = [word[0] for word in get_lexical_scores(story)[:20]]
definitions = [wordnet.synsets(word)[0].definition() for word in hardest_words]

# remove duplicate definitions and words
for definition in definitions:
    if definitions.count(definition) > 1:
        index = definitions.index(definition)
        definitions.pop(index)
        hardest_words.pop(index)

answers = ""

for index, word in enumerate(hardest_words):
    answers += f"{index + 1}. {word} - {definitions[index]}\n"

shuffle(definitions)

alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".lower()

worksheet = "\tVocabulary Worksheet\n\n"
for index, word in enumerate(hardest_words):
    worksheet += f"{index + 1}. {word}\t\t\t{alphabet[index]}. {definitions[index]}\n"

worksheet += "\n\n\tAnswer Key\n\n"
worksheet += answers
print(worksheet)

couple/file-45.txt
	Vocabulary Worksheet

1. pensive			a. operate a dial to select a telephone number
2. sunday			b. device for converting sound waves into electrical energy
3. picking			c. a dark region of considerable extent on the surface of the moon
4. cans			d. deeply or seriously thoughtful
5. maria			e. as cold as ice
6. mike			f. yielding readily to pressure or weight
7. ice-cold			g. deliver a sharp blow or push :
8. cleaners			h. 1/60 of a minute; the basic unit of time adopted under the Systeme International d'Unites
9. dialed			i. first day of the week; observed as a day of rest and worship by most Christians
10. seconds			j. underpants worn by women
11. pants			k. the quantity of a crop that is harvested
12. hang			l. a special way of doing something
13. invited			m. pat or squeeze fondly or playfully, especially under the chin
14. grabbed			n. airtight sealed metal container for food or drink or paint etc.
15. cheerfully			o. for a short time
16. couch			p. take hold of s

[('siblings', 80.0),
 ('keys', 47.05882352941177),
 ('TV', 40.0),
 ('clean', 23.846153846153847),
 ('drove', 19.841269841269842),
 ('drive', 18.085106382978722),
 ('offered', 10.833333333333334),
 ('I', 10.0),
 ('complained', 8.695652173913043),
 ('watching', 8.533333333333333)]