# Baisc Setting

In [None]:
# Basic libraries
import os
import re
import time
import pandas as pd
from collections import Counter

# NLTK
import nltk
from nltk.corpus import brown, gutenberg, webtext, reuters, movie_reviews

# Stanza
import stanza

# Google Colab
from google.colab import drive

# NLTK downloads
nltk.download('brown')
nltk.download('gutenberg')
nltk.download('webtext')
nltk.download('reuters')
nltk.download('movie_reviews')
nltk.download('averaged_perceptron_tagger')

# Stanza download
stanza.download('en')

# Mount Google Drive
drive.mount('/content/drive')

# Example usage of corpora
corpora = {
    "Brown": brown.sents(),
    "Gutenberg": gutenberg.sents(),
    "Webtext": webtext.sents(),
    "Reuters": reuters.sents(),
    "Movie Reviews": movie_reviews.sents()
}

for corpus_name, sentences in corpora.items():
    print(f"{corpus_name} Corpus example sentence:", sentences[0])

# nltk.download('conll2000')
# from nltk.corpus import conll2000
# conll2000_sentences = conll2000.sents()
# print("Conll2000 Corpus example sentence:", conll2000_sentences[0])

# Standford CoreNLP

In [None]:
# # install Java
# !apt-get install -y openjdk-11-jdk

# # download and unzip Stanford CoreNLP
# !wget http://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip
# !unzip stanford-corenlp-4.4.0.zip


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java libatk-wrapper-java-jni libfontenc1
  libice-dev libsm-dev libxkbfile1 libxt-dev libxtst6 libxxf86dga1 openjdk-11-jre x11-utils
Suggested packages:
  libice-doc libsm-doc libxt-doc openjdk-11-demo openjdk-11-source visualvm mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java libatk-wrapper-java-jni libfontenc1
  libice-dev libsm-dev libxkbfile1 libxt-dev libxtst6 libxxf86dga1 openjdk-11-jdk openjdk-11-jre
  x11-utils
0 upgraded, 14 newly installed, 0 to remove and 45 not upgraded.
Need to get 5,521 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:2 http://archive.ubuntu.com/ubun

In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.8.2-py3-none-any.whl (990 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.1/990.1 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji (from stanza)
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.3.0->stanza)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.3.0->stanza)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.3.0->stanza)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.3.0->stanza)
  Using cached nvidia_cudnn_cu12-8.9.2.26-p

In [None]:
import stanza

# download model
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


In [None]:
# import os
# from subprocess import Popen, PIPE

# # set CoreNLP path
# path_to_corenlp = "/content/stanford-corenlp-4.4.0"

# # activate CoreNLP server
# corenlp_server = Popen(['java', '-mx4g', '-cp', f'{path_to_corenlp}/*', 'edu.stanford.nlp.pipeline.StanfordCoreNLPServer'],
#                         stdout=PIPE, stderr=PIPE)


In [None]:
# load stanza pipeline
nlp = stanza.Pipeline('en', processors='tokenize,lemma,pos,depparse')

# processing
doc = nlp("Stanford University is located in California. It is a great university.")

# output
for sentence in doc.sentences:
    for word in sentence.words:
        print(f'text: {word.text}\tlemma: {word.lemma}\tpos: {word.pos}\thead: {word.head}\tdeprel: {word.deprel}')


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


text: Stanford	lemma: Stanford	pos: PROPN	head: 2	deprel: compound
text: University	lemma: University	pos: PROPN	head: 4	deprel: nsubj:pass
text: is	lemma: be	pos: AUX	head: 4	deprel: aux:pass
text: located	lemma: locate	pos: VERB	head: 0	deprel: root
text: in	lemma: in	pos: ADP	head: 6	deprel: case
text: California	lemma: California	pos: PROPN	head: 4	deprel: obl
text: .	lemma: .	pos: PUNCT	head: 4	deprel: punct
text: It	lemma: it	pos: PRON	head: 5	deprel: nsubj
text: is	lemma: be	pos: AUX	head: 5	deprel: cop
text: a	lemma: a	pos: DET	head: 5	deprel: det
text: great	lemma: great	pos: ADJ	head: 5	deprel: amod
text: university	lemma: university	pos: NOUN	head: 0	deprel: root
text: .	lemma: .	pos: PUNCT	head: 5	deprel: punct


In [None]:
# corenlp_server.terminate()

# Brown

In [None]:
# Define superlative patterns
superlative_patterns = [
    (re.compile(r'\b\w+est\b'), 'ADJ'),
    (re.compile(r'\b\w+iest\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+ly\b'), 'ADV'),
    (re.compile(r'\b(?:foremost|hindmost|inmost|innermost|nethermost|outmost|outermost|topmost|undermost|upmost|uppermost|utmost|uttermost)\b'), 'ADJ'),
    (re.compile(r'\b(?:ablest|angriest|baldest|battiest|beadiest|bitterest|blackest|blandest|blankest|bleakest|blondest|bloodiest|bluest|bluntest|blurriest|boldest|bossiest|bounciest|brainiest|brashest|brassiest|bravest|brawniest|breeziest|briefest|brightest|briskest|broadest|brownest|bubbliest|bulkiest|bumpiest|burliest|bushiest|busiest|calmest|chattiest|cheapest|cheekiest|cheeriest|chewiest|chilliest|choicest|choppiest|chubbiest|chunkiest|clammiest|classiest|cleanest|clearest|cleverest|closest|cloudiest|clumsiest|coarsest|coldest|coolest|corniest|coziest|crabbiest|craftiest|crankiest|craziest|creakiest|creamiest|creepiest|crispest|crispiest|crudest|cruelest|crumbliest|crunchiest|crustiest|cuddliest|curliest|curviest|cutest|daffiest|daintiest|dampest|dandiest|darkest|deadliest|deepest|densest|dingiest|dirtiest|dizziest|dreamiest|dreariest|dressiest|driest|droopiest|drowsiest|dullest|dumbest|dustiest|earliest|easiest|edgiest|eeriest|emptiest|evilest|faintest|fairest|falsest|fanciest|fattest|faultiest|feeblest|fewest|ficklest|fiercest|fieriest|filmiest|filthiest|finest|firmest|fittest|flabbiest|flakiest|flashiest|flattest|flimsiest|floppiest|floweriest|fluffiest|foamiest|foggiest|fondest|foolhardiest|frailest|frankest|freakiest|freest|freshest|friendliest|frilliest|friskiest|frostiest|frothiest|fruitiest|frumpiest|fullest|funniest|furriest|fussiest|fuzziest|gabbiest|gaudiest|gauntest|gawkiest|gentlest|ghastliest|giddiest|glassiest|gloomiest|glossiest|goofiest|grainiest|grandest|gravest|greasiest|greatest|greediest|greenest|grimiest|grittiest|groggiest|grossest|grouchiest|grubbiest|gruffest|grumpiest|guiltiest|gustiest|gutsiest|hairiest|handiest|handsomest|happiest|hardest|hardiest|harshest|hastiest|haughtiest|haziest|healthiest|heartiest|heaviest|heftiest|highest|hippest|hoarsest|hollowest|homeliest|hottest|hugest|humblest|hungriest|huskiest|iciest|ickiest|itchiest|itty-bittiest|jazziest|jerkiest|jolliest|juiciest|kindest|kindliest|kingliest|knobbiest|knottiest|laciest|largest|laziest|leanest|lengthiest|lightest|likeliest|littlest|liveliest|loneliest|longest|loosest|loudest|lousiest|loveliest|lowest|lowliest|luckiest|lumpiest|maddest|meanest|meekest|mellowest|merriest|messiest|mightiest|mildest|mistiest|moistest|moldiest|moodiest|muddiest|muggiest|murkiest|mushiest|narrowest|nastiest|naughtiest|neatest|neediest|newest|nicest|niftiest|nimblest|noblest|noisiest|nosiest|numbest|nuttiest|obscurest|oddest|oiliest|oldest|orneriest|palest|paltriest|perkiest|pettiest|pinkest|plainest|pleasantest|pluckiest|plumpest|plushest|politest|poorest|portliest|prettiest|prickliest|primmest|prissiest|promptest|proudest|puffiest|puniest|purest|pushiest|quaintest|queasiest|queenliest|quickest|quietest|quirkiest|rainiest|rarest|rashest|raspiest|rattiest|rawest|reddest|remotest|richest|ripest|riskiest|ritziest|roomiest|rosiest|rottenest|roughest|roundest|rudest|rustiest|saddest|safest|saintliest|saltiest|sandiest|sanest|sappiest|sassiest|sauciest|scaliest|scantiest|scarcest|scariest|scraggliest|scrappiest|scratchiest|scrawniest|scruffiest|scummiest|securest|seediest|seemliest|serenest|severest|shabbiest|shadiest|shaggiest|shakiest|shallowest|sharpest|shiest|shiftiest|shiniest|shoddiest|shortest|showiest|shrewdest|shrillest|shyest|sickest|sickliest|silkiest|silliest|simplest|sincerest|sketchiest|skimpiest|skinniest|sleekest|sleepiest|slickest|sliest|slightest|slimiest|slimmest|slipperiest|sloppiest|slowest|smallest|smartest|smelliest|smoggiest|smokiest|smoothest|snappiest|sneakiest|snootiest|snottiest|snuggest|softest|soggiest|soonest|sorest|sorriest|sourest|sparsest|speediest|spiciest|spiffiest|spikiest|spookiest|spriest|spryest|squarest|squiggliest|stalest|starkest|stateliest|staunchest|steadiest|steepest|sternest|stickiest|stiffest|stillest|stingiest|stodgiest|stormiest|straggliest|straightest|strangest|strictest|strongest|stubbiest|stuffiest|sturdiest|subtlest|sulkiest|sunniest|surest|surliest|swankiest|sweatiest|sweetest|swiftest|tackiest|tallest|tamest|tangiest|tannest|tardiest|tartest|tastiest|tautest|teeniest|teensiest|teeny-tiniest|tersest|testiest|thickest|thinnest|thirstiest|thorniest|thriftiest|tidiest|tightest|timeliest|tiniest|toothiest|toughest|trashiest|trendiest|trickiest|trimmest|truest|trustiest|twitchiest|ugliest|unhappiest|unlikeliest|unluckiest|unruliest|vaguest|vainest|vilest|wackiest|wariest|warmest|wateriest|weakest|wealthiest|weariest|weediest|weirdest|wettest|whitest|wickedest|widest|wiggliest|wildest|windiest|wisest|wispiest|wittiest|wobbliest|wooziest|wordiest|worldliest|worthiest|wriest|wryest|yummiest|zaniest|zestiest|ablest|biggest|bravest|cleverest|fattest|greatest|hottest|kindest|noblest|saddest|smallest|sweetest|whitest|wisest|youngest)\b'), 'ADJ'),
    (re.compile(r'\b(?:most beautiful|most boring|most colorful|most comfortable|most complete|most cruel|most delicious|most difficult|most evil|most expensive|most famous|most foolish|most friendly|most generous|most important|most interesting|most modern|most nervous|most popular|most renowned|most tangled|most tilted|most tired|least energetic)\b'), 'ADJ')
]

common_superlatives = {
    'best', 'worst', 'furthest', 'farthest', 'least', 'most', 'latest', 'last', 'nearest', 'dearest'
}

# Get all sentences from the Brown corpus
sentences = brown.sents()

In [None]:
# Function to initialize Stanza pipeline
def initialize_pipeline():
    return stanza.Pipeline('en', processors='tokenize,pos,lemma', use_gpu=False)

# Function to process a batch of sentences
def process_sentence_batch(sentences_batch, nlp):
    batch_adjectives = set()
    batch_adverbs = set()
    for sentence in sentences_batch:
        sentence_text = ' '.join(sentence)
        sentence_text_no_punct = re.sub(r'[.,!?;:(){}\[\]\'"@#$%^&*+=|\\/<>\~`]', '', sentence_text)
        found_adjectives = set()
        found_adverbs = set()

        # Use regex patterns to find superlatives
        for pattern, pos in superlative_patterns:
            matches = pattern.findall(sentence_text_no_punct)
            if pos == 'ADJ':
                found_adjectives.update(match.lower() for match in matches)
            elif pos == 'ADV':
                found_adverbs.update(match.lower() for match in matches)

        # Match common superlatives
        found_adjectives.update(word for word in common_superlatives if word in sentence_text_no_punct.lower())

        # If no matches found, use Stanza to find superlatives
        if not found_adjectives and not found_adverbs:
            doc = nlp(sentence_text)
            for sent in doc.sentences:
                for word in sent.words:
                    if word.upos == 'ADJ' and word.feats and 'Degree=Sup' in word.feats:
                        found_adjectives.add(word.text.lower().replace(' ', '-'))
                    elif word.upos == 'ADV' and word.feats and 'Degree=Sup' in word.feats:
                        found_adverbs.add(word.text.lower().replace(' ', '-'))

        batch_adjectives.update(found_adjectives)
        batch_adverbs.update(found_adverbs)

    return batch_adjectives, batch_adverbs

# Function to process all sentences
def process_sentences(sentences, batch_size):
    adjectives = set()
    adverbs = set()

    nlp = initialize_pipeline()

    num_batches = len(sentences) // batch_size + (1 if len(sentences) % batch_size != 0 else 0)
    for i in range(num_batches):
        batch = sentences[i*batch_size:(i+1)*batch_size]
        batch_adjectives, batch_adverbs = process_sentence_batch(batch, nlp)
        adjectives.update(batch_adjectives)
        adverbs.update(batch_adverbs)

    return adjectives, adverbs

# Record start time
start_time = time.time()

# Process sentences
batch_size = 4096
superlative_adjectives, superlative_adverbs = process_sentences(sentences, batch_size)

# Record end time
end_time = time.time()

# Calculate execution time
execution_time = end_time - start_time

# Print execution time
print("\nExecution Time:", execution_time, "seconds")

df_brown_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_brown_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

print("Superlative Adjectives:")
print(df_brown_superlative_adjectives.head(20))

print("\nSuperlative Adverbs:")
print(df_brown_superlative_adverbs.head(20))

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!



Execution Time: 14869.028860330582 seconds
Superlative Adjectives:
                Word
0   most circumspect
1     most elaborate
2         most tooth
3         most drill
4   most enthralling
5   most susceptible
6    most determined
7   most pervasively
8        most recent
9     most expensive
10     best location
11          steepest
12    most endearing
13        most noise
14    most politicos
15      most victims
16       most casual
17      most authors
18       most people
19  most pretentious

Superlative Adverbs:
                 Word
0      most obviously
1        most closely
2         most easily
3       most commonly
4     most eloquently
5    most politically
6     most peacefully
7        least partly
8      most seriously
9    most classically
10      most probably
11         most truly
12   most pervasively
13  most artistically
14   most financially
15  most dramatically
16  most responsively
17       least highly
18        most firmly
19   least apparently


In [None]:
output_dir = '/content/drive/My Drive/'
df_brown_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_brown_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

df_brown_superlative_adjectives.to_csv(os.path.join(output_dir, 'brown_superlative_adjectives.csv'), index=False)
df_brown_superlative_adverbs.to_csv(os.path.join(output_dir, 'brown_superlative_adverbs.csv'), index=False)

print("Superlative Adjectives saved to Google Drive.")
print("Superlative Adverbs saved to Google Drive.")

Superlative Adjectives saved to Google Drive.
Superlative Adverbs saved to Google Drive.


# webtext

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Define superlative patterns
superlative_patterns = [
    (re.compile(r'\b\w+est\b'), 'ADJ'),
    (re.compile(r'\b\w+iest\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+ly\b'), 'ADV'),
    (re.compile(r'\b(?:foremost|hindmost|inmost|innermost|nethermost|outmost|outermost|topmost|undermost|upmost|uppermost|utmost|uttermost)\b'), 'ADJ'),
    (re.compile(r'\b(?:ablest|angriest|baldest|battiest|beadiest|bitterest|blackest|blandest|blankest|bleakest|blondest|bloodiest|bluest|bluntest|blurriest|boldest|bossiest|bounciest|brainiest|brashest|brassiest|bravest|brawniest|breeziest|briefest|brightest|briskest|broadest|brownest|bubbliest|bulkiest|bumpiest|burliest|bushiest|busiest|calmest|chattiest|cheapest|cheekiest|cheeriest|chewiest|chilliest|choicest|choppiest|chubbiest|chunkiest|clammiest|classiest|cleanest|clearest|cleverest|closest|cloudiest|clumsiest|coarsest|coldest|coolest|corniest|coziest|crabbiest|craftiest|crankiest|craziest|creakiest|creamiest|creepiest|crispest|crispiest|crudest|cruelest|crumbliest|crunchiest|crustiest|cuddliest|curliest|curviest|cutest|daffiest|daintiest|dampest|dandiest|darkest|deadliest|deepest|densest|dingiest|dirtiest|dizziest|dreamiest|dreariest|dressiest|driest|droopiest|drowsiest|dullest|dumbest|dustiest|earliest|easiest|edgiest|eeriest|emptiest|evilest|faintest|fairest|falsest|fanciest|fattest|faultiest|feeblest|fewest|ficklest|fiercest|fieriest|filmiest|filthiest|finest|firmest|fittest|flabbiest|flakiest|flashiest|flattest|flimsiest|floppiest|floweriest|fluffiest|foamiest|foggiest|fondest|foolhardiest|frailest|frankest|freakiest|freest|freshest|friendliest|frilliest|friskiest|frostiest|frothiest|fruitiest|frumpiest|fullest|funniest|furriest|fussiest|fuzziest|gabbiest|gaudiest|gauntest|gawkiest|gentlest|ghastliest|giddiest|glassiest|gloomiest|glossiest|goofiest|grainiest|grandest|gravest|greasiest|greatest|greediest|greenest|grimiest|grittiest|groggiest|grossest|grouchiest|grubbiest|gruffest|grumpiest|guiltiest|gustiest|gutsiest|hairiest|handiest|handsomest|happiest|hardest|hardiest|harshest|hastiest|haughtiest|haziest|healthiest|heartiest|heaviest|heftiest|highest|hippest|hoarsest|hollowest|homeliest|hottest|hugest|humblest|hungriest|huskiest|iciest|ickiest|itchiest|itty-bittiest|jazziest|jerkiest|jolliest|juiciest|kindest|kindliest|kingliest|knobbiest|knottiest|laciest|largest|laziest|leanest|lengthiest|lightest|likeliest|littlest|liveliest|loneliest|longest|loosest|loudest|lousiest|loveliest|lowest|lowliest|luckiest|lumpiest|maddest|meanest|meekest|mellowest|merriest|messiest|mightiest|mildest|mistiest|moistest|moldiest|moodiest|muddiest|muggiest|murkiest|mushiest|narrowest|nastiest|naughtiest|neatest|neediest|newest|nicest|niftiest|nimblest|noblest|noisiest|nosiest|numbest|nuttiest|obscurest|oddest|oiliest|oldest|orneriest|palest|paltriest|perkiest|pettiest|pinkest|plainest|pleasantest|pluckiest|plumpest|plushest|politest|poorest|portliest|prettiest|prickliest|primmest|prissiest|promptest|proudest|puffiest|puniest|purest|pushiest|quaintest|queasiest|queenliest|quickest|quietest|quirkiest|rainiest|rarest|rashest|raspiest|rattiest|rawest|reddest|remotest|richest|ripest|riskiest|ritziest|roomiest|rosiest|rottenest|roughest|roundest|rudest|rustiest|saddest|safest|saintliest|saltiest|sandiest|sanest|sappiest|sassiest|sauciest|scaliest|scantiest|scarcest|scariest|scraggliest|scrappiest|scratchiest|scrawniest|scruffiest|scummiest|securest|seediest|seemliest|serenest|severest|shabbiest|shadiest|shaggiest|shakiest|shallowest|sharpest|shiest|shiftiest|shiniest|shoddiest|shortest|showiest|shrewdest|shrillest|shyest|sickest|sickliest|silkiest|silliest|simplest|sincerest|sketchiest|skimpiest|skinniest|sleekest|sleepiest|slickest|sliest|slightest|slimiest|slimmest|slipperiest|sloppiest|slowest|smallest|smartest|smelliest|smoggiest|smokiest|smoothest|snappiest|sneakiest|snootiest|snottiest|snuggest|softest|soggiest|soonest|sorest|sorriest|sourest|sparsest|speediest|spiciest|spiffiest|spikiest|spookiest|spriest|spryest|squarest|squiggliest|stalest|starkest|stateliest|staunchest|steadiest|steepest|sternest|stickiest|stiffest|stillest|stingiest|stodgiest|stormiest|straggliest|straightest|strangest|strictest|strongest|stubbiest|stuffiest|sturdiest|subtlest|sulkiest|sunniest|surest|surliest|swankiest|sweatiest|sweetest|swiftest|tackiest|tallest|tamest|tangiest|tannest|tardiest|tartest|tastiest|tautest|teeniest|teensiest|teeny-tiniest|tersest|testiest|thickest|thinnest|thirstiest|thorniest|thriftiest|tidiest|tightest|timeliest|tiniest|toothiest|toughest|trashiest|trendiest|trickiest|trimmest|truest|trustiest|twitchiest|ugliest|unhappiest|unlikeliest|unluckiest|unruliest|vaguest|vainest|vilest|wackiest|wariest|warmest|wateriest|weakest|wealthiest|weariest|weediest|weirdest|wettest|whitest|wickedest|widest|wiggliest|wildest|windiest|wisest|wispiest|wittiest|wobbliest|wooziest|wordiest|worldliest|worthiest|wriest|wryest|yummiest|zaniest|zestiest|ablest|biggest|bravest|cleverest|fattest|greatest|hottest|kindest|noblest|saddest|smallest|sweetest|whitest|wisest|youngest)\b'), 'ADJ'),
    (re.compile(r'\b(?:most beautiful|most boring|most colorful|most comfortable|most complete|most cruel|most delicious|most difficult|most evil|most expensive|most famous|most foolish|most friendly|most generous|most important|most interesting|most modern|most nervous|most popular|most renowned|most tangled|most tilted|most tired|least energetic)\b'), 'ADJ')
]

common_superlatives = {
    'best', 'worst', 'furthest', 'farthest', 'least', 'most', 'latest', 'last', 'nearest', 'dearest'
}

# Get all sentences from the Brown corpus
sentences = webtext.sents()

In [None]:
# Function to initialize Stanza pipeline
def initialize_pipeline():
    return stanza.Pipeline('en', processors='tokenize,pos,lemma', use_gpu=False)

# Function to process a batch of sentences
def process_sentence_batch(sentences_batch, nlp):
    batch_adjectives = set()
    batch_adverbs = set()
    for sentence in sentences_batch:
        sentence_text = ' '.join(sentence)
        sentence_text_no_punct = re.sub(r'[.,!?;:(){}\[\]\'"@#$%^&*+=|\\/<>\~`]', '', sentence_text)
        found_adjectives = set()
        found_adverbs = set()

        # Use regex patterns to find superlatives
        for pattern, pos in superlative_patterns:
            matches = pattern.findall(sentence_text_no_punct)
            if pos == 'ADJ':
                found_adjectives.update(match.lower() for match in matches)
            elif pos == 'ADV':
                found_adverbs.update(match.lower() for match in matches)

        # Match common superlatives
        found_adjectives.update(word for word in common_superlatives if word in sentence_text_no_punct.lower())

        # If no matches found, use Stanza to find superlatives
        if not found_adjectives and not found_adverbs:
            doc = nlp(sentence_text)
            for sent in doc.sentences:
                for word in sent.words:
                    if word.upos == 'ADJ' and word.feats and 'Degree=Sup' in word.feats:
                        found_adjectives.add(word.text.lower().replace(' ', '-'))
                    elif word.upos == 'ADV' and word.feats and 'Degree=Sup' in word.feats:
                        found_adverbs.add(word.text.lower().replace(' ', '-'))

        batch_adjectives.update(found_adjectives)
        batch_adverbs.update(found_adverbs)

    return batch_adjectives, batch_adverbs

# Function to process all sentences
def process_sentences(sentences, batch_size):
    adjectives = set()
    adverbs = set()

    nlp = initialize_pipeline()

    num_batches = len(sentences) // batch_size + (1 if len(sentences) % batch_size != 0 else 0)
    for i in range(num_batches):
        batch = sentences[i*batch_size:(i+1)*batch_size]
        batch_adjectives, batch_adverbs = process_sentence_batch(batch, nlp)
        adjectives.update(batch_adjectives)
        adverbs.update(batch_adverbs)

    return adjectives, adverbs

# Record start time
start_time = time.time()

# Process sentences
batch_size = 4096
superlative_adjectives, superlative_adverbs = process_sentences(sentences, batch_size)

# Record end time
end_time = time.time()

# Calculate execution time
execution_time = end_time - start_time

# Print execution time
print("\nExecution Time:", execution_time, "seconds")

df_webtext0_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_webtext0_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

print("Superlative Adjectives:")
print(df_webtext0_superlative_adjectives.head(20))

print("\nSuperlative Adverbs:")
print(df_webtext0_superlative_adverbs.head(20))

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!



Execution Time: 3493.663732767105 seconds
Superlative Adjectives:
              Word
0       least give
1        least not
2     best crashes
3          nearest
4       least five
5           newest
6       worst ones
7          least 3
8     most complex
9      most recent
10  most expensive
11    least finish
12        least we
13         least a
14         longest
15  most endearing
16        best pot
17        cheapest
18     most people
19        worstest

Superlative Adverbs:
            Word
0  least  lovely
1   most awfully
2      most holy


In [None]:
# save to csv
output_dir = '/content/drive/My Drive/'
df_webtext0_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_webtext0_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

df_webtext0_superlative_adjectives.to_csv(os.path.join(output_dir, 'webtext0_superlative_adjectives.csv'), index=False)
df_webtext0_superlative_adverbs.to_csv(os.path.join(output_dir, 'webtext0_superlative_adverbs.csv'), index=False)

print("Superlative Adjectives saved to Google Drive.")
print("Superlative Adverbs saved to Google Drive.")

Superlative Adjectives saved to Google Drive.
Superlative Adverbs saved to Google Drive.


# Reuters

In [None]:
# Define superlative patterns
superlative_patterns = [
    (re.compile(r'\b\w+est\b'), 'ADJ'),
    (re.compile(r'\b\w+iest\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+ly\b'), 'ADV'),
    (re.compile(r'\b(?:foremost|hindmost|inmost|innermost|nethermost|outmost|outermost|topmost|undermost|upmost|uppermost|utmost|uttermost)\b'), 'ADJ'),
    (re.compile(r'\b(?:ablest|angriest|baldest|battiest|beadiest|bitterest|blackest|blandest|blankest|bleakest|blondest|bloodiest|bluest|bluntest|blurriest|boldest|bossiest|bounciest|brainiest|brashest|brassiest|bravest|brawniest|breeziest|briefest|brightest|briskest|broadest|brownest|bubbliest|bulkiest|bumpiest|burliest|bushiest|busiest|calmest|chattiest|cheapest|cheekiest|cheeriest|chewiest|chilliest|choicest|choppiest|chubbiest|chunkiest|clammiest|classiest|cleanest|clearest|cleverest|closest|cloudiest|clumsiest|coarsest|coldest|coolest|corniest|coziest|crabbiest|craftiest|crankiest|craziest|creakiest|creamiest|creepiest|crispest|crispiest|crudest|cruelest|crumbliest|crunchiest|crustiest|cuddliest|curliest|curviest|cutest|daffiest|daintiest|dampest|dandiest|darkest|deadliest|deepest|densest|dingiest|dirtiest|dizziest|dreamiest|dreariest|dressiest|driest|droopiest|drowsiest|dullest|dumbest|dustiest|earliest|easiest|edgiest|eeriest|emptiest|evilest|faintest|fairest|falsest|fanciest|fattest|faultiest|feeblest|fewest|ficklest|fiercest|fieriest|filmiest|filthiest|finest|firmest|fittest|flabbiest|flakiest|flashiest|flattest|flimsiest|floppiest|floweriest|fluffiest|foamiest|foggiest|fondest|foolhardiest|frailest|frankest|freakiest|freest|freshest|friendliest|frilliest|friskiest|frostiest|frothiest|fruitiest|frumpiest|fullest|funniest|furriest|fussiest|fuzziest|gabbiest|gaudiest|gauntest|gawkiest|gentlest|ghastliest|giddiest|glassiest|gloomiest|glossiest|goofiest|grainiest|grandest|gravest|greasiest|greatest|greediest|greenest|grimiest|grittiest|groggiest|grossest|grouchiest|grubbiest|gruffest|grumpiest|guiltiest|gustiest|gutsiest|hairiest|handiest|handsomest|happiest|hardest|hardiest|harshest|hastiest|haughtiest|haziest|healthiest|heartiest|heaviest|heftiest|highest|hippest|hoarsest|hollowest|homeliest|hottest|hugest|humblest|hungriest|huskiest|iciest|ickiest|itchiest|itty-bittiest|jazziest|jerkiest|jolliest|juiciest|kindest|kindliest|kingliest|knobbiest|knottiest|laciest|largest|laziest|leanest|lengthiest|lightest|likeliest|littlest|liveliest|loneliest|longest|loosest|loudest|lousiest|loveliest|lowest|lowliest|luckiest|lumpiest|maddest|meanest|meekest|mellowest|merriest|messiest|mightiest|mildest|mistiest|moistest|moldiest|moodiest|muddiest|muggiest|murkiest|mushiest|narrowest|nastiest|naughtiest|neatest|neediest|newest|nicest|niftiest|nimblest|noblest|noisiest|nosiest|numbest|nuttiest|obscurest|oddest|oiliest|oldest|orneriest|palest|paltriest|perkiest|pettiest|pinkest|plainest|pleasantest|pluckiest|plumpest|plushest|politest|poorest|portliest|prettiest|prickliest|primmest|prissiest|promptest|proudest|puffiest|puniest|purest|pushiest|quaintest|queasiest|queenliest|quickest|quietest|quirkiest|rainiest|rarest|rashest|raspiest|rattiest|rawest|reddest|remotest|richest|ripest|riskiest|ritziest|roomiest|rosiest|rottenest|roughest|roundest|rudest|rustiest|saddest|safest|saintliest|saltiest|sandiest|sanest|sappiest|sassiest|sauciest|scaliest|scantiest|scarcest|scariest|scraggliest|scrappiest|scratchiest|scrawniest|scruffiest|scummiest|securest|seediest|seemliest|serenest|severest|shabbiest|shadiest|shaggiest|shakiest|shallowest|sharpest|shiest|shiftiest|shiniest|shoddiest|shortest|showiest|shrewdest|shrillest|shyest|sickest|sickliest|silkiest|silliest|simplest|sincerest|sketchiest|skimpiest|skinniest|sleekest|sleepiest|slickest|sliest|slightest|slimiest|slimmest|slipperiest|sloppiest|slowest|smallest|smartest|smelliest|smoggiest|smokiest|smoothest|snappiest|sneakiest|snootiest|snottiest|snuggest|softest|soggiest|soonest|sorest|sorriest|sourest|sparsest|speediest|spiciest|spiffiest|spikiest|spookiest|spriest|spryest|squarest|squiggliest|stalest|starkest|stateliest|staunchest|steadiest|steepest|sternest|stickiest|stiffest|stillest|stingiest|stodgiest|stormiest|straggliest|straightest|strangest|strictest|strongest|stubbiest|stuffiest|sturdiest|subtlest|sulkiest|sunniest|surest|surliest|swankiest|sweatiest|sweetest|swiftest|tackiest|tallest|tamest|tangiest|tannest|tardiest|tartest|tastiest|tautest|teeniest|teensiest|teeny-tiniest|tersest|testiest|thickest|thinnest|thirstiest|thorniest|thriftiest|tidiest|tightest|timeliest|tiniest|toothiest|toughest|trashiest|trendiest|trickiest|trimmest|truest|trustiest|twitchiest|ugliest|unhappiest|unlikeliest|unluckiest|unruliest|vaguest|vainest|vilest|wackiest|wariest|warmest|wateriest|weakest|wealthiest|weariest|weediest|weirdest|wettest|whitest|wickedest|widest|wiggliest|wildest|windiest|wisest|wispiest|wittiest|wobbliest|wooziest|wordiest|worldliest|worthiest|wriest|wryest|yummiest|zaniest|zestiest|ablest|biggest|bravest|cleverest|fattest|greatest|hottest|kindest|noblest|saddest|smallest|sweetest|whitest|wisest|youngest)\b'), 'ADJ'),
    (re.compile(r'\b(?:most beautiful|most boring|most colorful|most comfortable|most complete|most cruel|most delicious|most difficult|most evil|most expensive|most famous|most foolish|most friendly|most generous|most important|most interesting|most modern|most nervous|most popular|most renowned|most tangled|most tilted|most tired|least energetic)\b'), 'ADJ')
]

common_superlatives = {
    'best', 'worst', 'furthest', 'farthest', 'least', 'most', 'latest', 'last', 'nearest', 'dearest'
}

# Get all sentences from the Reuters corpus
sentences = reuters.sents()

In [None]:
# Function to initialize Stanza pipeline
def initialize_pipeline():
    return stanza.Pipeline('en', processors='tokenize,pos,lemma', use_gpu=False)

# Function to process a batch of sentences
def process_sentence_batch(sentences_batch, nlp):
    batch_adjectives = set()
    batch_adverbs = set()
    for sentence in sentences_batch:
        sentence_text = ' '.join(sentence)
        sentence_text_no_punct = re.sub(r'[.,!?;:(){}\[\]\'"@#$%^&*+=|\\/<>\~`]', '', sentence_text)
        found_adjectives = set()
        found_adverbs = set()

        # Use regex patterns to find superlatives
        for pattern, pos in superlative_patterns:
            matches = pattern.findall(sentence_text_no_punct)
            if pos == 'ADJ':
                found_adjectives.update(match.lower() for match in matches)
            elif pos == 'ADV':
                found_adverbs.update(match.lower() for match in matches)

        # Match common superlatives
        found_adjectives.update(word for word in common_superlatives if word in sentence_text_no_punct.lower())

        # If no matches found, use Stanza to find superlatives
        if not found_adjectives and not found_adverbs:
            doc = nlp(sentence_text)
            for sent in doc.sentences:
                for word in sent.words:
                    if word.upos == 'ADJ' and word.feats and 'Degree=Sup' in word.feats:
                        found_adjectives.add(word.text.lower().replace(' ', '-'))
                    elif word.upos == 'ADV' and word.feats and 'Degree=Sup' in word.feats:
                        found_adverbs.add(word.text.lower().replace(' ', '-'))

        batch_adjectives.update(found_adjectives)
        batch_adverbs.update(found_adverbs)

    return batch_adjectives, batch_adverbs

# Function to process all sentences
def process_sentences(sentences, batch_size):
    adjectives = set()
    adverbs = set()

    nlp = initialize_pipeline()

    num_batches = len(sentences) // batch_size + (1 if len(sentences) % batch_size != 0 else 0)
    for i in range(num_batches):
        batch = sentences[i*batch_size:(i+1)*batch_size]
        batch_adjectives, batch_adverbs = process_sentence_batch(batch, nlp)
        adjectives.update(batch_adjectives)
        adverbs.update(batch_adverbs)

    return adjectives, adverbs

# Record start time
start_time = time.time()

# Process sentences
batch_size = 4096
superlative_adjectives, superlative_adverbs = process_sentences(sentences, batch_size)

# Record end time
end_time = time.time()

# Calculate execution time
execution_time = end_time - start_time

# Print execution time
print("\nExecution Time:", execution_time, "seconds")

df_webtext_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_webtext_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

print("Superlative Adjectives:")
print(df_webtext_superlative_adjectives.head(20))

print("\nSuperlative Adverbs:")
print(df_webtext_superlative_adverbs.head(20))

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!



Execution Time: 15938.22265958786 seconds
Superlative Adjectives:
               Word
0      most closely
1      most private
2       most senior
3            newest
4         southwest
5     least another
6         least  it
7   most persuasive
8    most remaining
9       most recent
10     most reserve
11         most for
12   most expensive
13         steepest
14         least we
15        most fall
16      most liquid
17    most rainfall
18        worst for
19       worst year

Superlative Adverbs:
                   Word
0          most closely
1   least provisionally
2         most commonly
3      most politically
4         most unlikely
5       least initially
6         most probably
7          least likely
8        most seriously
9     most economically
10         most notably
11     most effectively
12            most only
13           least july
14        most actively
15      least partially
16          most likely
17        most strongly
18              highest


In [None]:
output_dir = '/content/drive/My Drive/'
df_reuters_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_reuters_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

df_reuters_superlative_adjectives.to_csv(os.path.join(output_dir, 'reuters_superlative_adjectives.csv'), index=False)
df_reuters_superlative_adverbs.to_csv(os.path.join(output_dir, 'reuters_superlative_adverbs.csv'), index=False)

print("Superlative Adjectives saved to Google Drive.")
print("Superlative Adverbs saved to Google Drive.")

Superlative Adjectives saved to Google Drive.
Superlative Adverbs saved to Google Drive.


# Gutenberg

In [None]:
# Define superlative patterns
superlative_patterns = [
    (re.compile(r'\b\w+est\b'), 'ADJ'),
    (re.compile(r'\b\w+iest\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+ly\b'), 'ADV'),
    (re.compile(r'\b(?:foremost|hindmost|inmost|innermost|nethermost|outmost|outermost|topmost|undermost|upmost|uppermost|utmost|uttermost)\b'), 'ADJ'),
    (re.compile(r'\b(?:ablest|angriest|baldest|battiest|beadiest|bitterest|blackest|blandest|blankest|bleakest|blondest|bloodiest|bluest|bluntest|blurriest|boldest|bossiest|bounciest|brainiest|brashest|brassiest|bravest|brawniest|breeziest|briefest|brightest|briskest|broadest|brownest|bubbliest|bulkiest|bumpiest|burliest|bushiest|busiest|calmest|chattiest|cheapest|cheekiest|cheeriest|chewiest|chilliest|choicest|choppiest|chubbiest|chunkiest|clammiest|classiest|cleanest|clearest|cleverest|closest|cloudiest|clumsiest|coarsest|coldest|coolest|corniest|coziest|crabbiest|craftiest|crankiest|craziest|creakiest|creamiest|creepiest|crispest|crispiest|crudest|cruelest|crumbliest|crunchiest|crustiest|cuddliest|curliest|curviest|cutest|daffiest|daintiest|dampest|dandiest|darkest|deadliest|deepest|densest|dingiest|dirtiest|dizziest|dreamiest|dreariest|dressiest|driest|droopiest|drowsiest|dullest|dumbest|dustiest|earliest|easiest|edgiest|eeriest|emptiest|evilest|faintest|fairest|falsest|fanciest|fattest|faultiest|feeblest|fewest|ficklest|fiercest|fieriest|filmiest|filthiest|finest|firmest|fittest|flabbiest|flakiest|flashiest|flattest|flimsiest|floppiest|floweriest|fluffiest|foamiest|foggiest|fondest|foolhardiest|frailest|frankest|freakiest|freest|freshest|friendliest|frilliest|friskiest|frostiest|frothiest|fruitiest|frumpiest|fullest|funniest|furriest|fussiest|fuzziest|gabbiest|gaudiest|gauntest|gawkiest|gentlest|ghastliest|giddiest|glassiest|gloomiest|glossiest|goofiest|grainiest|grandest|gravest|greasiest|greatest|greediest|greenest|grimiest|grittiest|groggiest|grossest|grouchiest|grubbiest|gruffest|grumpiest|guiltiest|gustiest|gutsiest|hairiest|handiest|handsomest|happiest|hardest|hardiest|harshest|hastiest|haughtiest|haziest|healthiest|heartiest|heaviest|heftiest|highest|hippest|hoarsest|hollowest|homeliest|hottest|hugest|humblest|hungriest|huskiest|iciest|ickiest|itchiest|itty-bittiest|jazziest|jerkiest|jolliest|juiciest|kindest|kindliest|kingliest|knobbiest|knottiest|laciest|largest|laziest|leanest|lengthiest|lightest|likeliest|littlest|liveliest|loneliest|longest|loosest|loudest|lousiest|loveliest|lowest|lowliest|luckiest|lumpiest|maddest|meanest|meekest|mellowest|merriest|messiest|mightiest|mildest|mistiest|moistest|moldiest|moodiest|muddiest|muggiest|murkiest|mushiest|narrowest|nastiest|naughtiest|neatest|neediest|newest|nicest|niftiest|nimblest|noblest|noisiest|nosiest|numbest|nuttiest|obscurest|oddest|oiliest|oldest|orneriest|palest|paltriest|perkiest|pettiest|pinkest|plainest|pleasantest|pluckiest|plumpest|plushest|politest|poorest|portliest|prettiest|prickliest|primmest|prissiest|promptest|proudest|puffiest|puniest|purest|pushiest|quaintest|queasiest|queenliest|quickest|quietest|quirkiest|rainiest|rarest|rashest|raspiest|rattiest|rawest|reddest|remotest|richest|ripest|riskiest|ritziest|roomiest|rosiest|rottenest|roughest|roundest|rudest|rustiest|saddest|safest|saintliest|saltiest|sandiest|sanest|sappiest|sassiest|sauciest|scaliest|scantiest|scarcest|scariest|scraggliest|scrappiest|scratchiest|scrawniest|scruffiest|scummiest|securest|seediest|seemliest|serenest|severest|shabbiest|shadiest|shaggiest|shakiest|shallowest|sharpest|shiest|shiftiest|shiniest|shoddiest|shortest|showiest|shrewdest|shrillest|shyest|sickest|sickliest|silkiest|silliest|simplest|sincerest|sketchiest|skimpiest|skinniest|sleekest|sleepiest|slickest|sliest|slightest|slimiest|slimmest|slipperiest|sloppiest|slowest|smallest|smartest|smelliest|smoggiest|smokiest|smoothest|snappiest|sneakiest|snootiest|snottiest|snuggest|softest|soggiest|soonest|sorest|sorriest|sourest|sparsest|speediest|spiciest|spiffiest|spikiest|spookiest|spriest|spryest|squarest|squiggliest|stalest|starkest|stateliest|staunchest|steadiest|steepest|sternest|stickiest|stiffest|stillest|stingiest|stodgiest|stormiest|straggliest|straightest|strangest|strictest|strongest|stubbiest|stuffiest|sturdiest|subtlest|sulkiest|sunniest|surest|surliest|swankiest|sweatiest|sweetest|swiftest|tackiest|tallest|tamest|tangiest|tannest|tardiest|tartest|tastiest|tautest|teeniest|teensiest|teeny-tiniest|tersest|testiest|thickest|thinnest|thirstiest|thorniest|thriftiest|tidiest|tightest|timeliest|tiniest|toothiest|toughest|trashiest|trendiest|trickiest|trimmest|truest|trustiest|twitchiest|ugliest|unhappiest|unlikeliest|unluckiest|unruliest|vaguest|vainest|vilest|wackiest|wariest|warmest|wateriest|weakest|wealthiest|weariest|weediest|weirdest|wettest|whitest|wickedest|widest|wiggliest|wildest|windiest|wisest|wispiest|wittiest|wobbliest|wooziest|wordiest|worldliest|worthiest|wriest|wryest|yummiest|zaniest|zestiest|ablest|biggest|bravest|cleverest|fattest|greatest|hottest|kindest|noblest|saddest|smallest|sweetest|whitest|wisest|youngest)\b'), 'ADJ'),
    (re.compile(r'\b(?:most beautiful|most boring|most colorful|most comfortable|most complete|most cruel|most delicious|most difficult|most evil|most expensive|most famous|most foolish|most friendly|most generous|most important|most interesting|most modern|most nervous|most popular|most renowned|most tangled|most tilted|most tired|least energetic)\b'), 'ADJ')
]

common_superlatives = {
    'best', 'worst', 'furthest', 'farthest', 'least', 'most', 'latest', 'last', 'nearest', 'dearest'
}

# Get all sentences from the Gutenberg corpus
sentences = reuters.sents()

In [None]:
# Function to initialize Stanza pipeline
def initialize_pipeline():
    return stanza.Pipeline('en', processors='tokenize,pos,lemma', use_gpu=False)

# Function to process a batch of sentences
def process_sentence_batch(sentences_batch, nlp):
    batch_adjectives = set()
    batch_adverbs = set()
    for sentence in sentences_batch:
        sentence_text = ' '.join(sentence)
        sentence_text_no_punct = re.sub(r'[.,!?;:(){}\[\]\'"@#$%^&*+=|\\/<>\~`]', '', sentence_text)
        found_adjectives = set()
        found_adverbs = set()

        # Use regex patterns to find superlatives
        for pattern, pos in superlative_patterns:
            matches = pattern.findall(sentence_text_no_punct)
            if pos == 'ADJ':
                found_adjectives.update(match.lower() for match in matches)
            elif pos == 'ADV':
                found_adverbs.update(match.lower() for match in matches)

        # Match common superlatives
        found_adjectives.update(word for word in common_superlatives if word in sentence_text_no_punct.lower())

        # If no matches found, use Stanza to find superlatives
        if not found_adjectives and not found_adverbs:
            doc = nlp(sentence_text)
            for sent in doc.sentences:
                for word in sent.words:
                    if word.upos == 'ADJ' and word.feats and 'Degree=Sup' in word.feats:
                        found_adjectives.add(word.text.lower().replace(' ', '-'))
                    elif word.upos == 'ADV' and word.feats and 'Degree=Sup' in word.feats:
                        found_adverbs.add(word.text.lower().replace(' ', '-'))

        batch_adjectives.update(found_adjectives)
        batch_adverbs.update(found_adverbs)

    return batch_adjectives, batch_adverbs

# Function to process all sentences
def process_sentences(sentences, batch_size):
    adjectives = set()
    adverbs = set()

    nlp = initialize_pipeline()

    num_batches = len(sentences) // batch_size + (1 if len(sentences) % batch_size != 0 else 0)
    for i in range(num_batches):
        batch = sentences[i*batch_size:(i+1)*batch_size]
        batch_adjectives, batch_adverbs = process_sentence_batch(batch, nlp)
        adjectives.update(batch_adjectives)
        adverbs.update(batch_adverbs)

    return adjectives, adverbs

# Record start time
start_time = time.time()

# Process sentences
batch_size = 4096
superlative_adjectives, superlative_adverbs = process_sentences(sentences, batch_size)

# Record end time
end_time = time.time()

# Calculate execution time
execution_time = end_time - start_time

# Print execution time
print("\nExecution Time:", execution_time, "seconds")

df_webtext_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_webtext_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

print("Superlative Adjectives:")
print(df_webtext_superlative_adjectives.head(20))

print("\nSuperlative Adverbs:")
print(df_webtext_superlative_adverbs.head(20))

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!



Execution Time: 14300.987951278687 seconds
Superlative Adjectives:
               Word
0      most closely
1      most private
2       most senior
3            newest
4         southwest
5     least another
6         least  it
7   most persuasive
8    most remaining
9       most recent
10     most reserve
11         most for
12   most expensive
13         steepest
14         least we
15        most fall
16      most liquid
17    most rainfall
18        worst for
19       worst year

Superlative Adverbs:
                   Word
0          most closely
1   least provisionally
2         most commonly
3      most politically
4         most unlikely
5       least initially
6         most probably
7          least likely
8        most seriously
9     most economically
10         most notably
11     most effectively
12            most only
13           least july
14        most actively
15      least partially
16          most likely
17        most strongly
18              highest


In [None]:
output_dir = '/content/drive/My Drive/'
df_gutenberg_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_gutenberg_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

df_gutenberg_superlative_adjectives.to_csv(os.path.join(output_dir, 'gutenberg_superlative_adjectives.csv'), index=False)
df_gutenberg_superlative_adverbs.to_csv(os.path.join(output_dir, 'gutenberg_superlative_adverbs.csv'), index=False)

print("Superlative Adjectives saved to Google Drive.")
print("Superlative Adverbs saved to Google Drive.")

Superlative Adjectives saved to Google Drive.
Superlative Adverbs saved to Google Drive.


# Movie_Reviews

In [None]:
# Define superlative patterns
superlative_patterns = [
    (re.compile(r'\b\w+est\b'), 'ADJ'),
    (re.compile(r'\b\w+iest\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+\b'), 'ADJ'),
    (re.compile(r'\b(?:most|least|best|worst)\s+\w+ly\b'), 'ADV'),
    (re.compile(r'\b(?:foremost|hindmost|inmost|innermost|nethermost|outmost|outermost|topmost|undermost|upmost|uppermost|utmost|uttermost)\b'), 'ADJ'),
    (re.compile(r'\b(?:ablest|angriest|baldest|battiest|beadiest|bitterest|blackest|blandest|blankest|bleakest|blondest|bloodiest|bluest|bluntest|blurriest|boldest|bossiest|bounciest|brainiest|brashest|brassiest|bravest|brawniest|breeziest|briefest|brightest|briskest|broadest|brownest|bubbliest|bulkiest|bumpiest|burliest|bushiest|busiest|calmest|chattiest|cheapest|cheekiest|cheeriest|chewiest|chilliest|choicest|choppiest|chubbiest|chunkiest|clammiest|classiest|cleanest|clearest|cleverest|closest|cloudiest|clumsiest|coarsest|coldest|coolest|corniest|coziest|crabbiest|craftiest|crankiest|craziest|creakiest|creamiest|creepiest|crispest|crispiest|crudest|cruelest|crumbliest|crunchiest|crustiest|cuddliest|curliest|curviest|cutest|daffiest|daintiest|dampest|dandiest|darkest|deadliest|deepest|densest|dingiest|dirtiest|dizziest|dreamiest|dreariest|dressiest|driest|droopiest|drowsiest|dullest|dumbest|dustiest|earliest|easiest|edgiest|eeriest|emptiest|evilest|faintest|fairest|falsest|fanciest|fattest|faultiest|feeblest|fewest|ficklest|fiercest|fieriest|filmiest|filthiest|finest|firmest|fittest|flabbiest|flakiest|flashiest|flattest|flimsiest|floppiest|floweriest|fluffiest|foamiest|foggiest|fondest|foolhardiest|frailest|frankest|freakiest|freest|freshest|friendliest|frilliest|friskiest|frostiest|frothiest|fruitiest|frumpiest|fullest|funniest|furriest|fussiest|fuzziest|gabbiest|gaudiest|gauntest|gawkiest|gentlest|ghastliest|giddiest|glassiest|gloomiest|glossiest|goofiest|grainiest|grandest|gravest|greasiest|greatest|greediest|greenest|grimiest|grittiest|groggiest|grossest|grouchiest|grubbiest|gruffest|grumpiest|guiltiest|gustiest|gutsiest|hairiest|handiest|handsomest|happiest|hardest|hardiest|harshest|hastiest|haughtiest|haziest|healthiest|heartiest|heaviest|heftiest|highest|hippest|hoarsest|hollowest|homeliest|hottest|hugest|humblest|hungriest|huskiest|iciest|ickiest|itchiest|itty-bittiest|jazziest|jerkiest|jolliest|juiciest|kindest|kindliest|kingliest|knobbiest|knottiest|laciest|largest|laziest|leanest|lengthiest|lightest|likeliest|littlest|liveliest|loneliest|longest|loosest|loudest|lousiest|loveliest|lowest|lowliest|luckiest|lumpiest|maddest|meanest|meekest|mellowest|merriest|messiest|mightiest|mildest|mistiest|moistest|moldiest|moodiest|muddiest|muggiest|murkiest|mushiest|narrowest|nastiest|naughtiest|neatest|neediest|newest|nicest|niftiest|nimblest|noblest|noisiest|nosiest|numbest|nuttiest|obscurest|oddest|oiliest|oldest|orneriest|palest|paltriest|perkiest|pettiest|pinkest|plainest|pleasantest|pluckiest|plumpest|plushest|politest|poorest|portliest|prettiest|prickliest|primmest|prissiest|promptest|proudest|puffiest|puniest|purest|pushiest|quaintest|queasiest|queenliest|quickest|quietest|quirkiest|rainiest|rarest|rashest|raspiest|rattiest|rawest|reddest|remotest|richest|ripest|riskiest|ritziest|roomiest|rosiest|rottenest|roughest|roundest|rudest|rustiest|saddest|safest|saintliest|saltiest|sandiest|sanest|sappiest|sassiest|sauciest|scaliest|scantiest|scarcest|scariest|scraggliest|scrappiest|scratchiest|scrawniest|scruffiest|scummiest|securest|seediest|seemliest|serenest|severest|shabbiest|shadiest|shaggiest|shakiest|shallowest|sharpest|shiest|shiftiest|shiniest|shoddiest|shortest|showiest|shrewdest|shrillest|shyest|sickest|sickliest|silkiest|silliest|simplest|sincerest|sketchiest|skimpiest|skinniest|sleekest|sleepiest|slickest|sliest|slightest|slimiest|slimmest|slipperiest|sloppiest|slowest|smallest|smartest|smelliest|smoggiest|smokiest|smoothest|snappiest|sneakiest|snootiest|snottiest|snuggest|softest|soggiest|soonest|sorest|sorriest|sourest|sparsest|speediest|spiciest|spiffiest|spikiest|spookiest|spriest|spryest|squarest|squiggliest|stalest|starkest|stateliest|staunchest|steadiest|steepest|sternest|stickiest|stiffest|stillest|stingiest|stodgiest|stormiest|straggliest|straightest|strangest|strictest|strongest|stubbiest|stuffiest|sturdiest|subtlest|sulkiest|sunniest|surest|surliest|swankiest|sweatiest|sweetest|swiftest|tackiest|tallest|tamest|tangiest|tannest|tardiest|tartest|tastiest|tautest|teeniest|teensiest|teeny-tiniest|tersest|testiest|thickest|thinnest|thirstiest|thorniest|thriftiest|tidiest|tightest|timeliest|tiniest|toothiest|toughest|trashiest|trendiest|trickiest|trimmest|truest|trustiest|twitchiest|ugliest|unhappiest|unlikeliest|unluckiest|unruliest|vaguest|vainest|vilest|wackiest|wariest|warmest|wateriest|weakest|wealthiest|weariest|weediest|weirdest|wettest|whitest|wickedest|widest|wiggliest|wildest|windiest|wisest|wispiest|wittiest|wobbliest|wooziest|wordiest|worldliest|worthiest|wriest|wryest|yummiest|zaniest|zestiest|ablest|biggest|bravest|cleverest|fattest|greatest|hottest|kindest|noblest|saddest|smallest|sweetest|whitest|wisest|youngest)\b'), 'ADJ'),
    (re.compile(r'\b(?:most beautiful|most boring|most colorful|most comfortable|most complete|most cruel|most delicious|most difficult|most evil|most expensive|most famous|most foolish|most friendly|most generous|most important|most interesting|most modern|most nervous|most popular|most renowned|most tangled|most tilted|most tired|least energetic)\b'), 'ADJ')
]

common_superlatives = {
    'best', 'worst', 'furthest', 'farthest', 'least', 'most', 'latest', 'last', 'nearest', 'dearest'
}

# Get all sentences from the movie_reviews corpus
sentences = reuters.sents()

In [None]:
# Function to initialize Stanza pipeline
def initialize_pipeline():
    return stanza.Pipeline('en', processors='tokenize,pos,lemma', use_gpu=False)

# Function to process a batch of sentences
def process_sentence_batch(sentences_batch, nlp):
    batch_adjectives = set()
    batch_adverbs = set()
    for sentence in sentences_batch:
        sentence_text = ' '.join(sentence)
        sentence_text_no_punct = re.sub(r'[.,!?;:(){}\[\]\'"@#$%^&*+=|\\/<>\~`]', '', sentence_text)
        found_adjectives = set()
        found_adverbs = set()

        # Use regex patterns to find superlatives
        for pattern, pos in superlative_patterns:
            matches = pattern.findall(sentence_text_no_punct)
            if pos == 'ADJ':
                found_adjectives.update(match.lower() for match in matches)
            elif pos == 'ADV':
                found_adverbs.update(match.lower() for match in matches)

        # Match common superlatives
        found_adjectives.update(word for word in common_superlatives if word in sentence_text_no_punct.lower())

        # If no matches found, use Stanza to find superlatives
        if not found_adjectives and not found_adverbs:
            doc = nlp(sentence_text)
            for sent in doc.sentences:
                for word in sent.words:
                    if word.upos == 'ADJ' and word.feats and 'Degree=Sup' in word.feats:
                        found_adjectives.add(word.text.lower().replace(' ', '-'))
                    elif word.upos == 'ADV' and word.feats and 'Degree=Sup' in word.feats:
                        found_adverbs.add(word.text.lower().replace(' ', '-'))

        batch_adjectives.update(found_adjectives)
        batch_adverbs.update(found_adverbs)

    return batch_adjectives, batch_adverbs

# Function to process all sentences
def process_sentences(sentences, batch_size):
    adjectives = set()
    adverbs = set()

    nlp = initialize_pipeline()

    num_batches = len(sentences) // batch_size + (1 if len(sentences) % batch_size != 0 else 0)
    for i in range(num_batches):
        batch = sentences[i*batch_size:(i+1)*batch_size]
        batch_adjectives, batch_adverbs = process_sentence_batch(batch, nlp)
        adjectives.update(batch_adjectives)
        adverbs.update(batch_adverbs)

    return adjectives, adverbs

# Record start time
start_time = time.time()

# Process sentences
batch_size = 4096
superlative_adjectives, superlative_adverbs = process_sentences(sentences, batch_size)

# Record end time
end_time = time.time()

# Calculate execution time
execution_time = end_time - start_time

# Print execution time
print("\nExecution Time:", execution_time, "seconds")

df_movie_reviews_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_movie_reviews_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

print("Superlative Adjectives:")
print(df_movie_reviews_superlative_adjectives.head(20))

print("\nSuperlative Adverbs:")
print(df_movie_reviews_superlative_adverbs.head(20))

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!



Execution Time: 13769.225402593613 seconds
Superlative Adjectives:
               Word
0      most closely
1      most private
2       most senior
3            newest
4         southwest
5     least another
6         least  it
7   most persuasive
8    most remaining
9       most recent
10     most reserve
11         most for
12   most expensive
13         steepest
14         least we
15        most fall
16      most liquid
17    most rainfall
18        worst for
19       worst year

Superlative Adverbs:
                   Word
0          most closely
1   least provisionally
2         most commonly
3      most politically
4         most unlikely
5       least initially
6         most probably
7          least likely
8        most seriously
9     most economically
10         most notably
11     most effectively
12            most only
13           least july
14        most actively
15      least partially
16          most likely
17        most strongly
18              highest


In [None]:

output_dir = '/content/drive/My Drive/'
df_movie_reviews_superlative_adjectives = pd.DataFrame(list(superlative_adjectives), columns=['Word'])
df_movie_reviews_superlative_adverbs = pd.DataFrame(list(superlative_adverbs), columns=['Word'])

df_movie_reviews_superlative_adjectives.to_csv(os.path.join(output_dir, 'movie_reviews_superlative_adjectives.csv'), index=False)
df_movie_reviews_superlative_adverbs.to_csv(os.path.join(output_dir, 'movie_reviews_superlative_adverbs.csv'), index=False)

print("Superlative Adjectives saved to Google Drive.")
print("Superlative Adverbs saved to Google Drive.")

Superlative Adjectives saved to Google Drive.
Superlative Adverbs saved to Google Drive.
