In [1]:
import json
import os
import subprocess
import sys
import zipfile

import nltk
import requests
import spacy
from tqdm import tqdm

In [2]:
# Make sure the output folders for lexicons and corpora exist before any
# download cell writes into them (no error if they are already there).
for out_dir in ("../external_datasets/lexicons", "../external_datasets/corpora"):
    os.makedirs(out_dir, exist_ok=True)

In [3]:
# Fetch the hedge-word lexicon and store a local copy; each line of the
# response is counted as one entry in the summary message below.
HEDGES_URL = "https://raw.githubusercontent.com/words/hedges/master/data.txt"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(HEDGES_URL, timeout=30)
# Fail loudly on 4xx/5xx so an HTTP error body is never saved as the lexicon.
r.raise_for_status()
with open("../external_datasets/lexicons/hedges.txt", "w", encoding="utf-8") as f:
    f.write(r.text)
print("Downloaded hedge lexicon (hedges.txt) with",
      len(r.text.splitlines()), "entries.")

Downloaded hedge lexicon (hedges.txt) with 193 entries.


In [4]:
# NLTK resources fetched by this notebook: two models (punkt,
# averaged_perceptron_tagger) followed by the four corpora dumped later on.
NLTK_PACKAGES = (
    'punkt',
    'averaged_perceptron_tagger',
    'gutenberg',
    'brown',
    'reuters',
    'inaugural',
)

# nltk.download() reports a failed download through a False return value
# instead of raising, so collect the failures rather than printing an
# unconditional success message.
failed_packages = [pkg for pkg in NLTK_PACKAGES if not nltk.download(pkg)]

if failed_packages:
    print("WARNING: failed to download NLTK packages:", ", ".join(failed_packages))
else:
    print("Downloaded NLTK corpora (gutenberg, brown, reuters, inaugural).")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data] Downloading package inaugural to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\inaugural.zip.


Downloaded NLTK corpora (gutenberg, brown, reuters, inaugural).


In [5]:
from nltk.corpus import gutenberg, brown, reuters, inaugural

def dump_corpus(corpus, name, out_dir="../external_datasets/corpora"):
    """Concatenate every file of a corpus into a single ``<name>.txt`` file.

    Parameters
    ----------
    corpus
        Object exposing ``fileids()`` (iterable of file identifiers) and
        ``raw(fileid)`` (text of one file), e.g. an NLTK corpus reader.
    name : str
        Base name of the output file; ``.txt`` is appended.
    out_dir : str, optional
        Directory that receives the dump. Created if it does not exist, so
        the function no longer depends on an earlier cell having made it.

    Returns
    -------
    None
        The result is the file written to disk; a one-line confirmation is
        printed.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{name}.txt")
    with open(out_path, "w", encoding="utf-8") as fout:
        for fileid in corpus.fileids():
            fout.write(corpus.raw(fileid))
            # A blank line separates consecutive corpus files in the dump.
            fout.write("\n\n")
    print(f"Dumped {name} corpus to {out_path}.")


# Write one concatenated text file per downloaded NLTK corpus.
for corpus_name, corpus_reader in (
    ("gutenberg", gutenberg),
    ("brown", brown),
    ("reuters", reuters),
    ("inaugural", inaugural),
):
    dump_corpus(corpus_reader, corpus_name)

Dumped gutenberg corpus to ../external_datasets/corpora\gutenberg.txt.
Dumped brown corpus to ../external_datasets/corpora\brown.txt.
Dumped reuters corpus to ../external_datasets/corpora\reuters.txt.
Dumped inaugural corpus to ../external_datasets/corpora\inaugural.txt.


In [6]:
SPACY_MODEL = "en_core_web_sm"

try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps unrelated failures (and Ctrl-C) visible.
    # sys.executable installs into the interpreter running this kernel, which
    # a bare "python" on PATH is not guaranteed to be; check=True surfaces a
    # failed download instead of silently continuing.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", SPACY_MODEL],
        check=True,
    )
    nlp = spacy.load(SPACY_MODEL)

print("spaCy English model loaded for NER.")

spaCy English model loaded for NER.


In [7]:
def extract_features(text):
    """Summarise the named entities and token count of ``text``.

    The text is processed with the notebook-level ``nlp`` pipeline.

    Returns a dict with three keys:
      - ``entity_count``: number of entities found,
      - ``token_count``: number of tokens in the processed document,
      - ``entities``: list of entity strings, in document order.
    """
    parsed = nlp(text)

    entity_texts = []
    for span in parsed.ents:
        entity_texts.append(span.text)

    summary = {}
    summary["entity_count"] = len(entity_texts)
    # The document's length is its token count; no token list is needed.
    summary["token_count"] = len(parsed)
    summary["entities"] = entity_texts
    return summary

# Smoke-test the extractor on the first 1000 characters of 'austen-emma.txt'.
sample = gutenberg.raw('austen-emma.txt')[:1000]
sample_features = extract_features(sample)
print("Sample features:", sample_features)

Sample features: {'entity_count': 10, 'token_count': 223, 'entities': ['Jane Austen', '1816', 'Emma Woodhouse', 'nearly twenty-one years', 'two', 'Sixteen years', 'Taylor', 'Woodhouse', 'Emma', 'Taylor']}
