# Seminar 6: Application Configuration & CLI Design + Dependencies, Environments & Packaging

## Application Configuration

### Environment

Environment variables are often used in applications for configuration.
They can be accessed through os.environ

In [92]:
import os
dict(os.environ).keys()

dict_keys(['PATH', 'PYENV_SHELL', 'LC_MEASUREMENT', 'XAUTHORITY', 'LC_TELEPHONE', 'XDG_DATA_DIRS', 'GDMSESSION', 'MANDATORY_PATH', 'DBUS_SESSION_BUS_ADDRESS', 'GATEWAY_VM_OPTIONS', 'RUSTROVER_VM_OPTIONS', 'DEFAULTS_PATH', 'PS1', 'XDG_CURRENT_DESKTOP', 'RIDER_VM_OPTIONS', 'LC_PAPER', 'SESSION_MANAGER', 'DEVECOSTUDIO_VM_OPTIONS', 'LOGNAME', 'PWD', 'STUDIO_VM_OPTIONS', 'LANGUAGE', 'GJS_DEBUG_TOPICS', 'PYTHONPATH', 'SHELL', 'LC_ADDRESS', 'GIO_LAUNCHED_DESKTOP_FILE', 'PYENV_ROOT', 'GNOME_DESKTOP_SESSION_ID', 'DATASPELL_VM_OPTIONS', 'GTK_MODULES', 'VIRTUAL_ENV', 'XDG_SESSION_PATH', 'LC_ALL', 'DOTNET_BUNDLE_EXTRACT_BASE_DIR', 'XDG_SESSION_DESKTOP', 'SHLVL', 'LC_IDENTIFICATION', 'DATAGRIP_VM_OPTIONS', 'LC_MONETARY', 'PYCHARM_VM_OPTIONS', 'WEBSTORM_VM_OPTIONS', 'JAVA_HOME', 'CLION_VM_OPTIONS', 'JETBRAINSCLIENT_VM_OPTIONS', 'XDG_CONFIG_DIRS', 'LANG', 'XDG_SEAT_PATH', 'XDG_SESSION_ID', 'XDG_SESSION_TYPE', 'ANT_HOME', 'DISPLAY', 'GOLAND_VM_OPTIONS', 'IDEA_VM_OPTIONS', 'RUBYMINE_VM_OPTIONS', 'AQUA_

In [9]:
# Some of the most common environment variables (in linux)
print('PATH:', os.environ['PATH']) # all executables accessible from anywhere
print('PYTHONPATH:', os.environ['PYTHONPATH']) # python source path
print('PWD:', os.environ['PWD']) # present working directory
print('HOME:', os.environ['HOME']) # home directory
print('USER:', os.environ['USER']) # user
print('Non existing: ', os.environ['NON_EXISTING']) # non existing key

PATH: /home/fullfix/Documents/Msu/DataSpell/python3.10/bin:/home/fullfix/.pyenv/shims:/home/fullfix/.pyenv/bin:/opt/gcc-arm-none-eabi-9-2020-q2-update/bin:/home/fullfix/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/fullfix/.dotnet/tools:/opt/jdk-14/bin:/home/fullfix/.local/share/JetBrains/Toolbox/scripts
PYTHONPATH: /home/fullfix/Documents/Msu/DataSpell/PythonSeminars
PWD: /home/fullfix/Documents/Msu/DataSpell/PythonSeminars
HOME: /home/fullfix
USER: fullfix


KeyError: 'NON_EXISTING'

In [14]:
# bad example
try:
    os.environ['NON_EXISTING']
except KeyError:
    ...

# good example
value = os.environ.get('NON_EXISTING', None)
if value is None:
    ...

Let's try passing environment variable DATASET_URL to a python script and look at different ways to do it.

In [25]:
%%writefile connect_dataset.py
import os


def main():
    dataset_url = os.environ.get('DATASET_URL', None)
    print('DATASET_URL: ', dataset_url)

if __name__ == '__main__':
    main()

Overwriting connect_dataset.py


In [33]:
!python connect_dataset.py

DATASET_URL:  None


In [45]:
# from terminal:
# DATASET_URL=<someurl> python connect_dataset.py

# from jupyter notebook:
import os
os.environ['DATASET_URL'] = "<someurl>"
!python connect_dataset.py

DATASET_URL:  <someurl>


The standard way to store environment variables is in ".env" files (which are always put in .gitignore for security).
Dotnet package allows working with such files easily 

In [104]:
%%capture
!pip install python-dotenv

In [35]:
%%writefile .env
DATASET_URL=somenewdataseturl

Writing .env


In [46]:
%%writefile env_check.py
from dotenv import load_dotenv
import os

print('DATASET_URL:', os.environ.get('DATASET_URL'))
load_dotenv('.env')
print('DATASET_URL:', os.environ.get('DATASET_URL'))

Overwriting env_check.py


In [47]:
!python env_check.py

DATASET_URL: <someurl>
DATASET_URL: <someurl>


Oops: when env variable is already declared, it is not overwritten by dotenv. Let's delete the old one first.

In [48]:
del os.environ['DATASET_URL']

In [49]:
!python env_check.py

DATASET_URL: None
DATASET_URL: somenewdataseturl


### Other ways to load configuration

#### Yaml

In [52]:
%%writefile config.yaml
database:
  url: "postgresql://user:pass@localhost:5432/db"
  timeout: 30
logging:
  level: "DEBUG"

Writing config.yaml


In [105]:
%%capture
!pip install pyyaml

In [54]:
import yaml

with open("config.yaml", 'r') as f:
    config = yaml.safe_load(f)

print(config)
print(config['database']['url'])

{'database': {'url': 'postgresql://user:pass@localhost:5432/db', 'timeout': 30}, 'logging': {'level': 'DEBUG'}}
postgresql://user:pass@localhost:5432/db


#### Json

In [55]:
%%writefile config.json
{
    "database": {
    "url": "postgresql://user:pass@localhost:5432/db",
    "timeout": 30
    },
    "logging": {
        "level": "DEBUG"
    }
}

Writing config.json


In [56]:
import json

with open('config.json', 'r') as f:
    config = json.load(f)

print(config)
print(config['database']['url'])

{'database': {'url': 'postgresql://user:pass@localhost:5432/db', 'timeout': 30}, 'logging': {'level': 'DEBUG'}}
postgresql://user:pass@localhost:5432/db


#### INI

In [58]:
%%writefile config.ini
[database]
url = postgresql://user:pass@localhost:5432/db
timeout = 30

[logging]
level = DEBUG

Writing config.ini


In [60]:
from configparser import ConfigParser

config = ConfigParser()
config.read("config.ini")

print(config)
print(config.get("database", "url"))
print(config.get("logging", "level"))

<configparser.ConfigParser object at 0x77925fac5510>
postgresql://user:pass@localhost:5432/db
DEBUG


# Exercise: Analyzing Words in a Large Text File

We want to write an analyzer that processes given (possibly large) english text file at word level and extracts the following statistics:

1) Total Number of words and unique words (form-independent)
2) Number of words and unique words for each part of speech
3) Number of characters in the longest noun
4) Average "positivity" and "negativity" for the words

Requirement: optional progress bar so that estimated time for processing is visible

An example of the input text files is shakespeare texts, downloaded from github (5 MB)

In [37]:
!wget -O shakespeare.txt "https://gist.githubusercontent.com/blakesanie/dde3a2b7e698f52f389532b4b52bc254/raw/76fe1b5e9efcf0d2afdfd78b0bfaa737ad0a67d3/shakespeare.txt"

--2025-08-09 11:27:06--  https://gist.githubusercontent.com/blakesanie/dde3a2b7e698f52f389532b4b52bc254/raw/76fe1b5e9efcf0d2afdfd78b0bfaa737ad0a67d3/shakespeare.txt
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5436475 (5.2M) [text/plain]
Saving to: ‘shakespeare.txt’


2025-08-09 11:27:11 (1.40 MB/s) - ‘shakespeare.txt’ saved [5436475/5436475]



In [61]:
!head shakespeare.txt

  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,


In [62]:
# Bad example: don't do this with very large files
with open("shakespeare.txt") as f:
    content = f.read()
    ...

In [63]:
len(content) # Very large

5436475

In [64]:
del content # let's delete it to free our memory

For the analysis of words, we are going to use spaCy and nltk --- very popular natural language processing libraries.

In [100]:
%%capture
!pip install spacy nltk

In [102]:
%%capture
!python -m spacy download en_core_web_sm

In [103]:
%%capture
!python -m nltk.downloader vader_lexicon

In [66]:
# Example of using spacy
import spacy

text = "usually , he would be tearing around the living room , playing with his toys ."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
for token in doc:
    print(f"{token.text:<10} | POS: {token.pos_:<8} | Explanation: {spacy.explain(token.pos_)}")

usually    | POS: ADV      | Explanation: adverb
,          | POS: PUNCT    | Explanation: punctuation
he         | POS: PRON     | Explanation: pronoun
would      | POS: AUX      | Explanation: auxiliary
be         | POS: AUX      | Explanation: auxiliary
tearing    | POS: VERB     | Explanation: verb
around     | POS: ADP      | Explanation: adposition
the        | POS: DET      | Explanation: determiner
living     | POS: NOUN     | Explanation: noun
room       | POS: NOUN     | Explanation: noun
,          | POS: PUNCT    | Explanation: punctuation
playing    | POS: VERB     | Explanation: verb
with       | POS: ADP      | Explanation: adposition
his        | POS: PRON     | Explanation: pronoun
toys       | POS: NOUN     | Explanation: noun
.          | POS: PUNCT    | Explanation: punctuation


In [70]:
# Example of using nltk for sentiment analysis
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

print(sia.polarity_scores(text))

for word in ['ok', 'he', 'good', 'bad', 'strong', 'weak', 'terrible', 'beautiful', 'awful']:
    print(f"Sentiment for word {word}: {sia.polarity_scores(word)}")

{'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compound': 0.2023}
Sentiment for word ok: {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.296}
Sentiment for word he: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Sentiment for word good: {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4404}
Sentiment for word bad: {'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.5423}
Sentiment for word strong: {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.5106}
Sentiment for word weak: {'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.4404}
Sentiment for word terrible: {'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.4767}
Sentiment for word beautiful: {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.5994}
Sentiment for word awful: {'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.4588}


First, let's write word data extraction code: enumerate_words(file_path: str)
token_to_word function is already given that processes spaCy's Token and extracts all required data.

In [72]:
from dataclasses import dataclass
from typing import Generator
import spacy
from spacy.tokens import Token
import os
import re


@dataclass(frozen=True)
class Word:
    content: str
    pos_tag: str
    lemma: str | None
    positivity: float
    negativity: float


def token_to_word(token: Token) -> Word | None:
    if token.is_punct:
        return None
    polarity_scores = sia.polarity_scores(token.text)
    return Word(
        content=token.text,
        pos_tag=token.pos_,
        lemma=token.lemma_ if not token.is_punct else None,
        positivity=polarity_scores['pos'],
        negativity=polarity_scores['neg']
    )

In [52]:
def enumerate_words_version1(file_path: str) -> list[Word]:
    nlp = spacy.load("en_core_web_sm")
    
    result = []
    with open(file_path, 'r') as f:
        for line in f:
            for token in nlp(line):
                word = token_to_word(token)
                if word is not None:
                    result.append(word)
    return result

## What's wrong with the first version?

1) Memory inefficient: we don't need to store the list of all words, if we are only doing a for loop.
2) It's better to process text and replace all punctuation marks and non-english letters with spaces.
3) No progress bar: how long will it take? Who knows...

In [None]:
from typing import Iterator

class WordIteratorVersion2:
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.nlp = spacy.load("en_core_web_sm")
        self.file_handle = None
        self.token_iterator = None
    
    def __iter__(self) -> Iterator[Word]:
        self.file_handle = open(self.file_path, 'r')
        return self
    
    def __next__(self) -> Word:
        while True:
            if self.token_iterator is None:
                current_line = self.file_handle.readline()
                
                if not current_line:
                    self.file_handle.close()
                    raise StopIteration
                
                processed_line = re.sub(r'[^a-zA-Z]', ' ', current_line)
                self.token_iterator = iter(self.nlp(processed_line))
            try:
                token = next(self.token_iterator)
                word = token_to_word(token)
                if word is not None:
                    return word
            except StopIteration:
                self.token_iterator = None
    
    def __del__(self):
        if self.file_handle and not self.file_handle.closed:
            self.file_handle.close()

In [79]:
i = 0
for word in WordIteratorVersion2("shakespeare.txt"):
    print(word)
    i += 1
    if i >= 50:
        break

Word(content='  ', pos_tag='SPACE', lemma='  ', positivity=0.0, negativity=0.0)
Word(content='From', pos_tag='ADP', lemma='from', positivity=0.0, negativity=0.0)
Word(content='fairest', pos_tag='ADJ', lemma='fair', positivity=0.0, negativity=0.0)
Word(content='creatures', pos_tag='NOUN', lemma='creature', positivity=0.0, negativity=0.0)
Word(content='we', pos_tag='PRON', lemma='we', positivity=0.0, negativity=0.0)
Word(content='desire', pos_tag='AUX', lemma='desire', positivity=1.0, negativity=0.0)
Word(content='increase', pos_tag='NOUN', lemma='increase', positivity=1.0, negativity=0.0)
Word(content=' ', pos_tag='SPACE', lemma=' ', positivity=0.0, negativity=0.0)
Word(content='  ', pos_tag='SPACE', lemma='  ', positivity=0.0, negativity=0.0)
Word(content='That', pos_tag='SCONJ', lemma='that', positivity=0.0, negativity=0.0)
Word(content='thereby', pos_tag='ADV', lemma='thereby', positivity=0.0, negativity=0.0)
Word(content='beauty', pos_tag='NOUN', lemma='beauty', positivity=1.0, nega

This method is VERY code-intensive. We can do it much easier with Generator

In [75]:
def enumerate_words_version2(file_path: str) -> Generator[Word, None, None]:
    nlp = spacy.load("en_core_web_sm")
    
    with open(file_path, 'r') as f:
        for line in f:
            processed_line = re.sub(r'[^a-zA-Z]', ' ', line)
            for token in nlp(processed_line):
                word = token_to_word(token)
                if word is not None:
                    yield word

In [80]:
i = 0
for word in enumerate_words_version2("shakespeare.txt"):
    print(word)
    i += 1
    if i >= 50:
        break

Word(content='  ', pos_tag='SPACE', lemma='  ', positivity=0.0, negativity=0.0)
Word(content='From', pos_tag='ADP', lemma='from', positivity=0.0, negativity=0.0)
Word(content='fairest', pos_tag='ADJ', lemma='fair', positivity=0.0, negativity=0.0)
Word(content='creatures', pos_tag='NOUN', lemma='creature', positivity=0.0, negativity=0.0)
Word(content='we', pos_tag='PRON', lemma='we', positivity=0.0, negativity=0.0)
Word(content='desire', pos_tag='AUX', lemma='desire', positivity=1.0, negativity=0.0)
Word(content='increase', pos_tag='NOUN', lemma='increase', positivity=1.0, negativity=0.0)
Word(content=' ', pos_tag='SPACE', lemma=' ', positivity=0.0, negativity=0.0)
Word(content='  ', pos_tag='SPACE', lemma='  ', positivity=0.0, negativity=0.0)
Word(content='That', pos_tag='SCONJ', lemma='that', positivity=0.0, negativity=0.0)
Word(content='thereby', pos_tag='ADV', lemma='thereby', positivity=0.0, negativity=0.0)
Word(content='beauty', pos_tag='NOUN', lemma='beauty', positivity=1.0, nega

In [None]:
for word in enumerate_words_version2("shakespeare.txt"):
    ... # How long?

## Adding tqdm

In [73]:
from tqdm.notebook import tqdm
def enumerate_words_version3(file_path: str, pbar: bool = False) -> Generator[Word, None, None]:
    nlp = spacy.load("en_core_web_sm")
    
    if pbar:
        with open(file_path, 'r') as f:
            line_count = sum(1 for line in f)
    
    with open(file_path, 'r') as f:
        line_iterable = tqdm(f, total=line_count) if pbar else f
        for line in line_iterable:
            processed_line = re.sub(r'[^a-zA-Z]', ' ', line)
            for token in nlp(processed_line):
                word = token_to_word(token)
                if word is not None:
                    yield word

In [106]:
try:
    for word in enumerate_words_version3("shakespeare.txt", pbar=True):
        ...
except KeyboardInterrupt:
    print("Interrupted")

  0%|          | 0/124185 [00:00<?, ?it/s]

Interrupted


## Improving speed

Now it takes about 7 minutes for our 5 MB file. Can we make it faster?

The answer is yes!

First, let's recognise the bottleneck of our function. It is nlp(processed_line), which does some complex stuff. Calling it with a larger string is faster than calling multiple times on small strings. So, let's implement batches.

In [81]:
from io import StringIO
import math


def enumerate_batches(file_path: str, batch_size: int) -> Generator[str, None, None]:
    with open(file_path, 'r') as f:
        builder = StringIO()
        counter = 0
        for line in f:
            builder.write(line)
            builder.write(" ")
            counter += len(line)
            if counter >= batch_size:
                yield builder.getvalue()
                builder = StringIO()
                counter = 0
    # important: do not forget the ending when batching
    if counter > 0:
        yield builder.getvalue()


def enumerate_words_version4(file_path: str, batch_size: int, pbar: bool = False) -> Generator[Word, None, None]:
    batch_iterable = enumerate_batches(file_path, batch_size)
    if pbar:
        total_size = os.path.getsize(file_path)
        batch_count = int(math.ceil(total_size / batch_size))
        batch_iterable = tqdm(enumerate_batches(file_path, batch_size), total=batch_count)
    for batch in batch_iterable:
        processed_batch = re.sub(r'[^a-zA-Z]', ' ', batch)
        for token in nlp(processed_batch):
            word = token_to_word(token)
            if word is not None:
                yield word

In [108]:
try:
    for word in enumerate_words_version4("shakespeare.txt", batch_size=10000, pbar=True):
        ...
except KeyboardInterrupt:
    print('Interrupted')

  0%|          | 0/544 [00:00<?, ?it/s]

Interrupted


Nice! Now it's 2 minutes. But, luckily, there is a function nlp.pipe, which does practically the same as we do, but also includes multiprocessing out of the box. So, we can pass our processed lines generator in it.

Another optimization is: we do not need Named Entity Recognition (ner), so we can disable that component, thus improving the speed slightly. 

In [83]:
from tqdm.notebook import tqdm

def enumerate_processed_lines(file_path: str, pbar: bool = False) -> Generator[str, None, None]:
    if pbar:
        with open(file_path, 'r') as f:
            line_count = sum(1 for line in f)
    
    with open(file_path, 'r') as f:
        line_iterable = tqdm(f, total=line_count) if pbar else f
        for line in line_iterable:
            processed_line = re.sub(r'[^a-zA-Z]', ' ', line)
            yield processed_line


def enumerate_words(
        file_path: str, 
        batch_size: int = 1000, 
        n_process: int = 4, 
        pbar: bool = False
) -> Generator[Word, None, None]:
    nlp = spacy.load("en_core_web_sm")
    
    for doc in nlp.pipe(enumerate_processed_lines(file_path, pbar), batch_size=batch_size, n_process=n_process, disable=['ner']):
        for token in doc:
            word = token_to_word(token)
            if word is not None:
                yield word

In [111]:
try:
    for word in enumerate_words("shakespeare.txt", pbar=True):
        ...
except KeyboardInterrupt:
    print('Interrupted')

Finally, it's about 1 minute. And we're done with enumerate_words function.

The next step is calculating word statistics, which is pretty straightforward.

In [98]:
total_word_count = 0
unique_words = set()
pos_data = {}
longest_noun_word = ""
total_positivity = 0
total_negativity = 0

for word in enumerate_words("shakespeare.txt", pbar=True):
    total_word_count += 1
    if word.pos_tag not in pos_data:
        pos_data[word.pos_tag] = { 'total': 0, 'unique': set() }
    word_pos_data = pos_data[word.pos_tag]
    word_pos_data['total'] += 1
    if word.lemma is not None:
        unique_words.add(word.lemma)
        word_pos_data['unique'].add(word.lemma)
    if word.pos_tag == 'NOUN' and len(word.content) > len(longest_noun_word):
        longest_noun_word = word.content
    total_positivity += word.positivity
    total_negativity += word.negativity

  0%|          | 0/124185 [00:00<?, ?it/s]

In [99]:
print(f"Total words: {total_word_count}")
print(f"Unique words: {len(unique_words)}")
print("-----------------")
for pos_tag, data in pos_data.items():
    print(f"{spacy.explain(pos_tag)}: Total {data['total']}, Unique {len(data['unique'])}")
print("-----------------")
print(f"Total positivity: {total_positivity}, fraction: {total_positivity / total_word_count:.3f}")
print(f"Total negativity: {total_negativity}, fraction: {total_negativity / total_word_count:.3f}")
print(f"Longest noun ({len(longest_noun_word)} chars): {longest_noun_word}")

Total words: 1255638
Unique words: 22181
-----------------
space: Total 328512, Unique 59
adposition: Total 81663, Unique 170
adjective: Total 54843, Unique 3969
noun: Total 171204, Unique 10053
pronoun: Total 144726, Unique 113
auxiliary: Total 56471, Unique 195
subordinating conjunction: Total 23992, Unique 65
adverb: Total 42796, Unique 1165
proper noun: Total 93483, Unique 7862
verb: Total 120710, Unique 5088
coordinating conjunction: Total 37638, Unique 25
determiner: Total 59433, Unique 43
particle: Total 24256, Unique 7
numeral: Total 4386, Unique 41
other: Total 5574, Unique 121
interjection: Total 5941, Unique 119
symbol: Total 2, Unique 2
punctuation: Total 8, Unique 2
-----------------
Total positivity: 47714.0, fraction: 0.038
Total negativity: 35451.0, fraction: 0.028
Longest noun (16 chars): incomprehensible


# Wrapping Word Analyzer into an Package

Now, let's create a package with CLI interface for our word analyzing function.

## Main file

First, let's modify our code a little bit, to return data class with all the statistics

In [85]:
%%writefile worddata.py
import re
from dataclasses import dataclass, field
from tqdm import tqdm
import spacy
from typing import Set, Generator
from spacy.tokens import Token
from nltk.sentiment import SentimentIntensityAnalyzer


@dataclass(frozen=True)
class Word:
    content: str
    pos_tag: str
    lemma: str | None
    positivity: float
    negativity: float


@dataclass
class POSFrequencyData:
    total: int = 0
    unique: Set[str] = field(default_factory=lambda: set())


@dataclass
class WordData:
    total_word_count: int = 0
    unique_words: Set[str] = field(default_factory=lambda: set())
    pos_frequency_data: dict[str, POSFrequencyData] = field(default_factory=lambda: {})
    longest_noun_word: str = ""
    total_positivity: float = 0
    total_negativity: float = 0

    def __str__(self) -> str:
        result = f"Total words: {self.total_word_count}\n"
        result += f"Unique words: {len(self.unique_words)}\n"
        result += "-----------------\n"
        for pos_tag, data in self.pos_frequency_data.items():
            result += f"{spacy.explain(pos_tag)}: Total {data.total}, Unique {len(data.unique)}\n"
        result += "-----------------\n"
        result += f"Total positivity: {self.total_positivity}, fraction: {self.total_positivity / self.total_word_count:.3f}\n"
        result += f"Total negativity: {self.total_negativity}, fraction: {self.total_negativity / self.total_word_count:.3f}\n"
        result += f"Longest noun ({len(self.longest_noun_word)} chars): {self.longest_noun_word}\n"
        return result


def token_to_word(token: Token, sia: SentimentIntensityAnalyzer) -> Word | None:
    if token.is_punct:
        return None
    polarity_scores = sia.polarity_scores(token.text)
    return Word(
        content=token.text,
        pos_tag=token.pos_,
        lemma=token.lemma_ if not token.is_punct else None,
        positivity=polarity_scores['pos'],
        negativity=polarity_scores['neg']
    )


def enumerate_processed_lines(file_path: str, pbar: bool = False) -> Generator[str, None, None]:
    if pbar:
        with open(file_path, 'r') as f:
            line_count = sum(1 for line in f)

    with open(file_path, 'r') as f:
        line_iterable = tqdm(f, total=line_count) if pbar else f
        for line in line_iterable:
            processed_line = re.sub(r'[^a-zA-Z]', ' ', line)
            yield processed_line


def enumerate_words(
        file_path: str,
        batch_size: int = 1000,
        n_process: int = 4,
        pbar: bool = False
) -> Generator[Word, None, None]:
    nlp = spacy.load("en_core_web_sm")
    sia = SentimentIntensityAnalyzer()

    for doc in nlp.pipe(enumerate_processed_lines(file_path, pbar), batch_size=batch_size, n_process=n_process, disable=['ner']):
        for token in doc:
            word = token_to_word(token, sia)
            if word is not None:
                yield word


def get_word_data(
        file_path: str,
        batch_size: int = 1000,
        n_process: int = 4,
        pbar: bool = False
) -> WordData:
    result = WordData()

    for word in enumerate_words(file_path, batch_size=batch_size, n_process=n_process, pbar=pbar):
        result.total_word_count += 1
        if word.pos_tag not in result.pos_frequency_data:
            result.pos_frequency_data[word.pos_tag] = POSFrequencyData()
        word_pos_data = result.pos_frequency_data[word.pos_tag]
        word_pos_data.total += 1
        if word.lemma is not None:
            result.unique_words.add(word.lemma)
            word_pos_data.unique.add(word.lemma)
        if word.pos_tag == 'NOUN' and len(word.content) > len(result.longest_noun_word):
            result.longest_noun_word = word.content
        result.total_positivity += word.positivity
        result.total_negativity += word.negativity

    return result


Overwriting worddata.py


## CLI

We're gonna write 2 versions of CLI interface: using argparser and using click, which are 2 most famous python CLI libraries.

### Argparser example

In [86]:
%%writefile cli_argparser.py
import argparse
import os

from worddata import get_word_data

def main():
    parser = argparse.ArgumentParser(description="Word analyzer")

    parser.add_argument(
        "file_path",
        type=str,
        help="Path to input file"
    )

    # Optional arguments with defaults
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1000,
        help="Number of items per batch"
    )

    parser.add_argument(
        "--n-process",
        type=int,
        default=4,
        help="Number of parallel processes"
    )

    parser.add_argument(
        "--no-pbar",
        action="store_false",
        dest="pbar",
        help="Disable progress bar"
    )

    args = parser.parse_args()

    if not os.path.exists(args.file_path):
        parser.error(f"File not found: {args.file_path}")

    word_data = get_word_data(
        file_path=args.file_path,
        batch_size=args.batch_size,
        n_process=args.n_process,
        pbar=args.pbar
    )

    print(word_data)

if __name__ == "__main__":
    main()

Overwriting cli_argparser.py


### Click example

In [87]:
%%writefile cli_click.py
import click
import os
from worddata import get_word_data


@click.command()
@click.argument(
    "file_path",
    type=click.Path(exists=True, dir_okay=False, readable=True),
    required=True
)
@click.option(
    "--batch-size",
    type=int,
    default=1000,
    show_default=True,
    help="Number of items per batch"
)
@click.option(
    "--n-process",
    type=int,
    default=4,
    show_default=True,
    help="Number of parallel processes"
)
@click.option(
    "--no-pbar",
    is_flag=True,
    default=False,
    help="Disable progress bar"
)
def main(file_path: str, batch_size: int, n_process: int, no_pbar: bool):
    click.echo("Start processing")
    word_data = get_word_data(
        file_path=file_path,
        batch_size=batch_size,
        n_process=n_process,
        pbar=not no_pbar
    )

    click.echo("Word Analysis Results:")
    click.echo(click.style(str(word_data), fg='green'))


if __name__ == "__main__":
    main()


Overwriting cli_click.py


## Wrapping it all into a package

*This will be shown externally in word_analyzer directory*

## Installing our package into another environment

*Will be shown in another notebook*