# Unit testing of LBD_02_data_preprocessing.ipynb with pytest and ipytest

Unit testing is a good practice in software development whose purpose is to improve the reliability and quality of code by verifying that the individual components (units) of a program work as expected. It makes the code more maintainable, scalable, and robust.

In this notebook we use the `pytest` library, a popular testing framework for Python that is best known for its simplicity and flexibility. `ipytest` is an extension of `pytest` designed specifically for use in Jupyter notebooks. It enables running `pytest` tests directly within a notebook, making it ideal for environments where data exploration, interactive analysis, and incremental development are common.

In [1]:
import logging

# Set up logging once for the notebook: INFO level and above, with each record
# rendered as "<timestamp>: <LEVEL> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s: %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [2]:
import import_ipynb
import LBD_01_data_acquisition
import LBD_02_data_preprocessing

importing Jupyter notebook from LBD_01_data_acquisition.ipynb
importing Jupyter notebook from LBD_02_data_preprocessing.ipynb


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bojan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bojan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import nltk
import numpy as np
import itertools
import pandas as pd
# import pickle
# import json
import spacy
from typing import List, Dict

In [4]:
import ipytest
ipytest.autoconfig()

In [19]:
# Cell 2: Write a test
def test_addition():
    """Trivial smoke test: checks that 1 + 1 equals 2."""
    total = 1 + 1
    assert total == 2

In [20]:
# Cell 3: Run tests
# The '-vv' string is handed to ipytest.run as an argument; presumably it is
# forwarded to pytest to request verbose, per-test reporting (see the ipytest docs).
ipytest.run('-vv')

# Reference:
# https://medium.com/@mefengl/using-pytest-in-jupyter-notebooks-a-practical-guide-1ba8e02af288

platform win32 -- Python 3.7.9, pytest-7.4.4, pluggy-1.2.0 -- c:\Work\LBD_book\venv\Scripts\python.exe
cachedir: .pytest_cache
rootdir: c:\Work\LBD_book\notebooks
[1mcollecting ... [0m

collected 6 items

t_4429ed46aa7242d5bc45bd4945dfb965.py::test_parametrized[0-0] [32mPASSED[0m[32m                         [ 16%][0m
t_4429ed46aa7242d5bc45bd4945dfb965.py::test_parametrized[1-0] [32mPASSED[0m[32m                         [ 33%][0m
t_4429ed46aa7242d5bc45bd4945dfb965.py::test_parametrized[2-2] [32mPASSED[0m[32m                         [ 50%][0m
t_4429ed46aa7242d5bc45bd4945dfb965.py::test_parametrized[3-2] [32mPASSED[0m[32m                         [ 66%][0m
t_4429ed46aa7242d5bc45bd4945dfb965.py::test_fixture [32mPASSED[0m[32m                                   [ 83%][0m
t_4429ed46aa7242d5bc45bd4945dfb965.py::test_addition [32mPASSED[0m[32m                                  [100%][0m



<ExitCode.OK: 0>

In [21]:
def my_func(x):
    """Return ``x`` floor-divided by 2 and then multiplied by 2.

    For integers this rounds ``x`` down to the nearest even number.
    """
    halves = x // 2
    return halves * 2

In [22]:
%%ipytest
# To execute tests, start the cell that contains them with the %%ipytest magic:

# define the tests

def test_my_func():
    """Check my_func on 0..3: each input must map to the expected even number."""
    cases = ((0, 0), (1, 0), (2, 2), (3, 2))
    for argument, wanted in cases:
        assert my_func(argument) == wanted

[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.01s[0m[0m


In [23]:
# To execute tests without IPython magics, call the ipytest.run function directly.
ipytest.run()

[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.01s[0m[0m


<ExitCode.OK: 0>

In [24]:
%%ipytest
#Using pytest fixtures - Common pytest features, such as fixtures and parametrize, are supported out of the box:

import pytest

@pytest.mark.parametrize(
    'input,expected',
    [(0, 0), (1, 0), (2, 2), (3, 2)],
)
def test_parametrized(input, expected):
    """Run once per (input, expected) pair and compare my_func's result."""
    outcome = my_func(input)
    assert outcome == expected

@pytest.fixture
def my_fixture():
    """Fixture that supplies the constant value 42 to tests requesting it."""
    value = 42
    return value
    
def test_fixture(my_fixture):
    assert my_fixture == 42   

[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                                        [100%][0m
[32m[32m[1m5 passed[0m[32m in 0.02s[0m[0m


In [6]:
# Test suite for the function LBD_02_data_preprocessing.do_clean_text(corpus, keep_list, remove_list)

def test_do_clean_text_empty_corpus():
    """An empty corpus with empty keep/remove lists must yield an empty list."""
    cleaned = LBD_02_data_preprocessing.do_clean_text([], [], [])
    assert cleaned == []

def test_do_clean_text_empty_strings_in_corpus():
    """Empty and whitespace-only documents must each come back as an empty string."""
    documents = ["", "   "]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], [])
    assert cleaned == ["", ""]

def test_do_clean_text_basic_cleaning():
    """Basic cleaning with empty keep/remove lists.

    The expected text is lowercased, without punctuation or repeated spaces,
    and without the one-letter word "a".
    """
    documents = ["This   is, a   sentence! And     another;    one."]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], [])
    assert cleaned == ["this is sentence and another one"]

def test_do_clean_text_basic_cleaning_foreign_language():
    """Basic cleaning of non-English (Slovenian) text with empty keep/remove lists.

    The expected text keeps the accented letters (lowercased), has no
    punctuation, and lacks the one-letter word "z".
    """
    documents = ["Tole je, stavek z ločili! Še eden stavek; in ponovno nov stavek. Posebni znaki čšž ČŠŽ."]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], [])
    assert cleaned == ["tole je stavek ločili še eden stavek in ponovno nov stavek posebni znaki čšž čšž"]

def test_do_clean_text_keep_list():
    """Words named in keep_list ("i", "a") must survive cleaning."""
    documents = ["This is, a sentence! I keep a book and remove the desk."]
    kept_words = ["i", "a"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, kept_words, [])
    assert cleaned == ["this is a sentence i keep a book and remove the desk"]

def test_do_clean_text_remove_list():
    """Words named in remove_list ("book", "the", "desk") must be absent from the result."""
    documents = ["This is, a sentence! Keep a book and remove the desk."]
    dropped_words = ["book", "the", "desk"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], dropped_words)
    assert cleaned == ["this is sentence keep and remove"]

def test_do_clean_text_keep_and_remove_list():
    """Combine keep_list and remove_list: "i" must stay, "book" and "desk" must go."""
    documents = ["This is, a sentence! I keep a book and remove the desk."]
    kept_words = ["i"]
    dropped_words = ["book", "desk"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, kept_words, dropped_words)
    assert cleaned == ["this is sentence i keep and remove the"]

def test_do_clean_text_numeric_values():
    """Numeric tokens in the corpus.

    The expected text drops the stand-alone numbers "123" and "456", keeps
    "789" (it is in keep_list), and keeps tokens that mix letters and digits.
    """
    documents = ["Hello123 123 world456 456, I would like to 789keep 789 this."]
    kept_words = ["789"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, kept_words, [])
    assert cleaned == ["hello123 world456 would like to 789keep 789 this"]

def test_do_clean_text_short_words():
    """Short words: "a" is expected to be dropped while "i" (in keep_list) stays."""
    documents = ["I saw a quick brown fox."]
    kept_words = ["i"]
    dropped_words = ["brown"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, kept_words, dropped_words)
    assert cleaned == ["i saw quick fox"]

def test_do_clean_text_mixed_case():
    """Mixed-case input: the expected text is lowercased and the remove_list word is gone."""
    documents = ["Python IS awesome!"]
    kept_words = ["python"]
    dropped_words = ["awesome"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, kept_words, dropped_words)
    assert cleaned == ["python is"]

# ipytest.config(rewrite_asserts=True)

# Execute the test functions defined in this notebook session.
ipytest.run()

[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                                   [100%][0m
[32m[32m[1m10 passed[0m[32m in 0.02s[0m[0m


<ExitCode.OK: 0>

In [24]:
from nltk.corpus import stopwords
# Build the stop-word set shown for illustration: NLTK's English stop words
# minus the interrogative ("wh-") words.
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
# Set difference, unlike set.remove in a loop, does not raise KeyError if one
# of the wh-words is ever missing from the NLTK list.
stop = set(stopwords.words('english')) - set(wh_words)
# Sort before printing so the displayed output is stable between runs.
print("The stop word list: ", sorted(stop))

The stow word list:  {'s', 'yourself', 'me', 'hadn', 'once', 'between', 'some', 'at', 'be', 'of', 'didn', 'by', 've', 'your', 'any', "hasn't", 'been', 'wouldn', 'did', "needn't", "you're", 'an', 'a', 'there', 'is', "shouldn't", 'doesn', 'most', 'down', 'before', 'against', 'his', 'each', 'being', 'itself', 'only', 'he', 'after', "you'd", 'those', 'on', 'has', "isn't", 'our', 'mightn', 'them', 'same', 'while', 'they', 't', 'through', 'above', 'but', 'below', 'as', 'ma', 're', "shan't", 'themselves', 'just', 'won', "haven't", "don't", 'were', 'ourselves', 'few', 'my', "you'll", 'until', 'here', 'd', 'do', 'should', 'to', "that'll", 'with', 'or', 'all', 'haven', "she's", 'it', 'mustn', 'have', 'more', 'no', 'himself', 'nor', "you've", "couldn't", 'shan', "mustn't", 'having', 'we', 'had', 'weren', 'ain', 'her', "it's", 'shouldn', 'yourselves', 'such', 'you', 'further', "weren't", 'too', 'don', 'doing', 'can', 'theirs', "doesn't", "won't", "wasn't", "wouldn't", 'was', 'o', 'herself', "aren'

In [25]:
# Test suite for the function LBD_02_data_preprocessing.do_remove_stopwords(corpus)
def test_do_remove_stopwords_empty_corpus():
    """Cleaning and then stop-word filtering an empty corpus must yield an empty list."""
    cleaned = LBD_02_data_preprocessing.do_clean_text([], [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == []

def test_do_remove_stopwords_no_stopwords_in_corpus():
    """Documents without stop words must come back as token lists with every word kept."""
    documents = ["Python programming", "Machine Learning"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["python", "programming"], ["machine", "learning"]]

def test_do_remove_stopwords_all_stopwords_in_corpus():
    """Documents made only of stop words must each reduce to an empty token list."""
    documents = ["the a in on", "and but if or"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [[], []]

def test_do_remove_stopwords_mixed_corpus_with_stopwords():
    """Mixed documents: only the non-stop-word tokens are expected to remain.

    "i" is kept by the cleaning step (keep_list) but is still expected to be
    absent after stop-word removal.
    """
    documents = ["This is a test", "I am learning Python"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, ["i"], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["test"], ["learning", "python"]]

def test_do_remove_stopwords_wh_words_preserved():
    """Wh-words ("who", "why", "where") must survive stop-word removal."""
    documents = ["Who are you", "Why is this happening", "Where is Python used"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["who"], ["why", "happening"], ["where", "python", "used"]]

def test_do_remove_stopwords_case_insensitivity():
    """Mixed-case input: stop words are expected to be removed regardless of their original case."""
    documents = ["This is A Test", "Who Knows Why"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["test"], ["who", "knows", "why"]]

def test_do_remove_stopwords_numbers_and_symbols():
    """Input containing numbers and symbols.

    After cleaning plus stop-word removal, only the alphabetic non-stop-word
    tokens ("dollars", "python") are expected; "100" and "#1" must not appear.
    """
    documents = ["100 dollars", "Python is #1"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(documents, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["dollars"], ["python"]]

# Execute the test functions defined in this notebook session.
ipytest.run()


[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                             [100%][0m
[32m[32m[1m16 passed[0m[32m in 0.03s[0m[0m


<ExitCode.OK: 0>