Skip to content

Commit

Permalink
Merge pull request #2 from AndreaSottana/improve_test_coverage
Browse files Browse the repository at this point in the history
improve test coverage
  • Loading branch information
AndreaSottana committed Jun 29, 2020
2 parents e523e78 + 5278d6e commit 44e5fa6
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 0 deletions.
35 changes: 35 additions & 0 deletions tests/test_process_user_queries.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import pytest
import logging
import numpy as np
import pandas as pd
import pandas.util.testing as pdt
from pdf2emb_nlp.process_user_queries import query_embeddings


Expand Down Expand Up @@ -43,6 +46,38 @@ def test_query_embeddings_with_word2vec_tfidf_weighted_with_exact_query():
np.testing.assert_array_equal(embedding, trained_df['Word2Vec_with_TfIdf_weights'][0])


def test_query_embeddings_with_word2vec_raises_logger_error_when_all_words_out_of_vocabulary(caplog):
    """Check ``query_embeddings`` with Word2Vec when the whole query is out of vocabulary.

    The query is run against the fixture DataFrame and the pickled Word2Vec fixture model. The test
    expects the "none of the words are in the vocabulary" message to be captured (at ERROR level or
    above) and both return values to be empty: an empty array and an empty DataFrame.

    :param caplog: pytest fixture capturing the log records emitted during the call.
    """
    with caplog.at_level(logging.ERROR):
        embedding, trained_df = query_embeddings(
            "Hello there how are you?",
            os.path.join(os.getenv('FIXTURES_DIR'), 'full_df_with_embeddings.parquet.gzip'),
            'Word2Vec',
            'Word2Vec',
            os.path.join(os.getenv('FIXTURES_DIR'), 'word2vec.pickle')
        )
    expected_log_message = \
        'None of the words inputted are in the Word2Vec vocabulary. Please change your input or try a different ' \
        'model, such as ELMo or BERT. Returning empty array and DataFrame.'
    assert expected_log_message in caplog.text
    np.testing.assert_array_equal(embedding, np.array([]))
    # Use the public pandas.testing API; the pandas.util.testing module is deprecated.
    pd.testing.assert_frame_equal(trained_df, pd.DataFrame())


def test_query_embeddings_with_word2vec_raises_logger_warning_when_some_words_out_of_vocabulary(caplog):
    """Check ``query_embeddings`` with Word2Vec when only part of the query is out of vocabulary.

    The test expects a captured log message (at WARNING level or above) listing the words that were
    excluded from the search. The return value of ``query_embeddings`` is not inspected here.

    :param caplog: pytest fixture capturing the log records emitted during the call.
    """
    fixtures_dir = os.getenv('FIXTURES_DIR')
    trained_df_path = os.path.join(fixtures_dir, 'full_df_with_embeddings.parquet.gzip')
    word2vec_path = os.path.join(fixtures_dir, 'word2vec.pickle')
    excluded_words_warning = (
        "The following words are not in the trained vocabulary and were therefore excluded from the search: "
        "['Hello', 'trial', '!']"
    )

    with caplog.at_level(logging.WARNING):
        query_embeddings(
            "Hello Michael, this is a trial sentence!",
            trained_df_path,
            'Word2Vec',
            'Word2Vec',
            word2vec_path,
        )

    assert excluded_words_warning in caplog.text


def test_query_embeddings_with_elmo_with_exact_query():
embedding, trained_df = query_embeddings(
"Michael went to the store to buy some eggs .",
Expand Down
10 changes: 10 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import logging
import numpy as np
import pandas as pd
from pdf2emb_nlp.scraper import DocumentScraper
Expand All @@ -12,6 +13,15 @@ def test_class_instantiation(self, text_cleaning_json):
assert scraper.pdf_folder == os.getenv("FIXTURES_DIR")
assert scraper.open_json == text_cleaning_json

def test_class_instantiation_when_no_text_cleaning_json_provided(self, caplog):
    """Check ``DocumentScraper`` instantiation when only the PDF folder is passed.

    The test expects the scraper to keep the folder in ``pdf_folder``, a captured log message
    (at WARNING level or above) saying that no text-cleaning .json was provided, and
    ``open_json`` to be an empty dict.

    :param caplog: pytest fixture capturing the log records emitted during instantiation.
    """
    fixtures_dir = os.getenv("FIXTURES_DIR")
    missing_json_warning = \
        'No .json file for text cleaning was provided. Ad-hoc text cleaning will not be performed.'

    with caplog.at_level(logging.WARNING):
        scraper = DocumentScraper(fixtures_dir)

    assert scraper.pdf_folder == fixtures_dir
    assert missing_json_warning in caplog.text
    assert scraper.open_json == {}

def test_document_corpus_to_pandas_df(self):
expected_scraped_df = pd.DataFrame(
{'test_pdf_1': [
Expand Down

0 comments on commit 44e5fa6

Please sign in to comment.