This repository has been archived by the owner on Jun 14, 2018. It is now read-only.

Commit 9d2b9cb: cleanup

msarahan committed Sep 28, 2015
1 parent 924bd9b

Showing 9 changed files with 18 additions and 35 deletions.
6 changes: 0 additions & 6 deletions topik/importers.py

This file was deleted.

4 changes: 1 addition & 3 deletions topik/intermediaries/digested_document_collection.py
@@ -1,7 +1,5 @@
-from itertools import tee
-
-from gensim.interfaces import CorpusABC
 from gensim.corpora.dictionary import Dictionary
+from gensim.interfaces import CorpusABC
 
 from .raw_data import load_persisted_corpus

4 changes: 2 additions & 2 deletions topik/intermediaries/raw_data.py
@@ -3,12 +3,12 @@
 Elasticsearch. The class(es) defined here are fed into the preprocessing step.
 """
 
+from abc import ABCMeta, abstractmethod, abstractproperty
 import logging
 import time
-from abc import ABCMeta, abstractmethod, abstractproperty
 
-from six import with_metaclass
 from elasticsearch import Elasticsearch, helpers
+from six import with_metaclass
 
 from topik.intermediaries.persistence import Persistor

Empty file removed: topik/tests/test_intermediaries.py
5 changes: 1 addition & 4 deletions topik/tests/test_models.py
@@ -1,8 +1,5 @@
 import os
 import unittest
-from abc import ABCMeta, abstractmethod
-
-from six import with_metaclass
 
 from topik.readers import read_input
 from topik.preprocessing import preprocess
@@ -16,7 +13,7 @@ def setUp(self):
 MODEL_SAVE_FILENAME = os.path.join(module_path, 'test.model')
 
 
-class _ModelBase(with_metaclass(ABCMeta)):
+class _ModelBase(object):
     def setUp(self):
         raw_data = read_input(
             source=os.path.join(module_path, 'data/test_data_json_stream.json'),
1 change: 0 additions & 1 deletion topik/tests/test_readers.py
@@ -1,5 +1,4 @@
 import unittest
-from functools import partial
 
 import nose.tools as nt
 import elasticsearch
26 changes: 10 additions & 16 deletions topik/tests/test_tokenizers.py
@@ -2,12 +2,12 @@
 import unittest
 
 from topik.readers import read_input
-from topik.intermediaries.raw_data import ElasticSearchCorpus, _get_hash_identifier
 from topik.tokenizers import tokenizer_methods, find_entities, collect_bigrams_and_trigrams
 
 # sample data files are located in the same folder
 module_path = os.path.dirname(__file__)
 
+
 class TestTokenizers(unittest.TestCase):
     def setUp(self):
         self.solution_simple_tokenizer_test_data_1 = [
@@ -81,10 +81,8 @@ def setUp(self):
             u'properties', u'sol', u'gel', u'method', u'dna', u'easy',
             u'method', u'biomedical', u'applications']
 
-        self.data_json_stream_path = os.path.join(module_path,
-                                                  'data/test_data_json_stream.json')
-        self.data_large_json_path = os.path.join(module_path,
-                                                 'data/test_data_large_json.json')
+        self.data_json_stream_path = os.path.join(module_path, 'data/test_data_json_stream.json')
+        self.data_large_json_path = os.path.join(module_path, 'data/test_data_large_json.json')
         assert os.path.exists(self.data_json_stream_path)
         assert os.path.exists(self.data_large_json_path)

@@ -93,10 +91,9 @@ def test_simple_tokenizer(self):
             source=self.data_json_stream_path,
             content_field="abstract",
             output_type="dictionary")
-        id, text = next(iter(raw_data))
+        _, text = next(iter(raw_data))
        doc_tokens = tokenizer_methods["simple"](text)
-        self.assertEqual(doc_tokens,
-                         self.solution_simple_tokenizer_test_data_json_stream)
+        self.assertEqual(doc_tokens, self.solution_simple_tokenizer_test_data_json_stream)
 
     def test_collocations_tokenizer(self):
         raw_data = read_input(
@@ -106,21 +103,19 @@ def test_collocations_tokenizer(self):
         bigrams, trigrams = collect_bigrams_and_trigrams(raw_data,
                                                          min_bigram_freq=2,
                                                          min_trigram_freq=2)
-        id, text = next(iter(raw_data))
+        _, text = next(iter(raw_data))
         doc_tokens = tokenizer_methods["collocation"](text, bigrams, trigrams)
-        self.assertEqual(doc_tokens,
-                         self.solution_collocations_tokenizer_test_data_json_stream)
+        self.assertEqual(doc_tokens, self.solution_collocations_tokenizer_test_data_json_stream)
 
     def test_entities_tokenizer_json_stream(self):
         raw_data = read_input(
             source=self.data_json_stream_path,
             content_field="abstract",
             output_type="dictionary")
         entities = find_entities(raw_data, freq_min=1)
-        id, text = next(iter(raw_data))
+        _, text = next(iter(raw_data))
         doc_tokens = tokenizer_methods["entities"](text, entities)
-        self.assertEqual(doc_tokens,
-                         self.solution_entities_tokenizer_test_data_json_stream)
+        self.assertEqual(doc_tokens, self.solution_entities_tokenizer_test_data_json_stream)
 
     def test_mixed_tokenizer(self):
         raw_data = read_input(
@@ -130,8 +125,7 @@ def test_mixed_tokenizer(self):
         entities = find_entities(raw_data)
         id, text = next(iter(raw_data))
         doc_tokens = tokenizer_methods["mixed"](text, entities)
-        self.assertEqual(doc_tokens,
-                         self.solution_mixed_tokenizer_test_data_json_stream)
+        self.assertEqual(doc_tokens, self.solution_mixed_tokenizer_test_data_json_stream)
 
 
 if __name__ == '__main__':
1 change: 1 addition & 0 deletions topik/tests/test_viz.py
@@ -5,6 +5,7 @@
 
 module_path = os.path.dirname(__file__)
 
+
 class TestTokenizers(unittest.TestCase):
 
     def test_termite(self):
6 changes: 3 additions & 3 deletions topik/tokenizers.py
@@ -1,14 +1,14 @@
 from __future__ import absolute_import, print_function
 
-import logging
 import itertools
+import logging
 import re
 
+import gensim
+from gensim.parsing.preprocessing import STOPWORDS
 from nltk.collocations import TrigramCollocationFinder
 from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
 from textblob import TextBlob
-import gensim
-from gensim.parsing.preprocessing import STOPWORDS
 
 # imports used only for doctests
 from topik.tests import test_data_path
