# WSJ data - One domain

In [7]:
# os is used throughout the notebook to build dataset paths.
import os
# Colab-only module; this notebook expects to run inside Google Colab.
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset paths defined below
# (which live under /content/drive/MyDrive) can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Root of the dataset on the mounted Drive (Colab-specific absolute path).
data_dir = "/content/drive/MyDrive/Colab Notebooks/Capstone/data/gweb_sancl"
# The pos_fine/wsj sub-directory, which holds the gweb-wsj-*.conll files.
wsj_dir = os.path.join(data_dir, "pos_fine", "wsj")
# The "unlabeled" sub-directory of the dataset root.
unlabeled_dir = os.path.join(data_dir, "unlabeled")
# NOTE: historical, misleading name -- this is the *unlabeled* directory.
# Kept as an alias so any existing reference to `labeled_dir` keeps working.
labeled_dir = unlabeled_dir

In [9]:
import codecs

In [10]:
def read_conll_file(file_name, raw=False):
    """
    Read a two-column, tab-separated CoNLL-style file.

    Expected layout (one token per line, sentences separated by blank lines)::

        word1    tag1
        ...      ...
        wordN    tagN

    :param file_name: path of the file to read (decoded as UTF-8)
    :param raw: if True, treat the file as raw text with one sentence per line;
        each line is split on whitespace and every token gets the 'DUMMY' label
    :return: generator of instances ((list of words, list of tags) pairs)
    :raises IOError: when ``raw`` is False and a non-empty line does not consist
        of exactly two tab-separated fields
    """
    current_words = []
    current_tags = []

    # `with` closes the handle once the generator finishes, fails, or is closed.
    # Text-mode open() translates "\r\n" to "\n", so CRLF files do not leave a
    # stray "\r" on tags or on the blank lines that separate sentences.
    with open(file_name, encoding='utf-8') as handle:
        for line_number, line in enumerate(handle, start=1):
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would truncate a final line that has no trailing
            # newline; a full strip() would also eat meaningful tabs.
            line = line.rstrip('\n')

            if line:
                if raw:
                    current_words = line.split()  # simple splitting by whitespace
                    current_tags = ['DUMMY' for _ in current_words]
                    yield (current_words, current_tags)
                else:
                    fields = line.split('\t')
                    if len(fields) != 2:
                        if len(fields) == 1:  # empty words in gimpel
                            raise IOError("Issue with input file - doesn't have a tag or token?")
                        # More than two fields: report the offending line and
                        # where it is, as an exception rather than exit(),
                        # which would take the whole notebook kernel down.
                        raise IOError(
                            "erroneous line: {} (line number: {}) ".format(line, line_number))
                    word, tag = fields
                    current_words.append(word)
                    current_tags.append(tag)
            else:
                # Blank line: emit the sentence collected so far (if any).
                if current_words and not raw:
                    yield (current_words, current_tags)
                current_words = []
                current_tags = []

    # The file may end without a closing blank line; emit the pending sentence.
    if current_tags and not raw:
        yield (current_words, current_tags)

In [14]:
# Paths of the WSJ train / dev / test CoNLL files, all located in wsj_dir.
wsj_train_file, wsj_dev_file, wsj_test_file = (
    os.path.join(wsj_dir, conll_name)
    for conll_name in (
        "gweb-wsj-train.conll",
        "gweb-wsj-dev.conll",
        "gweb-wsj-test.conll",
    )
)

In [15]:
def _load_conll_split(conll_path, tag_pool):
    """Read every sentence of ``conll_path`` with ``read_conll_file``.

    :param conll_path: path of the CoNLL file to load
    :param tag_pool: list that receives every individual tag that is read
        (extended in place, repetitions included)
    :return: (list of per-sentence word lists, list of per-sentence tag lists)
    """
    sentence_words = []
    sentence_tags = []
    # read_conll_file yields one (words, tags) pair per sentence.
    for words, tags in read_conll_file(conll_path):
        sentence_words.append(words)
        sentence_tags.append(tags)
        tag_pool.extend(tags)
    return sentence_words, sentence_tags


# Every tag seen in the three WSJ splits, with repetitions; de-duplicated below.
wsj_tags = []

wsj_train_word_lst, wsj_train_tag_lst = _load_conll_split(wsj_train_file, wsj_tags)
print("The number of sentences in wsj train", len(wsj_train_word_lst))

wsj_dev_word_lst, wsj_dev_tag_lst = _load_conll_split(wsj_dev_file, wsj_tags)
print("The number of sentences in wsj dev", len(wsj_dev_word_lst))

wsj_test_word_lst, wsj_test_tag_lst = _load_conll_split(wsj_test_file, wsj_tags)
print("The number of sentences in wsj test", len(wsj_test_word_lst))
print("The number of tags in wsj", len(set(wsj_tags)))

The number of sentences in wsj train 30060
The number of sentences in wsj dev 1336
The number of tags in wsj 48
The number of sentences in wsj test 1640
The number of tags in wsj 48


In [None]:
# Domain names; each one is used below both as a sub-directory of pos_fine
# and inside the gweb-<name>-dev/test.conll file names.
file_name_lst = [
    "answers",
    "emails",
    "newsgroups",
    "reviews",
    "weblogs",
]

In [6]:
# For every domain, load its dev and test files and report sentence / tag counts.
# NOTE: the `answer_*` / `ans_*` variable names are historical -- inside the loop
# they hold the data of the *current* domain, and after the loop only the data
# of the last domain in file_name_lst remains.
for f_name in file_name_lst:
  print("\n")
  print("domain:", f_name)
  domain_dir = os.path.join(data_dir, "pos_fine", f_name)
  answer_dev_file = os.path.join(domain_dir, f"gweb-{f_name}-dev.conll")
  answer_test_file = os.path.join(domain_dir, f"gweb-{f_name}-test.conll")

  ans_dev_word_lst = []
  ans_dev_tag_lst = []
  ans_tags = []
  # `word` / `tag` are the word list and tag list of one whole sentence.
  for word, tag in read_conll_file(answer_dev_file):
    ans_dev_word_lst.append(word)
    ans_dev_tag_lst.append(tag)
    ans_tags.extend(tag)
  # Label the counts with the actual domain instead of a fixed "answer".
  print(f"The number of sentences in {f_name} dev", len(ans_dev_word_lst))

  ans_test_word_lst = []
  ans_test_tag_lst = []
  for word, tag in read_conll_file(answer_test_file):
    ans_test_word_lst.append(word)
    ans_test_tag_lst.append(tag)
    ans_tags.extend(tag)
  print(f"The number of sentences in {f_name} test", len(ans_test_word_lst))
  # Distinct tags over dev + test of this domain.
  print(f"The number of tags in {f_name}", len(set(ans_tags)))



domain: wsj
The number of sentences in answer dev 1336
The number of sentences in answer test 1640
The number of tags in ans 45


domain: answers
The number of sentences in answer dev 1745
The number of sentences in answer test 1744
The number of tags in ans 50


domain: emails
The number of sentences in answer dev 2450
The number of sentences in answer test 2450
The number of tags in ans 49


domain: newsgroups
The number of sentences in answer dev 1196
The number of sentences in answer test 1195
The number of tags in ans 49


domain: reviews
The number of sentences in answer dev 1907
The number of sentences in answer test 1906
The number of tags in ans 50


domain: weblogs
The number of sentences in answer dev 1016
The number of sentences in answer test 1015
The number of tags in ans 49


# Penn POS data (Universal Dependencies English treebank, with UD and PTB tags)

In [None]:
# import torchtext
# from torchtext.legacy import data
# from torchtext.legacy import datasets

In [None]:
# TEXT = data.Field(lower = True)
# UD_TAGS = data.Field(unk_token = None)
# PTB_TAGS = data.Field(unk_token = None)

# fields = (("text", TEXT), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS))
# train_data, valid_data, test_data = datasets.UDPOS.splits(fields)

# print(f"Number of training examples: {len(train_data)}")
# print(f"Number of validation examples: {len(valid_data)}")
# print(f"Number of testing examples: {len(test_data)}")

# print(vars(train_data.examples[0]))

In [None]:
!pip install git+https://github.com/PetrochukM/PyTorch-NLP.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/PetrochukM/PyTorch-NLP.git
  Cloning https://github.com/PetrochukM/PyTorch-NLP.git to /tmp/pip-req-build-s2k3gvkx
  Running command git clone -q https://github.com/PetrochukM/PyTorch-NLP.git /tmp/pip-req-build-s2k3gvkx
Building wheels for collected packages: pytorch-nlp
  Building wheel for pytorch-nlp (setup.py) ... [?25l[?25hdone
  Created wheel for pytorch-nlp: filename=pytorch_nlp-0.5.0-py3-none-any.whl size=88361 sha256=0910dee3e1b4159f65e322fc0b39ac435976346a7ec5195303fa2379e4de95c4
  Stored in directory: /tmp/pip-ephem-wheel-cache-gprk44_g/wheels/84/78/c4/66b0b0a3f3973609c6fdd26a91411257c13314b2445c7d83fc
Successfully built pytorch-nlp
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0


In [None]:
# https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/datasets/ud_pos.html

import os
import io

from torchnlp.download import download_file_maybe_extract


def ud_pos_dataset(directory='data/',
                   train=False,
                   dev=False,
                   test=False,
                   train_filename='en-ud-tag.v2.train.txt',
                   dev_filename='en-ud-tag.v2.dev.txt',
                   test_filename='en-ud-tag.v2.test.txt',
                   extracted_name='en-ud-v2',
                   check_files=['en-ud-v2/en-ud-tag.v2.train.txt'],
                   url='https://bitbucket.org/sivareddyg/public/downloads/en-ud-v2.zip'):
    """
    Load the Universal Dependencies - English Dependency Treebank dataset.

    Corpus of sentences annotated using Universal Dependencies annotation. The corpus comprises
    254,830 words and 16,622 sentences, taken from various web media including weblogs, newsgroups,
    emails, reviews, and Yahoo! answers.

    Each split file is expected to hold one token per line as three tab-separated fields
    (token, UD tag, PTB tag), with a blank line between sentences. The last sentence of a file
    is returned even when the file does not end with a blank line.

    References:
        * http://universaldependencies.org/
        * https://github.com/UniversalDependencies/UD_English

    **Citation:**
    Natalia Silveira and Timothy Dozat and Marie-Catherine de Marneffe and Samuel Bowman and
    Miriam Connor and John Bauer and Christopher D. Manning (2014).
    A Gold Standard Dependency Corpus for {E}nglish

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the development split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_files (list of str, optional): Check if these files exist, then this download was
            successful. The default list is shared between calls and is never modified here.
        url (str, optional): URL of the dataset archive (a `zip` file by default).

    Returns:
        :class:`tuple` of :class:`iterable` or :class:`iterable`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``. A single requested split is returned directly
        as a list of examples; otherwise a tuple of such lists is returned (empty if no split
        was requested).

    Raises:
        ValueError: If a non-empty line does not contain exactly three tab-separated fields.

    Example:
        >>> from torchnlp.datasets import ud_pos_dataset  # doctest: +SKIP
        >>> train = ud_pos_dataset(train=True)  # doctest: +SKIP
        >>> train[17]  # doctest: +SKIP
        {
          'tokens': ['Guerrillas', 'killed', 'an', 'engineer', ',', 'Asi', 'Ali', ',', 'from',
                     'Tikrit', '.'],
          'ud_tags': ['NOUN', 'VERB', 'DET', 'NOUN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'ADP',
                      'PROPN', 'PUNCT'],
          'ptb_tags': ['NNS', 'VBD', 'DT', 'NN', ',', 'NNP', 'NNP', ',', 'IN', 'NNP', '.']
        }
    """
    download_file_maybe_extract(url=url, directory=directory, check_files=check_files)

    ret = []
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [f for (requested, f) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
            for line in f:
                line = line.strip()
                if line == '':
                    # Blank line: close the current sentence; repeated blank
                    # lines must not produce empty examples.
                    if sentence['tokens']:
                        examples.append(sentence)
                        sentence = {'tokens': [], 'ud_tags': [], 'ptb_tags': []}
                else:
                    token, ud_tag, ptb_tag = line.split('\t')
                    sentence['tokens'].append(token)
                    sentence['ud_tags'].append(ud_tag)
                    sentence['ptb_tags'].append(ptb_tag)
            # A file that does not end with a blank line still has a pending
            # sentence here; without this it would be silently dropped.
            if sentence['tokens']:
                examples.append(sentence)
        ret.append(examples)

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)

In [None]:
# Request all three splits in a single call; `res` keeps the returned tuple
# and the three names below give each split its own handle.
res = ud_pos_dataset(train=True, dev=True, test=True)
penn_train, penn_dev, penn_test = res

# Number of sentences (examples) per split.
print("Train size", len(penn_train))
print("Dev size", len(penn_dev))
print("Test size", len(penn_test))

en-ud-v2.zip: 696kB [00:00, 2.95MB/s]                            


In [None]:
# Display the training example at index 1 to inspect its 'tokens', 'ud_tags'
# and 'ptb_tags' lists.
penn_train[1]

{'tokens': ['[',
  'This',
  'killing',
  'of',
  'a',
  'respected',
  'cleric',
  'will',
  'be',
  'causing',
  'us',
  'trouble',
  'for',
  'years',
  'to',
  'come',
  '.',
  ']'],
 'ud_tags': ['PUNCT',
  'DET',
  'NOUN',
  'ADP',
  'DET',
  'ADJ',
  'NOUN',
  'AUX',
  'AUX',
  'VERB',
  'PRON',
  'NOUN',
  'ADP',
  'NOUN',
  'PART',
  'VERB',
  'PUNCT',
  'PUNCT'],
 'ptb_tags': ['-LRB-',
  'DT',
  'NN',
  'IN',
  'DT',
  'JJ',
  'NN',
  'MD',
  'VB',
  'VBG',
  'PRP',
  'NN',
  'IN',
  'NNS',
  'TO',
  'VB',
  '.',
  '-RRB-']}