# Python's collections module

In [None]:
# Documentation: https://docs.python.org/2/library/collections.html
# Per the documentation, "this module implements specialized container datatypes
# providing alternatives to Python's general purpose built-in containers, dict, list, set, and tuple".

In [48]:
# namedtuple(): factory function for creating tuple subclasses with named fields
# Named tuples assign a name to each position in a tuple, so that fields can be
# accessed by name instead of by position index.
#-----------------------------------------------
# namedtuple(typename, field_names[, verbose=False][, rename=False])
# Returns a new tuple subclass named typename. 
# The new subclass is used to create tuple-like objects that have fields accessible 
# by attribute lookup as well as being indexable and iterable. 

from collections import namedtuple
# Build a small labelled dataset for a toy sentiment-analysis task.
# Each record is a DataDoc named tuple with two fields:
#   tag   -- the label string, "POSITIVE" or "NEGATIVE"
#   words -- the lower-cased list of words that follow the label
DataDoc = namedtuple('DataDoc', 'tag words')

# my_data collects one DataDoc per document, in the same order as `documents`.
my_data = []

# Each document is a single sentence whose first whitespace-separated token
# is its tag, drawn from the set {POSITIVE, NEGATIVE}.
documents = [
    "POSITIVE I love pizza",
    "POSITIVE I like Apple",
    "POSITIVE I enjoy hiking",
    "POSITIVE I am passionate about traveling",
    "POSITIVE We had fun writing this code",
    "NEGATIVE I don't like to stay up late",
    "NEGATIVE I am tired",
    "NEGATIVE He feels sick",
]

# Populate my_data: split each document once, keep the first token verbatim
# as the tag, and lower-case the remaining tokens to form the word list.
for doc in documents:
    tokens = doc.split()
    label = tokens[0]
    word_list = [token.lower() for token in tokens[1:]]
    my_data.append(DataDoc(label, word_list))
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(my_data[-1])


DataDoc(tag='POSITIVE', words=['i', 'love', 'pizza'])
DataDoc(tag='POSITIVE', words=['i', 'like', 'apple'])
DataDoc(tag='POSITIVE', words=['i', 'enjoy', 'hiking'])
DataDoc(tag='POSITIVE', words=['i', 'am', 'passionate', 'about', 'traveling'])
DataDoc(tag='POSITIVE', words=['we', 'had', 'fun', 'writing', 'this', 'code'])
DataDoc(tag='NEGATIVE', words=['i', "don't", 'like', 'to', 'stay', 'up', 'late'])
DataDoc(tag='NEGATIVE', words=['i', 'am', 'tired'])
DataDoc(tag='NEGATIVE', words=['he', 'feels', 'sick'])


In [49]:
# Show the whole list of DataDoc records built above.
# Single-argument print(...) gives the same output on Python 2 and 3.
print(my_data)

[DataDoc(tag='POSITIVE', words=['i', 'love', 'pizza']), DataDoc(tag='POSITIVE', words=['i', 'like', 'apple']), DataDoc(tag='POSITIVE', words=['i', 'enjoy', 'hiking']), DataDoc(tag='POSITIVE', words=['i', 'am', 'passionate', 'about', 'traveling']), DataDoc(tag='POSITIVE', words=['we', 'had', 'fun', 'writing', 'this', 'code']), DataDoc(tag='NEGATIVE', words=['i', "don't", 'like', 'to', 'stay', 'up', 'late']), DataDoc(tag='NEGATIVE', words=['i', 'am', 'tired']), DataDoc(tag='NEGATIVE', words=['he', 'feels', 'sick'])]


In [50]:
# Fields are accessible by name: read the tag (label) of the first instance.
# Single-argument print(...) gives the same output on Python 2 and 3.
print(my_data[0].tag)

POSITIVE


In [52]:
# Likewise, read the word list of the first instance through its `words` field.
# Single-argument print(...) gives the same output on Python 2 and 3.
print(my_data[0].words)

['i', 'love', 'pizza']
