In [1]:
%matplotlib inline
import libs.resources as r
import libs.pipeline as pipe
import libs.files as fh
import numpy as np
import cPickle

In [2]:
# working directories and flags of the pipeline
OUTPUT_DIR = '../3-Output/'
PROCESS_DIR = '../2-Processed'
MODEL_OUT_DIR = '../6-Models'
CREATE_TOKENS_FILES = True

# configuration handed to every pipe.create_features call below
cfg = fh.read_config_file("all.yaml")

# the three splits below are all read with the same file type
file_type = 'B'

# train split
file_name = "../1-Input/trainingData-B.tsv"
train_feat, labels, train_tweets = pipe.create_features(file_name, file_type, cfg)

# dev split
file_name = "../1-Input/devData-B.tsv"
dev_feat, dev_labels, dev_tweets = pipe.create_features(file_name, file_type, cfg)

# test split (its labels are kept under the name `gold`)
file_name = "../1-Input/testData-B.tsv"
test_feat, gold, test_tweets = pipe.create_features(file_name, file_type, cfg)

Read 9683 rows from ../1-Input/trainingData-B.tsv
Read 1653 rows from ../1-Input/devData-B.tsv
Read 3812 rows from ../1-Input/testData-B.tsv


In [25]:
# vectorize the cleaned tweets of the three splits (pipe.create_count_vec)
# random seed for all the operations
# NOTE(review): rnd_seed is not referenced by any cell visible in this
# notebook section -- confirm it is actually passed to the stochastic steps.
rnd_seed = 9000

# convert the lists into arrays so that they can be indexed
labels = np.array(labels)
dev_labels = np.array(dev_labels)
gold = np.array(gold)

# fit the vectorizer through train/dev, then apply the fitted vectorizer to test
train_clean, dev_clean, vect = pipe.create_count_vec(train_tweets, dev_tweets, tokenizer=pipe.tokenize_clean_raw, stop_words=pipe.stop_words)
test_clean = vect.transform(test_tweets)
# single-argument print(): same text as the old print statement, and it also
# parses under Python 3 (matches the print('...'.format()) style used below)
print("test shape: {}".format(test_clean.shape))

train shape: (8171, 18094)
dev shape: (1405, 18094)
test shape: (3239, 18094)


In [28]:
# Auto select the best features
# Selection percent handed to pipe.auto_select_features (same argument slot
# that receives lex.selection_percent in select_features); named instead of
# being an inline literal.
BASE_SELECTION_PERCENT = 5
train_data, dev_data, selec = pipe.auto_select_features(pipe.chi2, BASE_SELECTION_PERCENT, train_clean, labels, dev_clean, dev_labels)
# reuse the selector returned above on the test matrix
test_data = selec.transform(test_clean)
print('Test Final Shape: {}'.format(test_data.shape))

# save the base features and the labels (same files, same order as before)
for dump_obj, dump_name in [
    (train_data, 'train_base_data.pck'),
    (dev_data, 'dev_base_data.pck'),
    (test_data, 'test_base_data.pck'),
    (labels, 'labels.pck'),
    (dev_labels, 'dev_labels.pck'),
    (gold, 'gold.pck'),
]:
    pipe.dump_data(dump_obj, dump_name)

Final shape (8171, 927) (1405, 927)
Test Final Shape: (3239, 927)


In [3]:
# tokenize every split with pipe.tokenize_negate_clean_raw (negated tokens)
train_neg_tokens = list(map(pipe.tokenize_negate_clean_raw, train_tweets))
dev_neg_tokens = list(map(pipe.tokenize_negate_clean_raw, dev_tweets))
test_neg_tokens = list(map(pipe.tokenize_negate_clean_raw, test_tweets))

In [4]:
# tokenize every split with pipe.tokenize_clean_raw (no negation handling)
train_tokens = list(map(pipe.tokenize_clean_raw, train_tweets))
dev_tokens = list(map(pipe.tokenize_clean_raw, dev_tweets))
test_tokens = list(map(pipe.tokenize_clean_raw, test_tweets))

In [10]:
from sklearn.feature_selection import chi2, mutual_info_classif

def select_features(lex, train_feat, labels, dev_feat, test_feat, score_func=None):
    """
    Vectorize the lexicon features of the three splits and keep only the
    ones chosen by ``pipe.auto_select_features``.

    Parameters
    ----------
    lex : object
        Lexicon descriptor; only ``lex.selection_percent`` is read here and
        forwarded to ``pipe.auto_select_features``.
    train_feat, dev_feat, test_feat :
        Per-split lexicon features, in whatever form
        ``pipe.create_lex_vec`` accepts.
    labels :
        Training labels forwarded to ``pipe.auto_select_features``.
    score_func : callable, optional
        Scoring criterion handed to ``pipe.auto_select_features``. When
        omitted, ``mutual_info_classif`` is used, which is what this
        function always used before the parameter existed.

    Returns
    -------
    tuple
        ``(train, dev, test)``: the train/dev results returned by
        ``pipe.auto_select_features`` and the test matrix passed through
        the selector it returned.
    """
    if score_func is None:
        # NOTE(review): mutual_info_classif accepts a random_state argument;
        # the notebook's rnd_seed is not applied here -- pass a seeded
        # functools.partial as score_func if determinism is required.
        score_func = mutual_info_classif

    train_vec, dev_vec, test_vec, _ = pipe.create_lex_vec(train_feat, dev_feat, test_feat)

    # No dev labels are supplied to the selection step (None), unlike the
    # base-feature selection cell which passes dev_labels.
    train_sel, dev_sel, selector = pipe.auto_select_features(
        score_func, lex.selection_percent,
        train_vec, labels,
        dev_vec, None)

    # the test split is only transformed with the selector obtained above
    test_sel = selector.transform(test_vec)
    return train_sel, dev_sel, test_sel
    
# generate each lexicon best features for the 3 datasets
def save_features(lex, train_tokens, labels, dev_tokens, dev_labels, test_tokens, neg_tokens=None):
    """
    Build, optionally select, and dump the features of one lexicon for the
    train, dev and test splits.

    Parameters
    ----------
    lex : object
        Lexicon descriptor. ``lex.negated`` chooses between the plain and
        the negated token lists, ``lex.process_lex`` builds the features and
        ``lex.selection_percent`` (when truthy) triggers feature selection.
    train_tokens, dev_tokens, test_tokens :
        Token lists used when ``lex.negated`` is falsy.
    labels :
        Training labels, forwarded to ``select_features``.
    dev_labels :
        Not used by this function; kept so existing callers keep working.
    neg_tokens : sequence of three items, optional
        ``(train, dev, test)`` negated token lists used when ``lex.negated``
        is truthy. When omitted, the notebook-level variables
        ``train_neg_tokens``, ``dev_neg_tokens`` and ``test_neg_tokens`` are
        used, which is what this function silently relied on before.

    Returns
    -------
    None. The result is printed (shapes) and handed to
    ``pipe.dump_lex_features``.
    """
    # pick the token lists that match the lexicon
    if lex.negated:
        if neg_tokens is None:
            # backward-compatible fallback to the notebook globals; these are
            # looked up only when a negated lexicon actually needs them
            neg_tokens = (train_neg_tokens, dev_neg_tokens, test_neg_tokens)
        train_src, dev_src, test_src = neg_tokens
    else:
        train_src, dev_src, test_src = train_tokens, dev_tokens, test_tokens

    # create the features from the datasets tokens
    train_feat = lex.process_lex(train_src, use_best_features=True)
    dev_feat = lex.process_lex(dev_src, use_best_features=True)
    test_feat = lex.process_lex(test_src, use_best_features=True)

    # keep only the top percent of the features when the lexicon asks for
    # it; otherwise just vectorize the three splits
    if lex.selection_percent:
        train_feat, dev_feat, test_feat = select_features(lex, train_feat, labels, dev_feat, test_feat)
    else:
        train_feat, dev_feat, test_feat, _ = pipe.create_lex_vec(train_feat, dev_feat, test_feat)

    # dump the info
    # single-argument print(): same text under Python 2, valid under Python 3
    print('saving {}\t{}\t{}'.format(train_feat.shape, dev_feat.shape, test_feat.shape))
    # NOTE(review): create_vec=False is presumably because both branches above
    # already vectorized the features -- confirm against dump_lex_features.
    pipe.dump_lex_features(lex, train_feat, dev_feat, create_vec=False, test=test_feat)

In [23]:
# build and dump the features of every lexicon listed in r.lexs
for lex in r.lexs:
    # single-argument print(): same text as the old print statement, and
    # consistent with the print('...'.format()) style used elsewhere
    print('Processing {}'.format(lex.prefix))
    save_features(lex, train_tokens, labels, dev_tokens, dev_labels, test_tokens)

Processing WNA
Final shape (8171, 91) (1405, 91)
saving (8171, 91)	(1405, 91)	(3239, 91)
Processing TSLEX
saving (8171, 7)	(1405, 7)	(3239, 7)
Processing SENT140
saving (8171, 7)	(1405, 7)	(3239, 7)
Processing NRCHASH
saving (8171, 7)	(1405, 7)	(3239, 7)
Processing MSOL
Final shape (8171, 319) (1405, 319)
saving (8171, 319)	(1405, 319)	(3239, 319)
Processing MPQA
saving (8171, 7)	(1405, 7)	(3239, 7)
Processing DAL
saving (8171, 5)	(1405, 5)	(3239, 5)
Processing BING
saving (8171, 7)	(1405, 7)	(3239, 7)
Processing ANEW
Final shape (8171, 181) (1405, 181)
saving (8171, 181)	(1405, 181)	(3239, 181)
Processing SENTN
Final shape (8171, 2572) (1405, 2572)
saving (8171, 2572)	(1405, 2572)	(3239, 2572)
Processing EMOLX
saving (8171, 50)	(1405, 50)	(3239, 50)
Processing SENTS
saving (8171, 5)	(1405, 5)	(3239, 5)
Processing LEW
saving (8171, 5)	(1405, 5)	(3239, 5)
Processing EMOSNET
saving (8171, 10)	(1405, 10)	(3239, 10)
Processing SSTREN
saving (8171, 1)	(1405, 1)	(3239, 1)


In [32]:
# Re-import libs.resources (bound to `r`) so that edits made to the module
# after the first import are picked up; this must run before r.swn is read.
reload(r)
# Build and dump the features of the r.swn lexicon on its own.
# NOTE(review): r.swn is handled separately from the r.lexs loop; presumably
# it is not part of that list -- confirm, otherwise it is processed twice.
save_features(r.swn, train_tokens, labels, dev_tokens, dev_labels, test_tokens)

saving (8171, 8)	(1405, 8)	(3239, 8)
