## Unigram and Bigram Induction
11/13/16 - Implements unigram and bigram induction.
Uses the IMDB dataset folder (http://ai.stanford.edu/~amaas/data/sentiment/).

In [121]:
from sentiment_utils import *
from collections import defaultdict
from nltk import word_tokenize
import math
import pickle as pickle
imdb_folder_location = "../aclImdb" # Change this to wherever your IMDB dataset folder is located

## Build Bigram and Unigram Maps

In [None]:
## Context maps built from the positive training reviews. Every map has the shape
#     context -> token -> weight
# where a token's weight is the sum of the scores of the reviews in which it was seen
# in that context, so both frequency and review score raise the weight.
pos_both_neighboring_bigrams = defaultdict(lambda: defaultdict(int))
pos_preceding_bigrams = defaultdict(lambda: defaultdict(int))
pos_following_bigrams = defaultdict(lambda: defaultdict(int))
pos_both_neighboring_unigrams = defaultdict(lambda: defaultdict(int))

for index, (filename, review, score) in enumerate(imdb_sentiment_reader(dataset_type='train', sentiment='pos')):
    tokens = word_tokenize(review.lower())
    for i, token in enumerate(tokens):
        # A context is None whenever it would reach past either end of the review.
        has_two_before = i >= 2
        has_two_after = i + 2 < len(tokens)
        has_one_before = i >= 1
        has_one_after = i + 1 < len(tokens)

        preceding_bigram = tuple(tokens[i - 2:i]) if has_two_before else None
        following_bigram = tuple(tokens[i + 1:i + 3]) if has_two_after else None
        preceding_unigram = tokens[i - 1] if has_one_before else None
        following_unigram = tokens[i + 1] if has_one_after else None

        # Credit the current token in every context that exists at this position.
        if has_two_before and has_two_after:
            pos_both_neighboring_bigrams[(preceding_bigram, following_bigram)][token] += score
        if has_two_before:
            pos_preceding_bigrams[preceding_bigram][token] += score
        if has_two_after:
            pos_following_bigrams[following_bigram][token] += score
        if has_one_before and has_one_after:
            pos_both_neighboring_unigrams[(preceding_unigram, following_unigram)][token] += score
    # Progress report every 1000 reviews.
    if not index % 1000:
        print("Now on: " + str(index))



Now on: 0
Now on: 1000
Now on: 2000

In [None]:
## Context maps built from the negative training reviews. Every map has the shape
#     context -> token -> weight
# where the review score is SUBTRACTED for each occurrence, so weights are negative and
# the most heavily weighted token in a context is the one with the smallest value.
# NOTE(review): subtracting the raw score means a higher-scored (milder) negative review
# moves a token further than a lower-scored (harsher) one - confirm this is intended.
neg_both_neighboring_bigrams = defaultdict(lambda: defaultdict(int))
neg_preceding_bigrams = defaultdict(lambda: defaultdict(int))
neg_following_bigrams = defaultdict(lambda: defaultdict(int))
neg_both_neighboring_unigrams = defaultdict(lambda: defaultdict(int))

for index, (filename, review, score) in enumerate(imdb_sentiment_reader(dataset_type='train', sentiment='neg')):
    tokens = word_tokenize(review.lower())
    for i, token in enumerate(tokens):
        # A context is None whenever it would reach past either end of the review.
        has_two_before = i >= 2
        has_two_after = i + 2 < len(tokens)
        has_one_before = i >= 1
        has_one_after = i + 1 < len(tokens)

        preceding_bigram = tuple(tokens[i - 2:i]) if has_two_before else None
        following_bigram = tuple(tokens[i + 1:i + 3]) if has_two_after else None
        preceding_unigram = tokens[i - 1] if has_one_before else None
        following_unigram = tokens[i + 1] if has_one_after else None

        # Debit the current token in every context that exists at this position.
        if has_two_before and has_two_after:
            neg_both_neighboring_bigrams[(preceding_bigram, following_bigram)][token] -= score
        if has_two_before:
            neg_preceding_bigrams[preceding_bigram][token] -= score
        if has_two_after:
            neg_following_bigrams[following_bigram][token] -= score
        if has_one_before and has_one_after:
            neg_both_neighboring_unigrams[(preceding_unigram, following_unigram)][token] -= score
    # Progress report every 1000 reviews.
    if not index % 1000:
        print("Now on: " + str(index))

In [None]:
# Group each kind of context map by the sentiment of the reviews it was built from,
# so the map for a sentiment can be selected with a "pos" / "neg" key.
both_neighboring_bigrams = {
    "pos": pos_both_neighboring_bigrams,
    "neg": neg_both_neighboring_bigrams,
}
preceding_bigrams = {
    "pos": pos_preceding_bigrams,
    "neg": neg_preceding_bigrams,
}
following_bigrams = {
    "pos": pos_following_bigrams,
    "neg": neg_following_bigrams,
}
both_neighboring_unigrams = {
    "pos": pos_both_neighboring_unigrams,
    "neg": neg_both_neighboring_unigrams,
}

In [None]:
def induction_transform_func(filename, review, score):
    """
    Rewrites a review towards the opposite sentiment using context induction.

    Every token tagged as an adjective or adverb is replaced by the token that carries
    the most weight, in the opposite-sentiment context maps, for the same surrounding
    context. Contexts are tried from most to least specific: both neighboring bigrams,
    both neighboring unigrams, the preceding bigram, then the following bigram. A tagged
    token with no known context, and every other token, is kept unchanged.

    Args:
        filename: Name of the review file. Not used here; presumably kept so the
            function matches the transform signature the experiment runner expects.
        review: Raw review text.
        score: Review score. Scores below 7 select the "pos" maps (the review is
            treated as negative); all other scores select the "neg" maps.

    Returns:
        The transformed tokens joined by single spaces, or the original review
        unchanged if the tagger output does not line up with the tokens.

    NOTE: the part of speech of the replacement is NOT checked against that of the
    word it replaces, and a replacement may be identical to the original word.
    """
    # Imported locally: the notebook only imports word_tokenize from nltk, so the bare
    # name `nltk` is in scope only if another import happens to leak it.
    from nltk import pos_tag

    # Penn Treebank tags for adjectives (JJ*) and adverbs (RB*).
    replaceable_tags = ('JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS')
    positive_score_threshold = 7

    def get_best_replacement(words_to_scores, score_type):
        """
        Returns the most heavily weighted token of one context-map entry.

        "pos" maps hold positive weights, so the largest value wins; the other maps
        hold negated weights, so the smallest value wins. Ties go to the first such
        token in iteration order, exactly as taking the head of a stable sort would.
        """
        pick = max if score_type == "pos" else min
        return pick(words_to_scores, key=words_to_scores.get)

    # We want the maps of the opposite review type.
    score_type = "pos" if score < positive_score_threshold else "neg"
    upper_tokens = word_tokenize(review)
    tagged_review = pos_tag(upper_tokens)
    # The context maps were built from lower-cased reviews, so look contexts up in lower case.
    tokens = [token.lower() for token in upper_tokens]
    if len(tokens) != len(tagged_review):
        # Tags and tokens are misaligned: return the original review.
        return review

    transformed_review = []
    for i, (word, tag) in enumerate(tagged_review):
        replacement_token = word
        if tag in replaceable_tags:
            # A context is None when it would reach past either end of the review;
            # such contexts are never present in the maps, so they simply do not match.
            preceding_bigram = (tokens[i-2], tokens[i-1]) if i >= 2 else None
            following_bigram = (tokens[i+1], tokens[i+2]) if i < len(tokens) - 2 else None
            preceding_unigram = tokens[i-1] if i >= 1 else None
            following_unigram = tokens[i+1] if i < len(tokens) - 1 else None

            # Most specific context first; the first known context decides.
            candidates = (
                (both_neighboring_bigrams, (preceding_bigram, following_bigram)),
                (both_neighboring_unigrams, (preceding_unigram, following_unigram)),
                (preceding_bigrams, preceding_bigram),
                (following_bigrams, following_bigram),
            )
            for context_maps, context in candidates:
                known_contexts = context_maps[score_type]
                # Membership test first, so the defaultdict never grows an empty entry.
                if context in known_contexts:
                    replacement_token = get_best_replacement(known_contexts[context], score_type)
                    break
        transformed_review.append(replacement_token)
    return " ".join(transformed_review)
# Example usage: show the first positive validation review before and after the transform.
for (filename, review, score) in imdb_sentiment_reader(dataset_type='val', sentiment='pos'):
    print("Original review: ")
    print(review)
    print("Transformed review:")
    transformed = induction_transform_func(filename, review, score)
    print(transformed)
    # One example is enough.
    break

In [None]:
# Wire the train/validation readers and the induction transform into an
# ExperimentRunner, then run the experiment.
train_reader = imdb_sentiment_reader(dataset_type='train', sentiment='both')
test_reader = imdb_sentiment_reader(dataset_type='val', sentiment='both')
default_evaluator = DefaultEvaluator(verbose=True)
baseline_runner = ExperimentRunner(
    train_reader,
    test_reader,
    induction_transform_func,
    evaluator=default_evaluator,
    verbose=True,
)
baseline_runner.run_experiment()