In [31]:
# Relative paths of the train / test / dev split files; the directory name
# indicates the three-class SST sentiment text data.
train_file_path = 'data/sst-sentiment-text-threeclass/train.txt'
test_file_path = 'data/sst-sentiment-text-threeclass/test.txt'
dev_file_path = 'data/sst-sentiment-text-threeclass/dev.txt'

In [32]:
import numpy as np
import pandas as pd

# LOAD DATA

In [33]:
def process_text_file(file_path):
    """Read a ``label ||| text`` file into parallel label and review arrays.

    Every non-blank line is expected to look like ``<int label> ||| <review>``.
    Only the first ``|||`` is treated as the separator, so a review that
    itself contains ``|||`` is kept intact. Review text is stripped of
    surrounding whitespace and lower-cased. Blank lines are skipped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(sentiments, reviews)`` tuple of NumPy arrays of equal length:
        the parsed integer labels and the normalised review strings, in file
        order.

    Raises:
        ValueError: If the label part of a line is not an integer.
        IndexError: If a non-blank line contains no ``|||`` separator.
    """
    sentiments, reviews = [], []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # e.g. a trailing empty line; int('') would otherwise crash.
                continue
            parts = line.split('|||', 1)
            sentiments.append(int(parts[0]))
            reviews.append(parts[1].strip().lower())
    return np.array(sentiments), np.array(reviews)

In [34]:
# Parse each split file into a (sentiments, reviews) pair of NumPy arrays.
train_sentiments, train_reviews = process_text_file(train_file_path)
test_sentiments, test_reviews = process_text_file(test_file_path)
dev_sentiments, dev_reviews = process_text_file(dev_file_path)

# FEATURE EXTRACTION

In [35]:
# Minimal lexicon-driven feature extractor.

# Hand-picked cue words used as the positive / negative lexicons.
good_words = ['love', 'good', 'like']
bad_words = ['hate', 'bad', 'dislike']

def rule_based_feature_extractor(review):
    """Count lexicon hits in a single review.

    The review is split on single spaces. A token counts as a good word when
    it appears in ``good_words``; otherwise it counts as a bad word when it
    appears in ``bad_words``.

    Args:
        review: Review text as a string.

    Returns:
        ``[good_word_count, bad_word_count, 1]``; the trailing ``1`` is a
        constant bias feature.
    """
    words = review.split(' ')
    positive_hits = sum(1 for word in words if word in good_words)
    # A token already counted as good is never counted as bad as well.
    negative_hits = sum(
        1 for word in words if word not in good_words and word in bad_words
    )
    return [positive_hits, negative_hits, 1]  # 1 is for bias

# Score all reviews: extract rule-based features, then combine them with the weights.

def extract_features(reviews, feature_weights=[1, -1, 0.5]):
    """Compute one weighted score per review.

    Each review is turned into a feature vector by
    ``rule_based_feature_extractor`` and the vectors are combined with
    ``feature_weights`` through a dot product. Despite the name, the result
    is the vector of scores, not the feature matrix.

    Args:
        reviews: Iterable of review strings.
        feature_weights: One weight per extracted feature. The default list
            is only read, never modified.

    Returns:
        1-D NumPy array holding one score per review; an empty float array
        when ``reviews`` is empty.
    """
    features = [rule_based_feature_extractor(review) for review in reviews]
    if not features:
        # np.array([]) has shape (0,), which np.dot cannot align with the
        # weight vector, so an empty input is answered directly.
        return np.empty(0, dtype=float)
    return np.dot(np.array(features), feature_weights)

In [36]:
# Score each split with the default feature weights; labels pass through unchanged.
# Each X* holds the output of extract_features (one score per review).
Xtrain = extract_features(train_reviews)
Ytrain = train_sentiments

Xtest = extract_features(test_reviews)
Ytest = test_sentiments

Xdev = extract_features(dev_reviews)
Ydev = dev_sentiments

# RULE-BASED DECISION RULE

In [37]:
def rule_based_classifier(x):
    """Map a score to a class label by its sign.

    Args:
        x: Numeric score for one review.

    Returns:
        ``1`` when ``x`` is greater than zero, ``-1`` when it is less than
        zero, and ``0`` in every other case.
    """
    return 1 if x > 0 else (-1 if x < 0 else 0)

# EVALUATION

In [38]:
def accuracy(Y, X):
    """Return the fraction of scores whose predicted label matches the gold label.

    Args:
        Y: Array of gold labels.
        X: Iterable of scores, each passed to ``rule_based_classifier``.

    Returns:
        Mean of the element-wise comparison between ``Y`` and the predicted
        labels.
    """
    predicted = np.array(list(map(rule_based_classifier, X)))
    matches = (Y == predicted)
    return np.mean(matches)

In [39]:
# Accuracy on the train, test and dev splits, in that order.
tuple(
    accuracy(labels, scores)
    for labels, scores in ((Ytrain, Xtrain), (Ytest, Xtest), (Ydev, Xdev))
)

(0.4324672284644195, 0.4244343891402715, 0.41689373297002724)