In [1]:
import numpy as np
import nltk

from file_parsing import parse_file, rating_review_split
from grammatical_tools import stops, word_split, negation_analyzer #probably dont need this anymore

In [2]:
def shuffle(list): #alternative to np.random.shuffle that is not in-place
	np.random.shuffle(list)
	return np.array(list)

In [3]:
all_vocabulary = open("aclImdb_v1\\aclImdb\\imdb.vocab", encoding = 'utf-8').read().splitlines()

#consider limiting the vocabulary to the first n terms, since these are the most common n terms

#lists for storing documents
train_pos = parse_file('train_pos_reviews.txt')
train_neg = parse_file('train_neg_reviews.txt')
dev_pos = parse_file('dev_pos_reviews.txt')
dev_neg = parse_file('dev_neg_reviews.txt')


print(len(train_pos), len(train_neg), len(dev_pos), len(dev_neg))

6250 6250 6250 6250


In [4]:
train_all = shuffle(train_pos + train_neg) #shuffles together positive and negative
dev_all = shuffle(dev_pos + dev_neg)

num_topics = 500 #hyperparameter
num_train_reviews = len(train_all)
num_dev_reviews = len(dev_all)
num_words = len(all_vocabulary)

train_ratings, train_reviews = rating_review_split(train_all)
dev_ratings, dev_reviews = rating_review_split(dev_all)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy import stats

normed_train_ratings = stats.zscore(train_ratings, axis=None) #uses z-scores to normalize ratings, (rating - mean)/stdev
normed_dev_ratings = stats.zscore(dev_ratings, axis=None)

train_polarities = [2*int(rating >= 7)-1 for rating in train_ratings]
dev_polarities = [2*int(rating >= 7)-1 for rating in dev_ratings]

In [6]:
def get_tf_idfs(doc_list): #calculates tf-idf scores for each doc, returns those and a list of words in all docs
	num_docs = len(doc_list)
	reviews = [doc[1] for doc in doc_list]
	
	vectorizer = TfidfVectorizer(strip_accents = 'ascii', lowercase=True, tokenizer = word_split, preprocessor=None, stop_words=None, vocabulary = all_vocabulary)
	#stop word removal is done by the tokenizer 
	#since this function likes to remove extra stopwords that i want to keep
	#like "n't"
	
	reviews = [doc[1] for doc in doc_list]
	
	tf_idfs = vectorizer.fit_transform(reviews)
	
	return tf_idfs

In [7]:
from sklearn.decomposition import LatentDirichletAllocation

def lda(tf_idfs, topics, test_tf_idfs): #does latent dirichlet analysis on tf-idf scores to find topics
	lda = LatentDirichletAllocation(n_components=topics, random_state=0)
	doc_topics = lda.fit_transform(tf_idfs)
	test_topics = lda.transform(test_tf_idfs)
	return lda.components_, doc_topics, test_topics

In [8]:
from sklearn.linear_model import LinearRegression

def find_polarity_topics(doc_topics, review_polarities):
	#linear regression with array of topic scores (X) and polarities (Y)
	reg = LinearRegression().fit(doc_topics, review_polarities)
	print(reg.score(doc_topics,review_polarities))
	return reg.coef_, reg.intercept_

In [9]:
def predict_polarity(coeffs, intercept, doc_topics): #uses topic weights to predict polarity from topics
	return np.dot(coeffs, doc_topics) + intercept

In [10]:
train_tf_idfs = get_tf_idfs(train_all) #get training data tf-idfs

dev_tf_idfs = get_tf_idfs(dev_all) #get development data tf-idfs

In [12]:
print(train_tf_idfs.shape)
print(dev_tf_idfs.shape)

(12500, 89527)
(12500, 89527)


In [13]:
topic_weights, doc_topics, test_doc_topics = lda(train_tf_idfs, num_topics, dev_tf_idfs)
#do lda on tf-idfs and apply the results to get topics for dev set

In [14]:
print(topic_weights.shape)
print(doc_topics.shape)
print(test_doc_topics.shape)

(500, 89527)
(12500, 500)
(12500, 500)


In [18]:
print(topic_weights)
print(doc_topics)
print(test_doc_topics)

[[0.002 0.002 0.002 ... 0.002 0.002 0.002]
 [0.002 0.002 0.002 ... 0.002 0.002 0.002]
 [0.002 0.002 0.002 ... 0.002 0.002 0.002]
 ...
 [0.002 0.002 0.002 ... 0.002 0.002 0.002]
 [0.002 0.002 0.002 ... 0.002 0.002 0.002]
 [0.002 0.002 0.002 ... 0.002 0.002 0.002]]
[[0.00027689 0.00027689 0.00027689 ... 0.00027689 0.00027689 0.02461868]
 [0.00019755 0.00019755 0.00019755 ... 0.00019755 0.00042552 0.00019755]
 [0.00025674 0.00025674 0.00025674 ... 0.00025674 0.01913308 0.00025674]
 ...
 [0.00019322 0.00019322 0.00019322 ... 0.00019322 0.00019322 0.00019322]
 [0.00026004 0.00026004 0.00026004 ... 0.00026004 0.00026004 0.00026004]
 [0.00581274 0.00014873 0.00014873 ... 0.00014873 0.00014873 0.00014873]]
[[0.00024695 0.00024695 0.00024695 ... 0.00024695 0.00024695 0.00024695]
 [0.00025525 0.00025525 0.00025525 ... 0.00025525 0.03289443 0.00025525]
 [0.00021064 0.00021064 0.00021064 ... 0.00021064 0.01984525 0.01986916]
 ...
 [0.00018347 0.00018347 0.00018347 ... 0.0089396  0.00018347 0.00988

In [15]:
coefs, y_intercept = find_polarity_topics(doc_topics, train_polarities)
#do linear regression to predict polarity from topics

0.14894748689174653


In [19]:
print(coefs)

[ 1.61409764e+12  1.22966845e+13  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
 -5.89180274e+13  1.61409764e+12  1.61409764e+12  4.56694131e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.34124859e+13  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.61409764e+12  1.61409764e+12  1.61409764e+12  1.61409764e+12
  1.58575539e+13  1.61409

In [22]:
predicted_polarities = [predict_polarity(coefs, y_intercept, topics) > 0 for topics in test_doc_topics]
print(predicted_polarities)
actual_polarities = [rating >= 0 for rating in dev_polarities]
print(actual_polarities)
#convert sliding scale to discrete

[True, False, False, True, False, False, True, False, True, False, True, False, True, False, False, True, False, False, False, False, False, False, True, False, False, False, False, True, True, True, True, False, True, True, False, True, True, True, True, False, True, False, False, False, False, False, True, True, True, True, False, True, False, True, True, False, True, False, False, False, False, True, True, False, False, True, False, True, False, True, False, True, False, True, False, False, True, False, False, True, True, True, False, False, True, True, False, False, True, True, True, False, True, True, True, True, True, True, False, True, True, True, False, False, True, True, False, False, False, True, False, False, False, False, False, False, True, True, True, False, False, False, False, False, False, False, True, True, True, True, False, False, True, True, False, False, False, False, True, False, False, False, False, True, False, True, True, False, False, True, True, False, True,

In [23]:
from sklearn.metrics import precision_score, accuracy_score, recall_score

acc, pre, rec = accuracy_score(actual_polarities, predicted_polarities), precision_score(actual_polarities, predicted_polarities), recall_score(actual_polarities, predicted_polarities)

#print out statistical measures
print('accuracy %%%.2f' % (100*acc))
print('precision %%%.2f' % (100*pre)) 
print('recall %%%.2f' % (100*rec))
print('f-score %%%.2f' % (100* (2/(1/pre + 1/rec))))

from sklearn.metrics import confusion_matrix

print(confusion_matrix(actual_polarities, predicted_polarities))

accuracy %66.28
precision %67.10
recall %63.89
f-score %65.45
[[4292 1958]
 [2257 3993]]
