Drew Lickman\
CSCI 4820-001\
Project #3\
Due: 10/9/24

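AI Usage Disclaimer: Claude 3.5 Sonnet wrote the first draft of the gradient-descent loop in `CustomLogisticRegression.fit` (which I then modified) and the `train_test_split` line in the trial loop. Both places are marked with comments in the code.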


# Lexicon-Based Sentiment Analysis using Custom Logistic Regression

## Assignment Requirements:

### Input
---

- Positive words
- Negative words
- IMDb reviews

### Processing
---

- There are two classifiers
	- Custom Logistic Regression
	- sklearn LogisticRegression
- Implement a Python class (CustomLogisticRegression)
	- \__init\__(self, learning_rate, num_iters) method
		- self.learning_rate
		- self.num_iters
		- self.weights = None
		- self.bias = None
	- sigmoid(z)
		- return result
	- fit(X, y)
		- Sets weights to correct shape and initializes them to 0
		- Applies batch gradient descent to the entire dataset in a loop for num_iters
	- predict(X)
		- z = w dot x + b
		- return sigmoid(z) 

### Output
---

- For each trial and for each classifier
	- Print the sklearn confusion_matrix and classification_report
- Output the average of the confusion matrices across trials for each classifier

## Python Code

The Markdown above each cell explains the block of code that follows it.

1. Load and preprocess IMDb reviews

In [None]:
import numpy as np

def _load_lexicon(path):
	"""Read a sentiment lexicon file into a dict keyed by word.

	Blank lines and lines starting with ';' are skipped. Surrounding
	whitespace is stripped from every word. Each word maps to the
	placeholder value 1, so the dict is effectively used as a set of words.
	"""
	words = {}
	with open(path, encoding="utf-8") as lexicon:
		for line in lexicon:
			word = line.strip()
			if word and not word.startswith(";"):
				words[word] = 1
	return words


posWords = _load_lexicon("positive-words.txt")
negWords = _load_lexicon("negative-words.txt")
sentimentWords = {**posWords, **negWords} # Combine positive words and negative words into one dictionary
#print(sentimentWords)

# Add each line of the IMDb reviews to the reviews array.
# reviews[i] and trueValues[i] always describe the same line: a line is only
# stored once its label has been recognised, so a malformed line can never
# shift the two lists out of step.
sentimentLabels = {"positive": 1, "negative": 0}
reviews = []
trueValues = []
with open("imdb_reviews.txt", encoding="utf-8") as imdbreviews:
	for line in imdbreviews:
		stripped = line.rstrip()
		if not stripped:
			continue # blank line, nothing to parse

		# Everything before the last space is the review text; the last token ends with the label
		splitLine = stripped.rsplit(' ', 1)
		sentiment = splitLine[-1].strip()[-8:] # the last 8 characters are either positive or negative
		if len(splitLine) != 2 or sentiment not in sentimentLabels:
			print("Error: sentiment analysis not found at end of line!")
			continue

		reviews.append(splitLine[0]) # removes true sentiment label from data
		trueValues.append(sentimentLabels[sentiment])

if reviews:
	print(reviews[0])
print(trueValues)

2. Create Features(X) table and Labels(y) array

In [None]:
# X and y need to be np.arrays
# Every lexicon word gets its own feature column. The lexicon dicts only store
# the placeholder value 1 for each word, so their values cannot be used as
# column numbers; build an explicit word -> column mapping instead.
wordColumn = {word: column for column, word in enumerate(sentimentWords)}

X = np.zeros((len(reviews), len(sentimentWords)), dtype=bool) 	# Features: True when the word occurs in the review
y = np.zeros(len(reviews), dtype=int) 		# Labels: 1 = positive, 0 = negative

# NOTE(review): `word in text` is a substring test, so a lexicon word also
# matches inside longer words. Kept as-is; confirm whether whole-word matching
# is wanted.
for row, text in enumerate(reviews):
	posCount = 0
	negCount = 0
	for posWord in posWords:
		if posWord in text:
			X[row, wordColumn[posWord]] = True
			posCount += 1
	for negWord in negWords:
		if negWord in text:
			# Presence of a negative word is also recorded as True in that
			# word's own column; the classifier decides the sign of its weight.
			X[row, wordColumn[negWord]] = True
			negCount += 1

	# Lexicon vote: ties count as positive
	y[row] = 1 if posCount >= negCount else 0

print(X.shape)
print(y.shape)

# Count how many lexicon-derived labels agree with the true label of each review line
count = sum(1 for label, truth in zip(y, trueValues) if label == truth)
matchPercent = count / len(y) * 100 if len(y) else 0.0 # avoid dividing by zero when there are no reviews
print(f"{count} out of {len(y)} are matching. {matchPercent}%")


3. Define Custom Logistic Regression class

In [None]:
class CustomLogisticRegression():
	"""Binary logistic regression trained with batch gradient descent.

	Attributes:
		learning_rate: step size applied to each gradient update.
		num_iters: number of full-batch gradient descent iterations run by fit().
		weights: one weight per feature; None until fit() is called.
		bias: scalar intercept; None until fit() is called.
	"""

	# Constructor
	def __init__(self, learning_rate, num_iters):
		self.learning_rate = learning_rate
		self.num_iters = num_iters
		self.weights = None
		self.bias = None

	# Train the model using gradient descent
	# X is training features, y is labels
	def fit(self, X, y):
		"""Learn weights and bias from X (samples x features) and 0/1 labels y."""
		# Work in floats so boolean or integer inputs behave like 0.0 / 1.0
		X = np.asarray(X, dtype=float)
		y = np.asarray(y, dtype=float)

		# Sets the weights to the correct shape and initializes them to 0
		features = X.shape[1]
		self.weights = np.zeros(features) # weight for each feature
		self.bias = 0

		# Apply batch gradient descent on entire dataset
		# This for loop was written by Claude 3.5 Sonnet and modified by myself
		for _ in range(self.num_iters):
			predictions = self.sigmoid(self.linearTransform(X)) # Calculate array of sigmoidal probabilities
			error = predictions - y # Calculate the difference between predicted and actual labels

			# Compute gradient for weights
			dw = (1 / len(y)) * np.dot(X.T, error) # X.T is transposed
			# Compute gradient for bias
			db = (1 / len(y)) * np.sum(error) # Average of all errors

			# Update weights and biases
			self.weights -= self.learning_rate * dw
			self.bias -= self.learning_rate * db

	# Inputs either scalar or array and outputs sigmoid function of the scalar or array
	def sigmoid(self, z):
		"""Return 1 / (1 + e^(-z)) element-wise."""
		# Clip so np.exp cannot overflow for very negative z; the sigmoid is
		# already saturated at 0 or 1 far inside this range.
		z = np.clip(z, -500, 500)
		return 1 / (1 + np.exp(-z))

	# Predict the class (1 = positive, 0 = negative) of one sample or a batch of samples
	def predict(self, X):
		"""Return 0/1 predictions for X.

		A single sample (1-D feature vector) yields a plain int; a 2-D batch
		yields an integer array with one prediction per row.
		"""
		prob = self.sigmoid(self.linearTransform(X))
		labels = (np.asarray(prob) >= 0.5).astype(int) # Convert to binary output
		return int(labels) if labels.ndim == 0 else labels

	# Function for X dot W + b
	def linearTransform(self, X):
		"""Return X dot weights + bias."""
		return np.dot(X, self.weights) + self.bias

4. Train and evaluate both classifiers over several trials

scikit-learn documentation
	
	- https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.predict

	- https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.confusion_matrix.html
	
	- https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.classification_report.html

In [None]:
from sklearn import linear_model as lm
from sklearn import model_selection as ms
from sklearn import metrics

# Running sums of the per-trial confusion matrices; divided by trialCount after the loop
avgConfusionMatrix_skllr = np.zeros((2, 2))
avgConfusionMatrix_mylr = np.zeros((2, 2))

# Labels are 0 = negative and 1 = positive. The display names are listed in
# the same order as classLabels so each report row carries the right name, and
# passing the labels explicitly keeps every confusion matrix 2x2 even if a
# test split happens to contain only one class.
classLabels = [0, 1]
classNames = ["Negative", "Positive"]

trialCount = 5
# NOTE(review): both models run for a single iteration, which looks like a
# debugging value; confirm the intended iteration count.
iterationCount = 1
for trial in range(trialCount):
	# Shuffle input data
	# Split into 80% 20% split of training and test sets
	# Line from Claude 3.5 Sonnet
	X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=trial, shuffle=True)

	skllr = lm.LogisticRegression(solver='sag', C=0.001, max_iter=iterationCount)
	skllr.fit(X_train, y_train) # Only use the 80% of the data marked for training
	skllrPredictions = skllr.predict(X_test) # Use the remaining 20% of the data marked for testing

	mylr = CustomLogisticRegression(learning_rate=0.1, num_iters=iterationCount)
	mylr.fit(X_train, y_train) # Only use the 80% of the data marked for training
	mylrPredictions = np.array([mylr.predict(x) for x in X_test]) # Use the remaining 20% of the data marked for testing

	# Compute each confusion matrix once and reuse it for printing and averaging
	skllrMatrix = metrics.confusion_matrix(y_test, skllrPredictions, labels=classLabels)
	mylrMatrix = metrics.confusion_matrix(y_test, mylrPredictions, labels=classLabels)

	# Evaluate sklearn model
	print(f"Trial {trial + 1} - Sklearn LogisticRegression:")
	print(skllrMatrix)
	print(metrics.classification_report(y_test, skllrPredictions, labels=classLabels, target_names=classNames))

	# Evaluate custom model
	print(f"Trial {trial + 1} - Custom LogisticRegression:")
	print(mylrMatrix)
	print(metrics.classification_report(y_test, mylrPredictions, labels=classLabels, target_names=classNames))

	# Update running sums
	avgConfusionMatrix_skllr += skllrMatrix
	avgConfusionMatrix_mylr += mylrMatrix

# Turn the sums into averages over however many trials were actually run
avgConfusionMatrix_skllr /= trialCount
avgConfusionMatrix_mylr /= trialCount

# After all trials are completed, print average of the trials
print("Average Confusion Matrix - Sklearn LogisticRegression:")
print(avgConfusionMatrix_skllr)

print("Average Confusion Matrix - Custom LogisticRegression:")
print(avgConfusionMatrix_mylr)