In [None]:
import nltk
from os import getcwd
import re
import string
import numpy as np
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

# Fetch the NLTK resources this script relies on: the tweet corpus and the
# English stopword list.
for _resource in ("twitter_samples", "stopwords"):
    nltk.download(_resource)

# Read the raw text of the positive and the negative tweets from the corpus.
all_pos_twts = twitter_samples.strings('positive_tweets.json')
all_neg_twts = twitter_samples.strings('negative_tweets.json')

def process_tweet(tweet):
    """Clean, tokenize and stem a tweet.

    Steps, in order:
      1. Remove stock-ticker style tokens (``$`` followed by word characters).
      2. Remove a leading ``RT`` retweet marker.
      3. Remove hyperlinks (the URL only, not the text that follows it).
      4. Remove the ``#`` sign while keeping the hashtag word itself.
      5. Tokenize with ``TweetTokenizer`` (``preserve_case=False``,
         ``strip_handles=True``, ``reduce_len=True``).
      6. Drop English stopwords and punctuation tokens, then apply the
         Porter stemmer to every remaining token.

    Args:
        tweet: Raw tweet text.

    Returns:
        A list with the stemmed tokens that survived filtering.
    """
    stemmer = PorterStemmer()
    # A frozenset gives O(1) membership checks; the list returned by the
    # corpus reader would be scanned linearly for every single token.
    stopwords_english = frozenset(stopwords.words("english"))

    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Match the URL up to the next whitespace only. A greedy ``.*`` here
    # would also erase every word that follows the link on the same line.
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # ``word not in string.punctuation`` is a substring test, so it rejects
    # single punctuation characters as well as runs such as ":;" that occur
    # contiguously inside ``string.punctuation``.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each sentiment label.

    Args:
        tweets: Iterable of raw tweet strings.
        ys: Sentiment labels aligned with ``tweets``; an array of shape
            ``(m, 1)`` or ``(m,)``, or a plain sequence of numbers.

    Returns:
        A dict mapping ``(word, label)`` pairs to the number of times that
        processed word was seen in tweets carrying that label.
    """
    # ``np.ravel`` always yields a 1-D array, so a single label still becomes
    # a one-element list. ``np.squeeze`` would collapse it to a 0-d array
    # whose ``tolist()`` is a bare scalar that ``zip`` cannot iterate.
    yslist = np.ravel(ys).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

def extract_features(tweet, freqs):
    """Turn a tweet into a ``(1, 3)`` feature row.

    Args:
        tweet: Raw tweet text.
        freqs: Dict mapping ``(word, label)`` pairs to counts.

    Returns:
        A ``(1, 3)`` float array ``[bias, positive_total, negative_total]``:
        the bias is always 1, and the two totals add up the counts stored
        under ``(word, 1.0)`` and ``(word, 0.0)`` for every processed token
        of the tweet (tokens missing from ``freqs`` contribute 0).
    """
    # Column 0 is the bias term; columns 1 and 2 start at zero.
    features = np.array([[1.0, 0.0, 0.0]])
    positive_col, negative_col = 1, 2

    for token in process_tweet(tweet):
        features[0, positive_col] += freqs.get((token, 1.0), 0)
        features[0, negative_col] += freqs.get((token, 0.0), 0)

    return features

def sigmoid(z):
    """Return the logistic function ``1 / (1 + e^(-z))``.

    Works on a scalar or, element-wise, on a NumPy array.
    """
    decay = np.exp(-z)
    return 1 / (1 + decay)

def gradient_descent(x, y, theta, alpha, num_iter):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: Feature matrix of shape ``(m, n)``, one row per example.
        y: Label column of shape ``(m, 1)``.
        theta: Initial weights of shape ``(n, 1)``. The array passed in is
            not modified.
        alpha: Learning rate.
        num_iter: Number of update steps to perform.

    Returns:
        A tuple ``(J, theta)``. ``J`` is the cross-entropy cost as a Python
        float, evaluated with the weights that were in effect just before
        the final update (or with the initial weights when ``num_iter`` is
        0). ``theta`` holds the weights after all updates.
    """
    m = x.shape[0]
    # Work in floating point and rebind instead of updating in place, so the
    # caller's array is left untouched and integer-typed weights are accepted.
    theta = np.asarray(theta, dtype=float)

    h = None
    for _ in range(num_iter):
        h = sigmoid(np.dot(x, theta))
        theta = theta - (alpha / m) * np.dot(x.T, h - y)

    if h is None:
        # No update step ran; report the cost of the initial weights rather
        # than failing on an unassigned variable.
        h = sigmoid(np.dot(x, theta))

    # Only the last cost is returned, so it is computed once here from the
    # predictions of the final iteration instead of on every pass.
    J = (-1 / m) * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))

    # The cost comes out as a 1x1 array; squeeze it to 0-d first because
    # calling float() on an array with ndim > 0 is deprecated in NumPy.
    return float(np.squeeze(J)), theta

# Label vector for the whole corpus: 1 per positive tweet, 0 per negative one.
labels = np.concatenate((np.ones(len(all_pos_twts)), np.zeros(len(all_neg_twts))))

# The first 4000 tweets of each class form the training set; the remainder
# of each class is held out for testing.
train_x = all_pos_twts[:4000] + all_neg_twts[:4000]
test_x = all_pos_twts[4000:] + all_neg_twts[4000:]

# Column-vector labels. Each split is labelled as half ones followed by half
# zeros, matching the positive-then-negative order used above.
train_y = np.concatenate(
    (np.ones((len(train_x) // 2, 1)), np.zeros((len(train_x) // 2, 1))),
    axis=0,
)
test_y = np.concatenate(
    (np.ones((len(test_x) // 2, 1)), np.zeros((len(test_x) // 2, 1))),
    axis=0,
)

# Word/label frequency table, built from the training tweets only.
freqs = build_freqs(train_x, train_y)

# Stack one (1, 3) feature row per training tweet into an (m, 3) matrix and
# fit the weights, starting from all zeros.
X = np.array(
    [extract_features(tweet, freqs) for tweet in train_x]
).reshape(len(train_x), 3)
Y = train_y
J, theta = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1500)

def predict_tweet(tweet, freqs, theta):
    """Score a tweet with the trained model.

    Args:
        tweet: Raw tweet text.
        freqs: Dict mapping ``(word, label)`` pairs to counts.
        theta: Weight vector to multiply the tweet's features with.

    Returns:
        The sigmoid of the features-times-weights product.
    """
    features = extract_features(tweet, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)

def test_logistic_regression(test_x, test_y, freqs, theta):
    """Compute the model's accuracy on a labelled set of tweets.

    Args:
        test_x: Iterable of raw tweet strings.
        test_y: Labels aligned with ``test_x``.
        freqs: Dict mapping ``(word, label)`` pairs to counts.
        theta: Trained weight vector.

    Returns:
        The number of predictions equal to their label, divided by
        ``len(test_y)``.
    """
    predictions = []
    for tweet in test_x:
        # A score strictly above 0.5 is classed as positive (1).
        if predict_tweet(tweet, freqs, theta) > 0.5:
            predictions.append(1)
        else:
            predictions.append(0)

    matches = np.array(predictions) == np.squeeze(test_y)
    return matches.sum() / len(test_y)

# Report accuracy on the held-out tweets.
accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {accuracy:.4f}")

# Run the classifier on a hand-written example and show its processed tokens.
my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'
print(f"Processed Tweet: {process_tweet(my_tweet)}")
y_hat = predict_tweet(my_tweet, freqs, theta)
if y_hat > 0.5:
    print('Positive sentiment')
else:
    print('Negative sentiment')
