In [49]:
import os
import math
from collections import defaultdict
import numpy as np
## I unzipped the tar.gz file as above. File name should be data in order to perform correctly
##Also for better perfomance, it should be run on google collab
##Arda Burak Yeni 76688



#import tarfile

# open file
#file = tarfile.open('aclImdb_v1.tar.gz')

# extracting file
##file.extractall('data')

#file.close()

# I created a set for words which doesnt have sentimental efffect for text
stop_words = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves", "he", "him",
    "his", "himself", "she", "her", "hers", "herself", "it",
    "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being", "have",
    "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against",
    "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out",
    "on", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all",
    "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very", "s", "t", "can", "will", "just",
    "don", "should", "now", "d", "ll", "m", "o", "re", "ve", "y",
    "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn",
    "haven", "isn", "ma", "mightn", "mustn", "needn", "shan",
    "shouldn", "wasn", "weren", "won", "wouldn"
])

# This function loads the positive and negative datasets
def load_data(data_dir):
    texts = []
    sentiments = []

    for sentiment in ['pos', 'neg']:
        sentiment_dir = os.path.join(data_dir, sentiment)
        for filename in os.listdir(sentiment_dir):
            with open(os.path.join(sentiment_dir, filename), 'r', encoding='utf-8') as file:
                texts.append(file.read())
                sentiments.append('positive' if sentiment == 'pos' else 'negative')

    return texts, sentiments

# I initiliazed the train and test datasets
train_texts, train_sentiments = load_data('data/aclImdb/train')
test_texts, test_sentiments = load_data('data/aclImdb/test')

# Tokenization and preprocessing
def preprocessing(text):
    # tokinizind and making lowerspace
    tokens = text.lower().split()
    new_tokens = []
    for token in tokens:
        if token not in stop_words:
           new_tokens.append(token)

    return new_tokens

# This functions builds a vocabulary with frequency threshold and for the words below threshold it puts "unk"
def build_vocabulary(texts, frequency_threshold=5):
    word_freq = {}

    for text in texts:
        tokens = preprocessing(text)
        for token in tokens:
            if token in word_freq:
                word_freq[token] += 1
            else:
                word_freq[token] = 1

    vocab = {'padding': 0, 'unk': 1}
    word_index = 2

    for word, freq in word_freq.items():
        if freq >= frequency_threshold:
            vocab[word] = word_index
            word_index += 1

    return vocab



#This functions converts texts to the ingetegers which explained in descriptions
def texts_to_sequences(texts, vocab):
    sequences = []
    for text in texts:
        tokens = preprocessing(text)
        sequence = []
        for token in tokens:
            # Tokens are being converted to their index in vocab
            if token in vocab:
                index = vocab[token]
            else:
                index = vocab['unk'] # unk if token is not in vocab

            sequence.append(index)

        sequences.append(sequence)

    return sequences

# For better accuracy I cropped the sequences like mentioned in the descriptions
def pad_sequences(sequences, max_length):
    padded_sequences = []

    for seq in sequences:
        if len(seq) > max_length:
            padded_seq = seq[:max_length]  # Crop longer sequences
        else:
            padded_seq = seq + [0] * (max_length - len(seq))  # Pad shorter with "padding"

        padded_sequences.append(padded_seq)

    return np.array(padded_sequences)

# 1. Build the vocabulary
vocab = build_vocabulary(train_texts, frequency_threshold=5)

# 2. converting sets to the integer sequences
train_sequences = texts_to_sequences(train_texts, vocab)
test_sequences = texts_to_sequences(test_texts, vocab)

# 3. fixed length for cropping
#for max_length 50 -> 79 accuracy
#for 200 -> 83.9%
#300->83.92
#400 -> 83.7
max_length = 200
train_padded = pad_sequences(train_sequences, max_length)
test_padded = pad_sequences(test_sequences, max_length)


def calculate_word_counts(sequences, sentiments, vocab):

    word_count = defaultdict(lambda: defaultdict(int))


    for i in range(len(sequences)):
        seq = sequences[i]  # Get the current sequence
        sentiment = sentiments[i]  # Get the corresponding sentiment

        for word_idx in seq:

            if word_idx != 0:  # If the word index is not padding
                # Increment the count for this word index and sentiment class
                word_count[word_idx][sentiment] += 1

    return word_count
word_counts = calculate_word_counts(train_padded, train_sentiments, vocab)

# 5. Calculate log-likelihood and log-priors for each class
def calculating_loglikelihood(word_counts, total_word_count, vocab_size, laplacian_smoothing=1):
    log_likelihood = defaultdict(lambda: defaultdict(float))

    for word_idx, class_count in word_counts.items():
        for sentiment in ['positive', 'negative']:
            # Laplician smoothing for 0 probabilities
            log_likelihood[word_idx][sentiment] = math.log((class_count[sentiment] + laplacian_smoothing) /
                                                           (total_word_count[sentiment] + laplacian_smoothing * vocab_size))

    return log_likelihood

#I initiliazed dictionary for sentiment count for both negative and positive
total_word_count = {
    'positive': 0,  # Initialize total for positive words
    'negative': 0   # Initialize total for negative words
}

# Iterate through words
for word_index, counts in word_counts.items():

    if 'positive' in counts:
        total_word_count['positive'] += counts['positive']
    if 'negative' in counts:

        total_word_count['negative'] += counts['negative']



# Calculate log likelihoods
log_likelihood = calculating_loglikelihood(word_counts, total_word_count, len(vocab))

# Calculate log priors
def calculate_log_prior(sentiments):
    total_texts = len(sentiments)
    log_prior_positive = math.log(sentiments.count('positive') / total_texts)
    log_prior_negative = math.log(sentiments.count('negative') / total_texts)

    return log_prior_positive, log_prior_negative

log_prior_positive, log_prior_negative = calculate_log_prior(train_sentiments)

# 6. Predicting sentiment
def predict(sequence, log_likelihood, log_prior_positive, log_prior_negative):
    log_score_positive = log_prior_positive
    log_score_negative = log_prior_negative

    for word_idx in sequence:
        if word_idx != 0:  # Ignore padding
            log_score_positive += log_likelihood[word_idx]['positive']
            log_score_negative += log_likelihood[word_idx]['negative']

    return 'positive' if log_score_positive > log_score_negative else 'negative'

# 7. The accuracy calculation
correct_predictions = 0

total_samples = len(test_sentiments)
for i in range(total_samples):
    sequence = test_padded[i]
    true_sentiment = test_sentiments[i]


    predicted_sentiment = predict(sequence, log_likelihood, log_prior_positive, log_prior_negative)
    if predicted_sentiment == true_sentiment:
        correct_predictions += 1
accuracy = (correct_predictions / total_samples) * 100

print(f'Accuracy: {accuracy:.2f}%')



Accuracy: 83.90%
