In [1]:
import csv
import random
import math

In [2]:
# creating the function to load the dataset.
def LoadDataset(filename):
    dataset = []
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            dataset.append(row)
    return dataset

In [3]:
def reviews_load(filename):
    reviews = []
    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            reviews.append(row)
    return reviews

In [4]:

# then split the dataset into train, dev, and test sets
def split_train_dev_eval(dataset, split_ratio):
    train_size = int(len(dataset) * split_ratio) # split the train according to input ratio
    train_dataset = []
    devandeval = dataset[:]
    while len(train_dataset) < train_size:
        index = random.randrange(len(devandeval))
        train_dataset.append(devandeval.pop(index))
    return [train_dataset, devandeval, devandeval] #return list contains the three parts.

In [5]:
# function that calculates the word occurence in all reviews
def word_occurrences(dataset, vocabulary):
    WordOccurrences = {}
    for word in vocabulary:
        WordOccurrences[word] = 0
        for document in dataset:
            if word in document[0].split():
                WordOccurrences[word] += 1
    return WordOccurrences

In [6]:

# function that calculates the probability of each class (fresh or rotten)
def ClassProbabilities(dataset):

    class_probabilities = {}
    
    for document in dataset:
        sentiment = document[0]
        if sentiment == "Freshness": # ignoring the excess lables from the dataset.
            pass
        else:

            if sentiment not in class_probabilities:
                class_probabilities[sentiment] = 1 
            else:
                class_probabilities[sentiment] += 1 #counting the number of reviews by adding ones in each loop trun.
    total_documents = len(dataset) #the number of all reviews
    
    for sentiment in class_probabilities:
        class_probabilities[sentiment] /= total_documents
        
        
    return class_probabilities #returns dictionary of both classes probabilities.

In [7]:
# calculating the conditional probability of each word given the class.
def word_conditional_probability_smoothing(dataset, vocabulary, word_occurrences,alpha):
    conditional_probabilities = {}
    total_words_per_class = {}
    for sentiment in ["fresh", "rotten"]:
        total_words_per_class[sentiment] = sum(word_occurrences[word][sentiment] for word in vocabulary) + alpha * len(
            vocabulary)
        for word in vocabulary:
            if word not in conditional_probabilities:
                conditional_probabilities[word] = {
                    sentiment: (word_occurrences[word][sentiment] + alpha) / total_words_per_class[sentiment]}
            else:
                conditional_probabilities[word][sentiment] = (word_occurrences[word][sentiment] + alpha) / \
                                                             total_words_per_class[sentiment]
    return conditional_probabilities
 #returns dictionary of each word and its corrseponding probability in both classes.
    
def calculate_conditional_probabilities(dataset, vocabulary, word_occurrences):
    conditional_probabilities = {}
    total_words_per_class = {}
    for sentiment in ["fresh","rotten"]:
        total_words_per_class[sentiment] = 0
        for word in vocabulary:
            total_words_per_class[sentiment] += word_occurrences[word][sentiment] + 1
    for word in vocabulary:
        conditional_probabilities[word] = {}
        for sentiment in ["fresh","rotten"]:
            conditional_probabilities[word][sentiment] = (word_occurrences[word][sentiment] + 1) / total_words_per_class[sentiment]
    return conditional_probabilities
 #returns dictionary of each word and its corrseponding probability in both classes.

In [8]:
# here we make predictions according to the model we created.
def predict(document, vocabulary, class_probabilities, conditional_probabilities):
    words = document.split()
    positive_probability = math.log(class_probabilities['fresh'])
    negative_probability = math.log(class_probabilities['rotten'])
    for word in words:
        if word in vocabulary:
            positive_probability += math.log(conditional_probabilities[word]['fresh'])
            negative_probability += math.log(conditional_probabilities[word]['rotten'])
    if positive_probability > negative_probability:
        return 'fresh'
    else:
        return 'rotten'
   

In [9]:
# then lets evaluate our model.
def evaluate(dataset, vocabulary, class_probabilities, conditional_probabilities):
    correct_predictions = 0
    for document in dataset:
        predicted_sentiment = predict(document[1], vocabulary, class_probabilities, conditional_probabilities)
   
        if predicted_sentiment == document[0]:
            correct_predictions += 1
    accuracy = round(correct_predictions / len(dataset),2)
    return accuracy

In [11]:
# the main function
def main():
    
    #first calculating the top repeated words.
  
    dataset=reviews_load('rt_reviews.csv')
    word_counts = {}
    for review in dataset:

        words = review['Review'].split()
        for word in words:
            if word in word_counts:
                word_counts[word] += 1
            else:
                word_counts[word] = 1
    sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    vocab_list=[]
    for i in range(10000):
        if not sorted_word_counts[i][1] < 5 : #checking if the word is repeated more than five times.
            vocab_list.append(sorted_word_counts[i][0]) #buildig vocab list of top ten thousand word in the dataset.
      

    
    #loading the dataset  
    
    dataset = LoadDataset('rt_reviews.csv')
    #calculating the
    word_the_occurrences_in_docs=0
    for document in dataset:
        document=document[1].split()
        if "the" in document:
            word_the_occurrences_in_docs+=1

    Probability_the_occurrence=word_the_occurrences_in_docs/len(dataset)
    print("Probability of 'the' occurrence=",Probability_the_occurrence)

    cond_prob=0
    positive_doc=0
    for document in dataset:

        if document[0]=="fresh":
            positive_doc+=1
            document=document[1].split()
            if "the" in document:
                cond_prob+=1

    Probability_conditional = cond_prob / positive_doc
    print("Conditional probability based on the sentiment=", Probability_conditional)
    
    #start preparing the dataset
    # Split the dataset into train, dev, and test sets
    train_set, dev_set, test_set = split_train_dev_eval(dataset, 0.6)
     
    #calculatingthe word occurrence
    
    word_occurrences = {}
    for word in vocab_list: #     positive     neg1tive
        word_occurrences[word] = {'fresh': 0, 'rotten': 0}
        for document in train_set:
            if word in document[1].split():
                sentiment = document[0]
                if sentiment=="Freshness":
                    pass
                else:
                    word_occurrences[word][sentiment] += 1
    # lets do the calculations
    # Calculate ClassProbabilities
    class_probabilities = ClassProbabilities(train_set)
    print("Class Probabilities:",class_probabilities) # prior probability.
    alpha=100 #adding the smoothing way
    
    conditional_probabilities = calculate_conditional_probabilities(train_set, vocab_list, word_occurrences)
    
    conditional_probabilities_smoothing = word_conditional_probability_smoothing(train_set, vocab_list, word_occurrences,alpha)
    
    # Evaluate the accuracy of the classifier on the development set
    dev_accuracy = evaluate(dev_set, vocab_list, class_probabilities, conditional_probabilities)
  
    dev_accuracy_smoothing = evaluate(dev_set, vocab_list, class_probabilities, conditional_probabilities_smoothing)
    
    effect_of_smoothing=abs(dev_accuracy_smoothing-dev_accuracy)
    
    print("Accuracy of dev set:",dev_accuracy)
    print("Effect of Smoothing on dev set:",effect_of_smoothing)
    
    test_accuracy_smoothing = evaluate(test_set, vocab_list, class_probabilities, conditional_probabilities_smoothing) #conducting the accuracy on test set using hyperparameters.
    print("Accuracy of test set using optimal hyperparameters:",test_accuracy_smoothing)
    
    
    word_counts_fresh = {}
    word_counts_rotten = {}
    dataset=reviews_load('rt_reviews.csv')
    for review in dataset:
        if review["Freshness"]=="fresh":
            words = review['Review'].split()
            for word in words:
                if word in word_counts_fresh:
                    word_counts_fresh[word] += 1
                else:
                    word_counts_fresh[word] = 1
        else:
            words = review['Review'].split()
            for word in words:
                if word in word_counts_rotten:
                    word_counts_rotten[word] += 1
                else:
                    word_counts_rotten[word] = 1
                    
    sorted_word_counts_fresh = sorted(word_counts_fresh.items(), key=lambda x: x[1], reverse=True)
    sorted_word_counts_rotten = sorted(word_counts_rotten.items(), key=lambda x: x[1], reverse=True)
    vocab_fresh=[]
    vocab_rotten=[]
    for i in range(10):
        vocab_fresh.append(sorted_word_counts_fresh[i][0])
        vocab_rotten.append(sorted_word_counts_rotten[i][0])
        
    print("Top 10 most frequent words:")
    print("In Fresh:",vocab_fresh)
    print("In Rotten:",vocab_rotten)
    
# run the code
main()

Probability of 'the' occurrence= 0.5705550613436222
Conditional probability based on the sentiment= 0.5793541666666666
Class Probabilities: {'rotten': 0.4987326388888889, 'fresh': 0.5012673611111111}
Accuracy of dev set: 0.77
Effect of Smoothing on dev set: 0.020000000000000018
Accuracy of test set using optimal hyperparameters: 0.75
Top 10 most frequent words:
In Fresh: ['the', 'and', 'a', 'of', 'to', 'is', 'in', 'that', 'with', 'it']
In Rotten: ['the', 'a', 'of', 'and', 'to', 'is', 'in', 'that', 'it', 'The']
