In [1]:
import csv
import random
import math

In [3]:
# creating the function to load the dataset.
def LoadDataset(filename):
    """Read a CSV file and return its records as a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        list[list[str]]: One inner list of field strings per CSV record, in
        file order. Nothing is filtered here, so a header row (if the file
        has one) is returned like any other row.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError when
            the path does not exist).
    """
    # newline='' disables universal-newline translation so the csv module
    # does its own line handling; without it, line breaks embedded inside
    # quoted fields are rewritten before the reader sees them.
    with open(filename, 'r', newline='') as csvfile:
        return list(csv.reader(csvfile))

In [5]:

# then split the dataset into train, dev, and test sets
def split_train_dev_eval(dataset, split_ratio):
    """Randomly partition a dataset into train, dev and eval subsets.

    The training set receives ``int(len(dataset) * split_ratio)`` randomly
    chosen rows. The remaining rows are divided into two disjoint halves:
    the first becomes the dev set and the second the eval set (the eval set
    gets the extra row when the remainder has odd length).

    Args:
        dataset: Sequence of rows to split. It is not modified.
        split_ratio: Fraction of the rows to place in the training set;
            must lie in the range [0, 1].

    Returns:
        list: ``[train_dataset, dev_dataset, eval_dataset]``, three
        non-overlapping lists that together contain every input row once.

    Raises:
        ValueError: If ``split_ratio`` is outside [0, 1].
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(
            "split_ratio must be between 0 and 1, got {!r}".format(split_ratio)
        )

    # Shuffle a copy once and slice it, rather than popping random indices
    # one at a time (which is quadratic and leaves the remainder unshuffled).
    shuffled = list(dataset)
    random.shuffle(shuffled)

    train_size = int(len(shuffled) * split_ratio)
    train_dataset = shuffled[:train_size]
    held_out = shuffled[train_size:]

    # Dev and eval must be distinct rows; otherwise eval accuracy merely
    # repeats the dev accuracy.
    dev_size = len(held_out) // 2
    return [train_dataset, held_out[:dev_size], held_out[dev_size:]]

In [6]:
# Function that counts, for each vocabulary word, how many reviews contain it.
def word_occurrences(dataset, vocabulary, text_index=0):
    """Count, for each vocabulary word, how many documents contain it.

    A document contributes at most 1 to a word's count, no matter how many
    times the word appears in it. Matching is on whitespace-separated
    tokens and is case-sensitive.

    Args:
        dataset: Iterable of rows (indexable sequences of strings).
        vocabulary: Iterable of words to count.
        text_index: Position of the text column inside each row. Defaults
            to 0, the column this function has always read. Pass 1 for rows
            laid out as ``[label, text]``, which is how ``main`` in this
            notebook indexes its rows.

    Returns:
        dict: Maps every vocabulary word to the number of documents whose
        text contains it (0 when it never appears).
    """
    # Tokenise each document once up front instead of re-splitting it for
    # every vocabulary word; a set also makes each membership test O(1).
    token_sets = [set(document[text_index].split()) for document in dataset]

    WordOccurrences = {}
    for word in vocabulary:
        WordOccurrences[word] = sum(1 for tokens in token_sets if word in tokens)
    return WordOccurrences

In [7]:

# function that calculates the probability of each class (fresh or rotten)
def ClassProbabilities(dataset):
    """Estimate the prior probability of each class label.

    The label is read from column 0 of every row. Rows whose label is the
    literal header text "Freshness" are ignored.

    Args:
        dataset: Iterable of rows whose first element is the class label.

    Returns:
        dict: Maps each label seen to its share of the labelled rows. The
        values sum to 1. An empty dict is returned when there are no
        labelled rows.
    """
    class_counts = {}
    for document in dataset:
        sentiment = document[0]
        if sentiment == "Freshness":  # header row, not a review
            continue
        class_counts[sentiment] = class_counts.get(sentiment, 0) + 1

    # Normalise by the rows actually counted. Using len(dataset) would also
    # count the skipped header rows and the priors would not sum to 1.
    total_documents = sum(class_counts.values())
    return {
        sentiment: count / total_documents
        for sentiment, count in class_counts.items()
    }

In [8]:
# calculating the conditional probability of each word given the class.
def word_conditional_probability(dataset, vocabulary, word_occurrences):
    """Compute add-one smoothed P(word | class) for every vocabulary word.

    Args:
        dataset: Not used by this function; kept as part of the signature.
        vocabulary: Iterable of words to compute probabilities for.
        word_occurrences: Dict mapping each word to a dict with "fresh" and
            "rotten" counts.

    Returns:
        dict: ``{word: {"fresh": p, "rotten": p}}`` where each ``p`` is
        ``(count + 1)`` divided by the sum of ``(count + 1)`` over the whole
        vocabulary for that class.
    """
    labels = ("fresh", "rotten")

    # Per-class denominator: every vocabulary word's count plus one.
    smoothed_totals = {}
    for label in labels:
        running_total = 0
        for term in vocabulary:
            running_total += word_occurrences[term][label] + 1
        smoothed_totals[label] = running_total

    return {
        term: {
            label: (word_occurrences[term][label] + 1) / smoothed_totals[label]
            for label in labels
        }
        for term in vocabulary
    }

In [11]:
# here we make predictions according to the model we created.
def predict(document, vocabulary, class_probabilities, conditional_probabilities):
    """Classify one review text as 'fresh' or 'rotten'.

    Each class score starts at the log of its prior and gains the log of
    the word's conditional probability for every whitespace-separated token
    that is in the vocabulary. Tokens outside the vocabulary are ignored.

    Args:
        document: The review text (a string).
        vocabulary: Collection of known words.
        class_probabilities: Dict with 'fresh' and 'rotten' prior
            probabilities.
        conditional_probabilities: Dict mapping each vocabulary word to a
            dict with 'fresh' and 'rotten' probabilities.

    Returns:
        str: 'fresh' when its score is strictly greater than the 'rotten'
        score, otherwise 'rotten' (so ties resolve to 'rotten').
    """
    tokens = document.split()
    fresh_score = math.log(class_probabilities['fresh'])
    rotten_score = math.log(class_probabilities['rotten'])

    for token in tokens:
        if token not in vocabulary:
            continue
        likelihoods = conditional_probabilities[token]
        fresh_score = fresh_score + math.log(likelihoods['fresh'])
        rotten_score = rotten_score + math.log(likelihoods['rotten'])

    return 'fresh' if fresh_score > rotten_score else 'rotten'
   

In [17]:
# then lets evaluate our model.
def evaluate(dataset, vocabulary, class_probabilities, conditional_probabilities):
    """Measure classification accuracy on a labelled dataset.

    Each row is expected to hold the true label in column 0 and the review
    text in column 1. Rows whose label is the literal header text
    "Freshness" are skipped, matching how the header is ignored elsewhere
    in this notebook.

    Args:
        dataset: Iterable of ``[label, text, ...]`` rows.
        vocabulary: Collection of known words, forwarded to ``predict``.
        class_probabilities: Class priors, forwarded to ``predict``.
        conditional_probabilities: Per-word class likelihoods, forwarded to
            ``predict``.

    Returns:
        float: Fraction of scored rows whose predicted label equals the
        true label, or 0.0 when there are no rows to score.
    """
    correct_predictions = 0
    scored_documents = 0
    for document in dataset:
        actual_sentiment = document[0]
        if actual_sentiment == "Freshness":  # header row, not a review
            continue
        scored_documents += 1
        # The classifier defined in this notebook is `predict`; there is no
        # `predict_sentiment` function.
        predicted_sentiment = predict(
            document[1], vocabulary, class_probabilities, conditional_probabilities
        )
        if predicted_sentiment == actual_sentiment:
            correct_predictions += 1

    # Avoid ZeroDivisionError on an empty (or header-only) evaluation set.
    if scored_documents == 0:
        return 0.0
    return correct_predictions / scored_documents

In [18]:
# the main function
def main(filename='rt_reviews.csv', seed=None):
    """Train the Naive Bayes classifier and report its dev-set accuracy.

    Loads the reviews CSV, splits it 60% train with the rest held out,
    counts per-class document occurrences of a fixed ten-word vocabulary on
    the training rows, derives class priors and word likelihoods, prints
    both, and prints the accuracy on the dev set.

    Args:
        filename: Path of the reviews CSV. Rows are read as
            ``[label, text, ...]``. Defaults to 'rt_reviews.csv'.
        seed: Optional seed for the ``random`` module so the split is
            reproducible. When None (the default) the generator is left
            untouched.

    Raises:
        OSError: If ``filename`` cannot be opened (e.g. FileNotFoundError).
    """
    if seed is not None:
        random.seed(seed)

    # Load the dataset and split it into train, dev and test sets.
    dataset = LoadDataset(filename)
    # test_set is held out and intentionally not used below.
    train_set, dev_set, test_set = split_train_dev_eval(dataset, 0.6)

    # Hand-picked vocabulary of ten words used as the model's features.
    vocabulary = ["good", "bad", "enjoyable", "fresh", "clean", "fun", "dark", "black", "harsh", "low"]

    # For every vocabulary word, count the training reviews of each class
    # that contain it. Named occurrence_counts so it does not shadow the
    # module-level word_occurrences function.
    occurrence_counts = {}
    for word in vocabulary:
        occurrence_counts[word] = {'fresh': 0, 'rotten': 0}
        for document in train_set:
            sentiment = document[0]
            # Skip the "Freshness" header row and any other unexpected
            # label, which would otherwise raise KeyError below.
            if sentiment not in occurrence_counts[word]:
                continue
            if word in document[1].split():
                occurrence_counts[word][sentiment] += 1

    # Prior probability of each class.
    class_probabilities = ClassProbabilities(train_set)
    print("Class Probabilities:",class_probabilities)

    # Smoothed likelihood of each word given each class.
    conditional_probabilities = word_conditional_probability(train_set, vocabulary, occurrence_counts)
    print("Conditional Probabilities:",conditional_probabilities)

    # Evaluate the accuracy of the classifier on the development set.
    dev_accuracy = evaluate(dev_set, vocabulary, class_probabilities, conditional_probabilities)
    print("Accuracy:",dev_accuracy)
    

In [19]:
# Run the pipeline. main() opens 'rt_reviews.csv' by relative path, so the
# file must be in the working directory or the call raises FileNotFoundError.
main()


<class 'FileNotFoundError'>: [Errno 44] No such file or directory: 'rt_reviews.csv'