In [9]:
import nltk
import os

In [10]:
import csv
import pickle
import random

from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.corpus import stopwords, wordnet
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [11]:
# Check for file

if os.path.exists("./model.pickle"):
    #if it exists,load the data
    model = open('./model.pickle','rb')
    classifier = pickle.load(model)
    model.close()
    
    #then, display 5 most informative features
    classifier.show_most_informative_features(5)
else:
    # If it does not exist, Train a model
    
    
    # 1. Load the dataset from the csv file
    dataset = []

    with open('./dataset.csv', encoding= "utf-8-sig") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            dataset.append((row['text'],row['label']))
    #                           ^  its the header of the file
    
    # 2. Preprocess the data
        # Tokenize the data
    from nltk.tokenize import word_tokenize
        
    list_words = []

    for sentence, label in dataset:
        words = word_tokenize(sentence)
        list_words.extend(words)
        
        # Remove stopwords, symbol & numbers
    from nltk.corpus import stopwords

    def clean_text(input):
        # make lower
        input = [word.lower() for word in input]
        # remove symbols
        input = [word for word in input if word.isalpha()]
        # remove stopwords
        stop_words = set(stopwords.words('english'))
        input = [word for word in input if not word in stop_words]
        # remove numbers
        input = [word for word in input if not word.isdigit()] ##! isnumeric() can also work
        # remove punctuation
        from string import punctuation
        input = [word for word in input if not word in punctuation]
        return input
    
        #clean data w/ func
    list_words = clean_text(list_words)
        
        # lemmatize / stem the data
        
        # lemmatize:
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
        
    def lemmatize(input):
        lemmas = [lemmatizer.lemmatize(word) for word in input]
        return lemmas
    
    list_words = lemmatize(list_words)
    
    
    # Model training (?)
        # A. use nltk.FreqDist to get the most common words
    from nltk.probability import  FreqDist
    fd = FreqDist(list_words)
    # use the 1000 most common words as features
    list_words = [word for word, count in fd.most_common(1000)]
    
        # B. Create a feature set
    processed_dataset = []
    for sentence, label in dataset:
        words = word_tokenize(sentence)
        words = clean_text(words)
        words = lemmatize(words)
        dict_features = {}
        for feature in list_words:
            dict_features[feature] = (feature in words)
        processed_dataset.append((dict_features,label))
        
        # C. Split the dataset into training and testing
    import random
    
    random.shuffle(processed_dataset)
    split_index = int(len(processed_dataset)* 0.7)
    training_data = processed_dataset[:split_index]
    testing_data = processed_dataset[split_index:]
    
        # D. Train the model
    from nltk.classify import NaiveBayesClassifier, accuracy
    classifier = NaiveBayesClassifier.train(training_data)
        
    
    
        # 3. Classify reviews as either Positive or Negative
    test_input = "Absolutely blown away by the food here! Service was top-notch, and the ambiance made it feel like I was dining in Italy. I will definitely be coming back!"
    test_input_token = word_tokenize(test_input)
    test_input_token = clean_text(test_input_token)
    test_input_token = lemmatize(test_input_token)
    
    test_input_result = classifier.classify(FreqDist(test_input_token))
    print(test_input_result)
    
        # 4.  Show the 5 most informative features and the training accuracy
    classifier.show_most_informative_features(5)    
    print(accuracy(classifier,testing_data))
    
        # 5. Save the model as model.pickle
    model = open('./model.pickle','wb')
    pickle.dump(classifier,model)
    model.close()    

negative
Most Informative Features
                terrible = True           negati : positi =     12.1 : 1.0
               excellent = True           positi : negati =      8.8 : 1.0
                 perfect = True           positi : negati =      8.8 : 1.0
                horrible = True           negati : positi =      8.5 : 1.0
                   loved = True           positi : negati =      8.2 : 1.0
0.7272727272727273


In [12]:
# ##! check how many in fd

# print(len(fd))

# ##! how to see all of fd

# for word, frequency in fd.most_common():
#     print(f'{word}: {frequency}')

In [13]:
# print (list_words)

# 2. Application Menu

In [14]:
# Make a menu with 3 options

review = ""

while True:
    print("1. Enter a review")
    print("2. Analyze Your Review")
    print("3. Exit")
    
    ##! enable for debugging
    print("Curr stored input:" + review)
    
    choice = input("Enter your choice: ")

    
    if choice == '1':
        review = input("Enter your review: ")
        
        print("Sentence input successful")
    elif choice == '2':
        # If no review has been entered, return to the menu
        if review == "":
            print("No review has been entered. Please enter a review first.")
            continue
        # Preprocess the review
        else:
            # Make a copy of the review for analysis
            review_step2 = review
            
            # Remove unwanted characters
            review_step2 = word_tokenize(review_step2)
            # Remove symbols & numbers
            review_step2 = [word for word in review_step2 if word.isalpha()]
            review_step2 = [word for word in review_step2 if not word.isdigit()]
            
            # Display POS tagging
            from nltk.tag import pos_tag
            tagged_review = pos_tag(review_step2)
            print("Tagged review:", tagged_review)
        
            # Display synonyms and antonyms for each word
            from nltk.corpus import wordnet
            
            for word in review_step2:
                synonyms = []
                antonyms = []
                for syn in wordnet.synsets(word):
                    for lemma in syn.lemmas():
                        synonyms.append(lemma.name())
                        if lemma.antonyms():
                            antonyms.append(lemma.antonyms()[0].name())
                print(word + ":")
                print("Synonyms:", set(synonyms) if synonyms else "-")
                print("Antonyms:", set(antonyms) if antonyms else "-")
            
            # Do sentiment analysis
            review_step2 = clean_text(review_step2)
            review_step2 = lemmatize(review_step2)
            
            result = classifier.classify(FreqDist(review_step2))
            print("The review is marked as", result)
            
    elif choice == '3':
        print("Exiting program...")
        break
    else:
        print("Invalid choice. Please try again.")

1. Enter a review
2. Analyze Your Review
3. Exit
Curr stored input:
Sentence input successful
1. Enter a review
2. Analyze Your Review
3. Exit
Curr stored input:you hate to see it burn
Tagged review: [('you', 'PRP'), ('hate', 'VBP'), ('to', 'TO'), ('see', 'VB'), ('it', 'PRP'), ('burn', 'VB')]
you:
Synonyms: -
Antonyms: -
hate:
Synonyms: {'hate', 'detest', 'hatred'}
Antonyms: {'love'}
to:
Synonyms: -
Antonyms: -
see:
Synonyms: {'insure', 'image', 'watch', 'construe', 'attend', 'go_steady', 'escort', 'date', 'realise', 'learn', 'view', 'get_word', 'regard', 'catch', 'determine', 'go_through', 'consider', 'get_a_line', 'take_in', 'visualize', 'visualise', 'pick_up', 'go_out', 'come_across', 'project', 'get_wind', 'see', 'find_out', 'understand', 'discover', 'interpret', 'control', 'ensure', 'check', 'visit', 'encounter', 'run_into', 'reckon', 'picture', 'realize', 'look', 'experience', 'meet', 'examine', 'figure', 'hear', 'ascertain', 'envision', 'see_to_it', 'take_care', 'witness', 'find