In [2]:
import nltk

In [3]:
def trainModel(seed=None):
    """Train a Naive Bayes sentiment classifier from ``./dataset.csv`` and save it.

    Every row's "tweet" is converted to a bag-of-words feature dict by
    ``cleanText`` and paired with its "sentiment" label. The pairs are
    shuffled, split 85/15 into train and test sets, the classifier is fitted
    on the train part and scored on the held-out part, and the fitted model
    is pickled to ``model.pickle`` in the working directory.

    Args:
        seed: Optional seed for the shuffle. ``None`` (the default) keeps the
            split random on every call; pass a value to make the split, and
            therefore the reported accuracy, reproducible.

    Returns:
        The trained ``NaiveBayesClassifier``, so callers can use the model
        directly instead of re-reading ``model.pickle``.
    """
    import pickle
    from random import Random

    import pandas as pd
    from nltk.classify import NaiveBayesClassifier, accuracy

    # Read csv
    dataset = pd.read_csv("./dataset.csv")

    feature_set = [
        (cleanText(text), label)
        for text, label in zip(dataset["tweet"], dataset["sentiment"])
    ]

    # Shuffle before splitting so the test slice is not just the tail of the file.
    Random(seed).shuffle(feature_set)

    split_idx = int(len(feature_set) * 0.85)
    train_set = feature_set[:split_idx]
    test_set = feature_set[split_idx:]

    # make model w/ naive bayes
    classifier = NaiveBayesClassifier.train(train_set)

    # Show 5 most informative features and the accuracy on the held-out test set
    print("5 most informative features:")
    classifier.show_most_informative_features(5)

    accu = accuracy(classifier, test_set)
    print("Accuracy: ", accu)

    # save model; the context manager closes the file even if pickling fails
    with open("model.pickle", "wb") as file:
        pickle.dump(classifier, file)

    return classifier
    
    
def cleanText(text): # preprocessing
    """Convert raw text into the bag-of-words feature dict used by the classifier.

    The text is lowercased and tokenized; English stopwords and tokens that
    are not purely alphabetic (numbers, symbols, punctuation) are dropped;
    each remaining token is Porter-stemmed and then WordNet-lemmatized.

    Args:
        text: The raw sentence or tweet as a string.

    Returns:
        A dict mapping every surviving normalized token to ``True``, in order
        of first appearance.
    """
    from string import punctuation

    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    # A set gives constant-time membership tests; the stopword corpus is a
    # list, which would be scanned linearly for every token.
    stopWords_en = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    lemmer = WordNetLemmatizer()

    features = {}
    for word in word_tokenize(text.lower()):
        # remove stopwords, then symbols & numbers
        if word in stopWords_en or not word.isalpha():
            continue

        # stem & lemmatize
        word = lemmer.lemmatize(stemmer.stem(word))

        # Substring test against the punctuation string, kept from the
        # original pipeline; it also rejects an empty token.
        if word in punctuation:
            continue

        features[word] = True

    return features


# print(cleanText("I love to hide in the bushes at night, but in 2001 i got caught by my neighbors"))

# Train the classifier when this cell runs; trainModel() prints the most
# informative features and the accuracy, and writes model.pickle.
trainModel()

5 most informative features:
Most Informative Features
                    love = True           positi : negati =     13.3 : 1.0
                   delay = True           negati : positi =     12.7 : 1.0
                   thank = True           positi : negati =      9.9 : 1.0
                    amaz = True           positi : negati =      8.8 : 1.0
                  number = True           negati : positi =      8.6 : 1.0
Accuracy:  0.8571428571428571


In [7]:
# First, the application will check whether there is “model.pickle” file in the application
# directory or not.
def readModel():
    """Load the classifier from ``model.pickle``, training a new one if needed.

    If ``model.pickle`` can be opened and unpickled, the stored classifier is
    used and its five most informative features are printed. If the file is
    missing, unreadable, or not a valid pickle, a new model is trained with
    ``trainModel`` (which also writes ``model.pickle``).

    Returns:
        The loaded or newly trained classifier.
    """
    import pickle

    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only load a
        # model.pickle that this notebook produced itself.
        with open("model.pickle", "rb") as file:
            print("Model exists!")
            # If the file exists, then the application will read and load the data training from the file.
            print("Loading model..")
            classifier = pickle.load(file)
    except (OSError, EOFError, pickle.UnpicklingError):
        # Only load failures trigger retraining; other errors are no longer
        # hidden behind a bare except.
        print("no model, making new one!")
        # train new model with naive bayes using dataset.csv
        classifier = trainModel()
        if classifier is None:
            # trainModel persists the model to disk; read it back when the
            # classifier was not handed over as a return value.
            with open("model.pickle", "rb") as file:
                classifier = pickle.load(file)
        print("Training complete")
    else:
        print("Model info:", end = "")
        classifier.show_most_informative_features(5)

    return classifier

In [20]:
def enterRev():
    """Prompt repeatedly until the user enters a sentence of at least five words.

    After every prompt the whitespace-separated words are echoed. A sentence
    with fewer than five words prints "invalid" and asks again; otherwise
    "review added" is printed.

    Returns:
        The accepted sentence exactly as typed.
    """
    min_words = 5
    while True:
        sentence = input("Enter sentence (>=5 words)")
        tokens = sentence.split()
        print(tokens)

        if len(tokens) < min_words:
            print("invalid")
            continue

        print("review added")
        return sentence
    
# Manual check of the prompt loop: it keeps asking until the entered
# sentence has at least five words, then the cell shows the returned text.
enterRev()

[]
invalid
['a', 'a', 'a', 'a', 'a']
review added


'a a a a a '

In [49]:
def analyzeRev(review,classifier):
    """Print POS tags, synonyms/antonyms and the predicted category of a review.

    The review is analyzed only when it exactly matches an entry in the
    "tweet" column of ``./dataset.csv``; otherwise a message is printed and
    the function returns without further output.

    Args:
        review: The raw review text.
        classifier: A trained classifier exposing ``classify(featureset)``,
            such as the one returned by ``readModel``.

    Returns:
        None. All results are printed.
    """
    import pandas as pd
    from nltk.corpus import wordnet
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    dataset = pd.read_csv("./dataset.csv")

    # check if rev in dataset
    if review not in dataset["tweet"].values:
        print("Review is not in the dataset, going back...")
        return

    print("Review is found.")

    # semi clean text: unique alphabetic tokens, in order of first appearance.
    # A plain list (rather than a FreqDist) is what pos_tag expects.
    words = list(dict.fromkeys(
        word for word in word_tokenize(review.lower()) if word.isalpha()
    ))

    # do POS tagging
    for i, (word, tag) in enumerate(pos_tag(words), start=1):
        print(f"{i}. {word} : {tag}")

    # show synonym & antonym
    for word in words:
        print("="* 10)
        print(f"{word} =")
        print("="* 10)

        Syn = []
        Ant = []
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                Syn.append(lemma.name())
                Ant.extend(anto.name() for anto in lemma.antonyms())

        # Only the first five of each are shown.
        print("Synonyms: ")
        if Syn:
            print("".join(f"(+){synos}  " for synos in Syn[:5]))
        else:
            print("no synos")

        print("Antonyms: ")
        if Ant:
            print("".join(f"(-){antos}  " for antos in Ant[:5]))
        else:
            print("no Anto")

    # Predict and show the result of the tweet category.
    # The features must be built exactly as at training time (cleanText:
    # lowercased, stopwords removed, stemmed, {word: True}); raw-case,
    # unstemmed token counts would mostly miss the features the model learned.
    result = classifier.classify(cleanText(review))

    print(f"Your Review: {review}")
    print(f"Review Category: {result}")

In [53]:
if __name__ == "__main__":
    # Interactive menu: load (or train) the model once, then let the user
    # enter a review, analyze the current review, or exit.
    classifier = readModel()

    review = ""
    while True:
        print("===Title===")
        print("Curr rev =", review if review else "No reviews")
        print("1. write  rev")
        print("2. analyze rev")
        print("3. exit")

        # Non-numeric input (including an empty line) makes int() raise
        # ValueError; treat it as a wrong choice instead of crashing the loop.
        try:
            choice = int(input("Enter input [1-3]"))
        except ValueError:
            print("wrong input")
            continue

        if choice == 1:
            review = enterRev()
        elif choice == 2:
            analyzeRev(review,classifier)
        elif choice == 3:
            print("exiting..")
            break
        else:
            print("wrong input")

Model exists!
Loading model..
Model info:Most Informative Features
                    love = True           positi : negati =     14.6 : 1.0
                    call = True           negati : positi =     14.0 : 1.0
                    amaz = True           positi : negati =     10.5 : 1.0
                   thank = True           positi : negati =     10.0 : 1.0
                    site = True           negati : positi =      7.5 : 1.0
===Title===
Curr rev = No reviews
1. write  rev
2. analyze rev
3. exit
['@VirginAmerica', 'your', 'website', 'sucks', 'donkey', 'dicks.', 'Just', 'thought', 'you', 'should', 'know.', 'All', 'best.']
review added
===Title===
Curr rev = @VirginAmerica your website sucks donkey dicks. Just thought you should know. All best.
1. write  rev
2. analyze rev
3. exit
Review is found.
1. virginamerica : VB
2. your : PRP$
3. website : NN
4. sucks : NNS
5. donkey : JJ
6. dicks : NNS
7. just : RB
8. thought : VBN
9. you : PRP
10. should : MD
11. know : VB
12. all : 