In [1]:
import nltk, pandas as pd, pickle
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier,accuracy
from nltk.corpus import stopwords, wordnet
from string import punctuation
from random import shuffle

In [2]:
stems = PorterStemmer()
lemms = WordNetLemmatizer()
stopW = stopwords.words("english")

In [3]:
def preproc(words):
    tokens = word_tokenize(words)
    words = [word.lower() for word in tokens if word.isalpha()]
    words = [word for word in words if word not in stopW]
    words = [stems.stem(word) for word in words]
    words = [lemms.lemmatize(word) for word in words]
    
    return words

In [4]:
ds = pd.read_csv("./dataset.csv",encoding= "utf-8-sig")

all_w = []
for sent in ds["tweet"]:
    for word in preproc(sent):
        all_w.append(word)

In [5]:
fd = FreqDist(all_w)
commonW = {word for  word in fd.most_common(1000)}

In [6]:
feature_set = []

for lbl,cat in zip(ds["tweet"],ds["sentiment"]):
    feature = {}
    
    procd = preproc(lbl)
    
    for word in procd:
        feature[word] = (word in commonW)
        
    feature_set.append((feature,cat))

In [7]:
shuffle(feature_set)

split_idx = int(len(feature_set)*0.7)
trains = feature_set[:split_idx]
tests = feature_set[split_idx:]

In [8]:
clf = NaiveBayesClassifier.train(trains)
clf.show_most_informative_features(5)

Most Informative Features
                    hour = False          negati : positi =     11.5 : 1.0
                    love = False          positi : negati =     11.2 : 1.0
                   thank = False          positi : negati =     10.1 : 1.0
                  luggag = False          negati : positi =      8.7 : 1.0
                   still = False          negati : positi =      8.7 : 1.0


In [17]:
print(accuracy(clf,tests))

0.8380952380952381


In [9]:
file = open("./modeltesst.pickle","wb")
pickle.dump(clf,file)
file.close()

In [10]:
def synoanto(words):
    SYN = []
    ANT = []
    
    for syns in wordnet.synsets(words):
        for lemmas in syns.lemmas():
            SYN.append(lemmas.name())
            for ants in lemmas.antonyms():
                ANT.append(ants.name())

    return SYN,ANT

In [11]:
rev = "you are late"

tok_rev = word_tokenize(rev)

for word in tok_rev:
    syn,ant = synoanto(word)
    
    print(word)
    print(f"{", ".join(syn) if syn else "-"}")
    print(f"{", ".join(ant) if ant else "-"}")
    print("="*10)

you
-
-
are
are, ar, be, be, be, exist, be, be, equal, be, constitute, represent, make_up, comprise, be, be, follow, embody, be, personify, be, be, live, be, cost, be
differ
late
late, belated, late, tardy, late, recent, late, late, late, later, former, late, previous, late, belatedly, tardily, deep, late, late, recently, late, lately, of_late, latterly
early, middle, early, middle, early, early


In [12]:
def tagger(word):
    tokens = word_tokenize(word)
    tags = pos_tag(tokens)
    return tags

In [13]:
rev = "I will come in the evening"

tag_rev = tagger(rev)

for i,word in enumerate(tag_rev):
    print(f"{i+1}. {word[0]} --> {word[1]}")

1. I --> PRP
2. will --> MD
3. come --> VB
4. in --> IN
5. the --> DT
6. evening --> NN


In [14]:
def analyze(words):
    file = open("./modeltesst.pickle","rb")
    classer = pickle.load(file)
    file.close()
    
    clean = preproc(words)
    
    res = classer.classify(FreqDist(clean))
    print(res)
    
rev = "The plane is late and the flight attendant was rude. It's terrible"
analyze(rev)

negative


In [16]:
def inputTxt():
    rev = input("Enter word")
    
    if(len(rev.split())>=5):
        print("valid")
        return rev
    else:
        print("invalid")
        return rev
    
inputs = inputTxt()

valid
