'Bag of Words' Classifier: an example of Classical NLP

In [None]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from itertools import chain
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Word lists for the synthetic corpus. Sentences are generated from these as
# "<adjective> <noun> <verb>", and the verb alone determines the label.
adjectives  = ['cruel', 'strong', 'weak', 'smart', 'happy', 'cowardly', 'brave', 'stupid', 'heroic', 'loving']
nouns =['professor', 'doctor', 'politician', 'fireman', 'policeman', 'scientist', 'lawyer', 'physicist', 'nurse', 'mathematician']
# Every verb must appear exactly once: a repeated verb would generate the same
# sentence twice, skewing the class balance and letting identical samples end
# up on both sides of the train/test split.
verbs = ['wins', 'loses', 'lies', 'saves', 'flees', 'kills', 'cries', 'innovates', 'creates', 'believes']


# Sentiment lexicon over `verbs`: a sentence is labelled 'pos' when its verb is
# in `positive`; the remaining verbs are listed in `negatives`.
positive = ['wins', 'saves','innovates', 'creates', 'believes' ]
negatives = ['loses', 'lies', 'flees', 'kills','cries']

In [None]:
# Generate one "<adjective> <noun> <verb>" sentence per combination (verbs
# vary fastest) together with its label: 'pos' if the verb is listed in
# `positive`, otherwise 'neg'.
corpus = [
    (' '.join((adjective, noun, verb)), 'pos' if verb in positive else 'neg')
    for adjective in adjectives
    for noun in nouns
    for verb in verbs
]

# Split the pairs into two parallel lists, as expected by train_test_split.
sentences = [text for text, _label in corpus]
labels = [label for _text, label in corpus]


In [None]:
# Hold out 10% of the sentences for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Re-pair each sentence with its label as [sentence, label] rows, one list for
# the training split and one for the held-out split.
train = [[sentence, label] for sentence, label in zip(X_train, y_train)]
test = [[sentence, label] for sentence, label in zip(X_test, y_test)]

In [None]:
from nltk.tokenize import word_tokenize # or use some other tokenizer

# Tokenise each training sentence exactly once and keep its lower-cased tokens
# as a set. This avoids re-running the tokenizer for every vocabulary word and
# makes the membership test use the same casing as the vocabulary.
train_tokens = [
    set(word.lower() for word in word_tokenize(passage[0]))
    for passage in train
]

# Vocabulary: every distinct lower-cased token seen in the training sentences.
all_words = set().union(*train_tokens)

# Bag-of-words featuresets: for each training sentence, a dict mapping every
# vocabulary word to True/False (present/absent), paired with the label.
t = [
    ({word: (word in tokens) for word in all_words}, passage[1])
    for tokens, passage in zip(train_tokens, train)
]

In [None]:
# Fit NLTK's Naive Bayes classifier on the (featureset, label) pairs in `t`.
classifier = nltk.classify.NaiveBayesClassifier.train(t)

In [None]:
# Distinct tokens of the lower-cased training sentences; used later to
# featurise individual sentences for prediction.
vocabulary = set(
    chain.from_iterable(word_tokenize(row[0].lower()) for row in train)
)

In [None]:
# Tokenise each test sentence once, keeping its lower-cased tokens as a set.
test_tokens = [
    set(word.lower() for word in word_tokenize(x[0]))
    for x in test
]

# Distinct tokens of the test sentences. Kept for inspection only; it is
# deliberately NOT used as the feature space below.
all_words2 = set().union(*test_tokens)

# The classifier was trained on featuresets keyed by the training vocabulary
# (`all_words`), so the test featuresets must use exactly the same keys.
# Keying them by the test vocabulary would silently change which features the
# model gets to see at evaluation time.
t2 = [
    ({word: (word in tokens) for word in all_words}, x[1])
    for tokens, x in zip(test_tokens, test)
]

In [None]:
# Accuracy of the classifier on the held-out featuresets, as a percentage.
accuracy_pct = nltk.classify.accuracy(classifier, t2) * 100
print("Classifier accuracy:", accuracy_pct)

Classifier accuracy: 100.0


In [None]:
# Display the top 5 rows of the classifier's most-informative-features report.
classifier.show_most_informative_features(n=5)

Most Informative Features
                   saves = False             neg : pos    =      1.5 : 1.0
                   cries = False             pos : neg    =      1.3 : 1.0
                   kills = False             pos : neg    =      1.3 : 1.0
                   loses = False             pos : neg    =      1.3 : 1.0
                   flees = False             pos : neg    =      1.2 : 1.0


In [None]:
# Print the true and predicted label for (up to) the first ten test sentences.
# Slicing instead of range(10) avoids an IndexError on a smaller test set.
for test_sentence, true_label in zip(X_test[:10], y_test[:10]):
    # Tokenise once per sentence rather than once per vocabulary word.
    tokens = set(word_tokenize(test_sentence.lower()))
    # Same bag-of-words encoding as in training: vocabulary word -> present?
    featurized_test_sentence = {word: (word in tokens) for word in vocabulary}
    pred = classifier.classify(featurized_test_sentence)
    print('Sentence: '  + test_sentence)
    print('Correct label: ' + true_label)
    print('Predicted label: ' + pred)
    print(' ')


Sentence: weak mathematician creates
Correct label: pos
Predicted label: pos
 
Sentence: brave politician saves
Correct label: pos
Predicted label: pos
 
Sentence: smart physicist saves
Correct label: pos
Predicted label: pos
 
Sentence: stupid doctor cries
Correct label: neg
Predicted label: neg
 
Sentence: weak politician lies
Correct label: neg
Predicted label: neg
 
Sentence: brave mathematician wins
Correct label: pos
Predicted label: pos
 
Sentence: cruel nurse innovates
Correct label: pos
Predicted label: pos
 
Sentence: cowardly scientist wins
Correct label: pos
Predicted label: pos
 
Sentence: brave lawyer innovates
Correct label: pos
Predicted label: pos
 
Sentence: cruel mathematician lies
Correct label: neg
Predicted label: neg
 
