In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize
import string

In [2]:
SAMPLE_TEXT = "Does this thing really work? I guess it should. Lets see"

In [4]:
sent_tokenize(SAMPLE_TEXT)

['Does this thing really work?', 'I guess it should.', 'Lets see']

In [12]:
words = word_tokenize(SAMPLE_TEXT)

In [6]:
from nltk.corpus import stopwords

In [120]:
stop = stopwords.words('english')

In [121]:
punctuations = list(string.punctuation)
stop = stop + punctuations

In [27]:
clean_words = [w for w in words if not w.lower() in stop]

In [28]:
clean_words

['thing', 'really', 'work', 'guess', 'Lets', 'see']

In [35]:
stem_words = ["play", "player", "played", "playing", "happier"]
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed = [ps.stem(w) for w in stem_words]
stemmed

['play', 'player', 'play', 'play', 'happier']

In [36]:
ps.stem("painting")

'paint'

In [37]:
from nltk.corpus import state_union

In [38]:
text = state_union.raw("2006-GWBush.txt")

In [43]:
speech_words = word_tokenize(text.lower())

In [41]:
from nltk import pos_tag

In [45]:
pos = pos_tag(speech_words)
pos[0:10]

[('president', 'NN'),
 ('george', 'NN'),
 ('w.', 'VBD'),
 ('bush', 'NN'),
 ("'s", 'POS'),
 ('address', 'NN'),
 ('before', 'IN'),
 ('a', 'DT'),
 ('joint', 'JJ'),
 ('session', 'NN')]

POS tag list:

CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when

In [47]:
from nltk.stem import WordNetLemmatizer

In [48]:
lem = WordNetLemmatizer()

In [53]:
lem.lemmatize("painting", pos='n')

'painting'

In [51]:
lem.lemmatize("painting", pos='v')

'paint'

In [54]:
from sklearn import datasets

In [122]:
news = datasets.fetch_20newsgroups()

In [123]:
data = news.data
target = news.target

In [97]:
a = [1,2,3,4,5]
b = [1,2,3,4,5]

In [98]:
import random
random.seed(0)
random.shuffle(a)
a

[3, 2, 1, 5, 4]

In [99]:
random.seed(0)
random.shuffle(b)
b

[3, 2, 1, 5, 4]

In [82]:
# import random
# random.seed(0)
# random.shuffle(data)

In [85]:
# random.seed(0)
# random.shuffle(target)

In [124]:
documents = []
for i in range(len(data)):
    documents.append((data[i], target[i]))
random.shuffle(documents)

In [125]:
train = documents[:9000]
test = documents[9000:]

In [126]:
all_words = []
for doc, target in train:
    words = word_tokenize(doc)
    for w in words:
        if w.lower() not in stop:
            all_words.append(w.lower())

In [73]:
#word_tokenize(news.data[0])

In [127]:
len(all_words)

1838395

In [128]:
import nltk
dist = nltk.FreqDist(all_words)

In [129]:
vocab_with_frequency = dist.most_common(3000)
vocab = [i[0] for i in vocab_with_frequency]

In [130]:
def get_features(document):
    words = word_tokenize(document)
    features = {}
    for w in vocab:
        features[w] = (w in words)
    return features

In [131]:
training_data = [(get_features(d), t) for d, t in train]

In [132]:
testing_data = [(get_features(d), t) for d, t in test]

In [133]:
clf = nltk.NaiveBayesClassifier.train(training_data)

In [134]:
set(news.target)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

In [135]:
clf.show_most_informative_features(10)

Most Informative Features
                     car = True                7 : 5      =    160.2 : 1.0
              encryption = True               11 : 12     =    141.1 : 1.0
                    bike = True                8 : 9      =    130.1 : 1.0
                    team = True               10 : 7      =    129.4 : 1.0
                    sale = True                6 : 15     =    124.7 : 1.0
                    game = True               10 : 13     =    119.8 : 1.0
                     gun = True               16 : 9      =    117.9 : 1.0
                    chip = True               11 : 8      =    111.5 : 1.0
              government = True               11 : 13     =    103.2 : 1.0
                  hockey = True               10 : 9      =    100.2 : 1.0


In [139]:
nltk.classify.accuracy(clf, testing_data[0:1000])

0.621

In [137]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

In [138]:
sklearn_svc = SklearnClassifier(SVC())
sklearn_svc.train(training_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [140]:
nltk.classify.accuracy(sklearn_svc, testing_data)

0.19317199654278305

In [141]:
import pickle

In [142]:
file = open("naivebayes.pickle", "wb")
pickle.dump(clf, file)

In [143]:
classifier_file = open("naivebayes.pickle", "rb")
classifier_copy = pickle.load(classifier_file)

In [177]:
train_set = ["The sky sky is blue.", "The sun is bright",  "The sun is blue what!"]

In [178]:
type(train_set)

list

In [154]:
from sklearn.feature_extraction.text import CountVectorizer

In [179]:
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
a.todense()

matrix([[1, 1, 1],
        [0, 1, 1],
        [1, 1, 1]], dtype=int64)

In [171]:
clf = SVC()
clf.fit(a ,[0,1,0])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [176]:
count_vec.get_feature_names()

['blue', 'is', 'the']