## Simple Text Classification

In [1]:
import nltk

In [2]:
def gender_features(word):
    return {'last_letter':word[-1]}

In [3]:
gender_features('Chandler')

{'last_letter': 'r'}

In [4]:
from nltk.corpus import names
len(names.words())

7944

In [7]:
type(names)

nltk.corpus.reader.wordlist.WordListCorpusReader

In [9]:
names.words()

['Abagael',
 'Abagail',
 'Abbe',
 'Abbey',
 'Abbi',
 'Abbie',
 'Abby',
 'Abigael',
 'Abigail',
 'Abigale',
 'Abra',
 'Acacia',
 'Ada',
 'Adah',
 'Adaline',
 'Adara',
 'Addie',
 'Addis',
 'Adel',
 'Adela',
 'Adelaide',
 'Adele',
 'Adelice',
 'Adelina',
 'Adelind',
 'Adeline',
 'Adella',
 'Adelle',
 'Adena',
 'Adey',
 'Adi',
 'Adiana',
 'Adina',
 'Adora',
 'Adore',
 'Adoree',
 'Adorne',
 'Adrea',
 'Adria',
 'Adriaens',
 'Adrian',
 'Adriana',
 'Adriane',
 'Adrianna',
 'Adrianne',
 'Adrien',
 'Adriena',
 'Adrienne',
 'Aeriel',
 'Aeriela',
 'Aeriell',
 'Ag',
 'Agace',
 'Agata',
 'Agatha',
 'Agathe',
 'Aggi',
 'Aggie',
 'Aggy',
 'Agna',
 'Agnella',
 'Agnes',
 'Agnese',
 'Agnesse',
 'Agneta',
 'Agnola',
 'Agretha',
 'Aida',
 'Aidan',
 'Aigneis',
 'Aila',
 'Aile',
 'Ailee',
 'Aileen',
 'Ailene',
 'Ailey',
 'Aili',
 'Ailina',
 'Ailyn',
 'Aime',
 'Aimee',
 'Aimil',
 'Aina',
 'Aindrea',
 'Ainslee',
 'Ainsley',
 'Ainslie',
 'Ajay',
 'Alaine',
 'Alameda',
 'Alana',
 'Alanah',
 'Alane',
 'Alanna',
 

In [10]:
labelled_names = [(name, "male") for name in names.words("male.txt")] + [(name, "female") for name in names.words("female.txt")]

In [11]:
import random
random.shuffle(labelled_names)

In [12]:
labelled_names

[('Berta', 'female'),
 ('Benito', 'male'),
 ('Tobi', 'female'),
 ('Darsie', 'female'),
 ('Solomon', 'male'),
 ('Sumner', 'male'),
 ('Helaine', 'female'),
 ('Maud', 'female'),
 ('Carena', 'female'),
 ('Dasya', 'female'),
 ('Storey', 'female'),
 ('Conroy', 'male'),
 ('Ransell', 'male'),
 ('Teddy', 'male'),
 ('Maritsa', 'female'),
 ('Silvano', 'male'),
 ('Murielle', 'female'),
 ('Delia', 'female'),
 ('Gigi', 'female'),
 ('Madona', 'female'),
 ('Clarey', 'female'),
 ('Dacy', 'female'),
 ('Shimon', 'male'),
 ('Silvester', 'male'),
 ('John-David', 'male'),
 ('Nannie', 'female'),
 ('Burgess', 'male'),
 ('Dov', 'male'),
 ('Giacomo', 'male'),
 ('Geoff', 'male'),
 ('Lindie', 'female'),
 ('Aggi', 'female'),
 ('Vanny', 'female'),
 ('Arvin', 'male'),
 ('Sascha', 'female'),
 ('Alvin', 'male'),
 ('Eleanora', 'female'),
 ('Stig', 'male'),
 ('Clarette', 'female'),
 ('Britt', 'female'),
 ('Lina', 'female'),
 ('Vin', 'female'),
 ('Dalenna', 'female'),
 ('Staci', 'female'),
 ('Desaree', 'female'),
 ('Elfi

In [21]:
featuresset = [(gender_features(name), gender) for (name, gender) in labelled_names]
train_set, test_set = featuresset[:6000], featuresset[6001:]

In [22]:
clf = nltk.NaiveBayesClassifier.train(train_set)

In [23]:
clf.classify(gender_features("Anurag"))

'male'

In [25]:
clf.classify(gender_features("Monica"))

'female'

In [26]:
clf.classify(gender_features("Rachel"))

'male'

In [27]:
nltk.classify.accuracy(clf, test_set)

0.7601646937725167

## Count Vectorizer

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
vect = CountVectorizer(binary=True)
corpus = ["Tessaract is good optical character recognition engine  ", "optical character recognition is significant "]

In [30]:
vect.fit(corpus)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [31]:
vocab = vect.vocabulary_

In [33]:
for key in sorted(vocab.keys()):
    print(key," : ",vocab[key])

character  :  0
engine  :  1
good  :  2
is  :  3
optical  :  4
recognition  :  5
significant  :  6
tessaract  :  7


In [34]:
vect.transform(["This is a good optical illusion"]).toarray()

array([[0, 0, 1, 1, 1, 0, 0, 0]], dtype=int64)

In [38]:
# Finding Similarity Between Documents
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vect.transform([corpus[0]]).toarray(), vect.transform([corpus[1]]).toarray())
print(similarity)

[[0.6761234]]


In [39]:
corpus[0]

'Tessaract is good optical character recognition engine  '

In [40]:
corpus[1]

'optical character recognition is significant '