# Tutorial Class 3

This tutorial comprises the following sub-tasks:

#### 1. Using Naive Bayes to classify names as male and female
#### 2. Exploring the tweet tokenizer
#### 3. Exploring COCA

In [5]:
import nltk
from nltk.corpus import names

In [12]:
import random

In [9]:
# Download the NLTK 'names' corpus that `nltk.corpus.names` reads from.
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\dhing\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.


True

## Task 1

Classifying names as male or female using a Naive Bayes classifier. To do so, we use the last letter of each name as the only feature.

In [3]:
def gender_features(word):
    """Build the feature dictionary used to classify a name.

    Args:
        word: The name to featurise; it must contain at least one character,
            because its final element is read with ``word[-1]``.

    Returns:
        A one-entry dict mapping ``'last_letter'`` to the final character of
        ``word`` (case is left untouched).
    """
    final_char = word[-1]
    return dict(last_letter=final_char)

In [4]:
# Quick check of the feature extractor on a single name.
gender_features('Winston')

{'last_letter': 'n'}

In [10]:
# Build one list of (name, gender) pairs from the names corpus:
# every entry of male.txt first, followed by every entry of female.txt.
labeled_names = [
    (name, gender)
    for gender, filename in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(filename)
]

In [17]:
# Preview the first ten (name, gender) pairs.
labeled_names[:10]

[('Annaliese', 'female'),
 ('Diana', 'female'),
 ('Kynthia', 'female'),
 ('Eddy', 'male'),
 ('Brody', 'male'),
 ('Christin', 'female'),
 ('Valeria', 'female'),
 ('Tabina', 'female'),
 ('Marnie', 'female'),
 ('Ailyn', 'female')]

In [27]:
# Shuffle in place so that male and female names are interleaved before the
# list is sliced into training and test portions.
# A dedicated generator with a fixed seed makes the resulting order (and hence
# the split and the reported accuracy) identical on every Restart & Run All,
# without touching the global `random` state.
random.Random(42).shuffle(labeled_names)

In [28]:
# Preview the first ten pairs again, now in shuffled order.
labeled_names[:10]

[('Izak', 'male'),
 ('Jamima', 'female'),
 ('Meriel', 'female'),
 ('Mickey', 'male'),
 ('Susann', 'female'),
 ('Ikey', 'female'),
 ('Temp', 'male'),
 ('Rory', 'female'),
 ('Michael', 'male'),
 ('Phedra', 'female')]

In [29]:
# Turn every (name, gender) pair into a (feature dict, gender) pair,
# which is the input format the classifier is trained on.
featuresets = [
    (gender_features(person_name), label)
    for person_name, label in labeled_names
]

In [30]:
# Preview the first ten (features, gender) pairs.
featuresets[:10]

[({'last_letter': 'k'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'l'}, 'female'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'p'}, 'male'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'l'}, 'male'),
 ({'last_letter': 'a'}, 'female')]

In [31]:
# Total number of labelled examples available for training and testing.
len(featuresets)

7944

In [32]:
# Hold out the first 500 (shuffled) examples for evaluation and train on the rest.
# NOTE(review): `test_test` is probably a typo for `test_set`; the name is kept
# because the accuracy cell further down refers to it.
test_test = featuresets[:500]
train_set = featuresets[500:]

In [33]:
# Fit a Naive Bayes classifier on the training portion of the feature sets.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [34]:
# Spot check: predict the gender label for 'David' from its last-letter feature.
classifier.classify(gender_features('David'))

'male'

In [35]:
# Spot check: predict the gender label for 'Brinda' from its last-letter feature.
classifier.classify(gender_features('Brinda'))

'female'

In [36]:
# Evaluate on the 500 held-out examples (`test_test`), none of which were used for training.
nltk.classify.accuracy(classifier, test_test)

0.77

In [37]:
# Whether emojis should be retained during tokenization depends on the problem statement.

## Task 2

Exploring the tweet tokenizer

In [38]:
from nltk.tokenize import TweetTokenizer

In [41]:
# One tokenizer instance is shared by both examples.
twttkn = TweetTokenizer()

# Two sample tweets: identical except that the second ends with an emoji
# attached directly to the hashtag.
text = "The party was sooo fun :D #superfun"
text2 = "The party was sooo fun :D #superfun😍"

# Tokenize the emoji-free tweet; the emoticon and the hashtag each stay one token.
twttkn.tokenize(text)

['The', 'party', 'was', 'sooo', 'fun', ':D', '#superfun']

In [42]:
# Tokenize the variant whose hashtag is immediately followed by an emoji.
twttkn.tokenize(text2)

['The', 'party', 'was', 'sooo', 'fun', ':D', '#superfun', '😍']

In [44]:
# Concordance search: for whatever word you look up, you also get its context (20 words before and after).
# Prepositional-phrase attachment problem

## Task 3

Exploring COCA: The Corpus of Contemporary American English

Let us search for the word 'OBAMA'

![image.png](attachment:image.png)

##### Concordance search

![image.png](attachment:image.png)

Distribution of appearances

![image.png](attachment:image.png)