# Tutorial Class 3

This tutorial comprises the following sub-tasks:

#### 1. Using Naive Bayes to classify names as male and female
#### 2. Exploring the tweet tokenizer
#### 3. Exploring COCA

In [1]:
import nltk
from nltk.corpus import names

In [2]:
import random

In [9]:
# Download the 'names' corpus package into the local nltk_data directory so
# that `names.words(...)` can read the male/female name lists used below.
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\dhing\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.


True

## Task 1

Classifying names as male or female using a Naive Bayes classifier. To do so, we look at the last letter of each name.

In [3]:
def gender_features(word):
    """Build the feature dict used to classify a name by gender.

    Args:
        word: The name, as a string.

    Returns:
        A one-entry dict ``{'last_letter': c}`` where ``c`` is the final
        character of ``word``. An empty ``word`` yields
        ``{'last_letter': ''}`` instead of raising ``IndexError``.
    """
    # Slicing (rather than indexing with [-1]) tolerates the empty string.
    return {'last_letter': word[-1:]}

In [4]:
# Quick check of the feature extractor on a single name.
gender_features('Winston')

{'last_letter': 'n'}

In [5]:
# Pair every corpus name with its gender label. All 'male' entries come
# first, followed by all 'female' entries.
labeled_names = [(name, 'male') for name in names.words('male.txt')]
labeled_names += [(name, 'female') for name in names.words('female.txt')]

In [6]:
# Peek at the first ten (name, label) pairs before shuffling.
labeled_names[:10]

[('Aamir', 'male'),
 ('Aaron', 'male'),
 ('Abbey', 'male'),
 ('Abbie', 'male'),
 ('Abbot', 'male'),
 ('Abbott', 'male'),
 ('Abby', 'male'),
 ('Abdel', 'male'),
 ('Abdul', 'male'),
 ('Abdulkarim', 'male')]

In [7]:
# Shuffle in place so that the slice-based train/test split further down is
# not ordered by gender (the list is built with all male names first).
# A dedicated, seeded generator makes the resulting order -- and therefore the
# split and the reported accuracy -- identical on every fresh
# "Restart Kernel -> Run All"; the seed value itself is arbitrary.
# Note: the cell still mutates the list, so re-running it on its own
# reshuffles the already-shuffled list.
random.Random(42).shuffle(labeled_names)

In [8]:
# Same slice again, now taken from the shuffled list.
labeled_names[:10]

[('Dotty', 'female'),
 ('Hadrian', 'male'),
 ('Dane', 'male'),
 ('Yvonne', 'female'),
 ('Israel', 'male'),
 ('Elisabet', 'female'),
 ('Hyman', 'male'),
 ('Peyton', 'male'),
 ('Tonnie', 'male'),
 ('Sting', 'male')]

In [9]:
# Convert each (name, label) pair into a (feature dict, label) pair; the
# train and test sets are later sliced from this list.
featuresets = [
    (gender_features(name), label)
    for name, label in labeled_names
]

In [10]:
# Peek at the first ten (feature dict, label) pairs.
featuresets[:10]

[({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'l'}, 'male'),
 ({'last_letter': 't'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'g'}, 'male')]

In [11]:
# Total number of labelled examples available for training and testing.
len(featuresets)

7944

In [12]:
# Hold out the first 500 examples for evaluation and train on the remainder.
# NOTE(review): `test_test` looks like a typo for `test_set`; the name is kept
# because the accuracy cell further down refers to it.
test_test = featuresets[:500]
train_set = featuresets[500:]

In [13]:
# Train a Naive Bayes classifier on the training portion of the feature sets.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [14]:
# Predict a label for a single name from its last-letter feature.
classifier.classify(gender_features('David'))

'male'

In [15]:
# Second spot check, this time with a name ending in 'a'.
classifier.classify(gender_features('Brinda'))

'female'

In [16]:
# Accuracy of the classifier on the 500 held-out examples
# (`test_test` is the held-out slice created when splitting `featuresets`).
nltk.classify.accuracy(classifier, test_test)

0.748

In [17]:
# Whether emojis should be retained depends on the problem statement.

## Task 2

Exploring the tweet tokenizer

In [18]:
from nltk.tokenize import TweetTokenizer

In [19]:
# Tokenizer with default settings; it is reused for both sample tweets.
twttkn = TweetTokenizer()

# Two sample tweets: a plain one, and the same text with an emoji appended
# directly to the hashtag (tokenized in the next cell).
text = "The party was sooo fun :D #superfun"
text2 = "The party was sooo fun :D #superfun😍"

# The bare last expression displays the tokens of the first tweet.
twttkn.tokenize(text)

['The', 'party', 'was', 'sooo', 'fun', ':D', '#superfun']

In [20]:
# Tokenize the variant whose hashtag is immediately followed by an emoji.
twttkn.tokenize(text2)

['The', 'party', 'was', 'sooo', 'fun', ':D', '#superfun', '😍']

In [21]:
# Concordance search: whatever word you look for, you also get its context (20 words before and after)
# Prepositional-phrase attachment problem

## Task 3

Exploring COCA: The Corpus of Contemporary American English

### 3.1 Exploring COCA

Let us search for the word 'OBAMA'

![image.png](attachment:image.png)

##### Concordance search

![image.png](attachment:image.png)

### 3.2 Finding the frequency count of the word

![image.png](attachment:image.png)

### 3.3 Chart: Word frequency, section, subsection, year

Distribution of appearance

![image.png](attachment:image.png)

### 3.4 Collocates (display three words on the left and right)

![image.png](attachment:image.png)

![image.png](attachment:image.png)