# Assignment 3: Classifying Gender of EastEnder Characters based on dialogue snippets
# Amaya Syed

In [1]:
import csv
import re
import os

import numpy as np
import pandas as pd
import random


from random import shuffle
from random import seed
from random import randrange
from collections import Counter

from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import DictVectorizer

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

import spacy
from spacy import displacy
import en_core_web_sm
named_entity = en_core_web_sm.load()

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Amaya\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


The aim of this assignment is to correctly classify the gender of characters from the series Eastenders based on dialogue snippets provided by the BBC.

There are two files avaibles, a training set file with 10113 dialogue instances and a testing set file with 1124 instances. Ine each the data is arranged in three columns, as such:

" 'Someone had fun.', 'SEAN', 'male' "
" 'It's no problem, honestly. Go on, go and open the launderette.  Leave it with me.', 'SHIRLEY', 'female' "

Leaving out the name of the character and we will endeavour to classify gender correctly by pinpointing properties of the dialogue that might be relevant. 

First we load the two files and keep them in the variables **training_set** and **testing_set**. 

In [2]:
# load data from a file and append it to the rawData

def loadData(path, Text=None):
    
    rawData = []
    
    with open(path, encoding='utf8') as f: # open file
        
        reader = csv.reader(f, delimiter=',')
        
        for line in reader: # each line corresponds to a review and its associated features
            
            speech, character, gender = line 
            
            rawData.append((speech, character, gender)) # keep the triple for all reviews
            
        return rawData
                       

In [395]:
training_set = loadData('training.csv')

In [594]:
# checking first line training set.
print(training_set[0])

('Someone had fun.', 'SEAN', 'male')


In [396]:
testing_set = loadData('test.csv')

In [263]:
# # checking first line testing set.
print(testing_set[0])

('Kicked you out?', 'STACEY', 'female')


We'll now check if the data is balanced.

In [10]:
data = pd.read_csv("training.csv", delimiter = ",", header = None)
data.head()

Unnamed: 0,0,1,2
0,Someone had fun.,SEAN,male
1,"It's no problem, honestly. Go on, go and open ...",SHIRLEY,female
2,Last night was better than ever. What's all th...,MAX,male
3,Have you checked the answerphone? Any calls?,IAN,male
4,Oscar's asleep.,MAX,male


In [11]:
data_female = data[data[2] == 'female']

In [12]:
print(len(data_female))

5032


In [13]:
data_male = data[data[2] == 'male']

In [14]:
print(len(data_male))

5081


The number of female/male dialogue instance is roughly equal, meaning that the expected accuracy of a non-trained classifier should be 50.5%. 

We can also quickly check if male/females have similar average word counts and average word lengths. 

In [15]:
print((data_female[0].str.count(' ') + 1).mean())

10.260930325414254


In [16]:
print((data_male[0].str.count(' ') + 1).mean())

10.691623864085342


In [17]:
print(data_female[0].str.len().mean())

50.9580754641645


In [18]:
print(data_male[0].str.len().mean())

53.36566574476492


The values are similar, males with slightly higher (4/5 %) average word count and length of word, but which doesnÂ´t seem like it would be significative.

# Baseline accuracy

As a baseline we will run the data through a simple pipeline, with minimal preprocessing and feature engineering, using a simple linear SVM model. We will split the training set into a 80/20 training and dev set. We won't be running cross-validation because the dataset is large enough it doesn't seem necessary. 

In [397]:
# b) TEXT PREPROCESSING AND FEATURE VECTORIZATION        

# Input: a string of one  review
def preProcess(text):
    
    order = 1
    # Simple tokenisation
    text = re.sub(r"(\w)([.,;:!?'\"\)])", r"\1 \2", text) # add a space between word and the punctuation in between square bracket
    text = re.sub(r"([.,;:!?'\"\(])(\w)", r"\1 \2", text) # add a space between punctuation and word
     
    tokens = re.split(r"\s+", text) # divide the text into tokens using white space as the divider
    tokens = [t.lower() for t in tokens] # pass all words to lower case
    #tokens = ['<s>'] * (order-1) + tokens + ['</s>'] # add beginning to pad for order > 2 and ending sequence for each review. 
    
    return tokens

In [398]:
print(preProcess("hello this is the, ehh... presumably, a crying situations!"))

['hello', 'this', 'is', 'the', ',', 'ehh', '...', 'presumably', ',', 'a', 'crying', 'situations', '!']


In [399]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    ""    
    ""
    token_frequency = {} # creating a local dictionary for token frequency in the review 

    for token in tokens: # for each word in the review
    
        if token not in token_frequency.keys(): 
            token_frequency[token] = 1  # if the word is not within the dict we add it as key = 1
        else:
            token_frequency[token] += 1  # if its already in the dic, add one
            
        if token not in featureDict.keys(): # same for global dictionary
             featureDict[token] = 1
        else:
            featureDict[token] += 1
        
    return token_frequency

In [400]:
def splitData(training_set, percentage):
    
    # A method to split the data between trainData and testData 
    
    dataSamples = len(training_set) #lenght of the dataset
    halfOfData = int(len(training_set)/2)
    trainingSamples = int((percentage*dataSamples)/2) # size of training set based on percentage chosen by user. 
    
    for (speech, _, gender) in training_set[:trainingSamples] + training_set[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(speech)), gender))
        
    for (speech, _, gender) in training_set[trainingSamples:halfOfData] + training_set[halfOfData+trainingSamples:]:
        devData.append((toFeatureVector(preProcess(speech)), gender))
        
    return trainData, devData

In [401]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

In [402]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([ ('svc', LinearSVC(C = 0.1, max_iter=6000))])
    return SklearnClassifier(pipeline).train(trainData)

In [403]:
# loading data

trainData = []    
devData = [] 

trainData, devData = splitData(training_set, 0.8)

In [404]:
# double checking I'm not using the gender label here. 
print(list(map(lambda t: t[0], devData[0:3])))

[{'alright': 1, '.': 2, 'i': 1, "'": 1, 'll': 1, 'try': 1, 'and': 1, 'get': 1, 'the': 1, 'time': 1, 'off': 1, 'work': 1}, {'what': 1, 'are': 1, 'you': 1, 'on': 1, 'about': 1, '?': 1}, {'you': 1, 'didn': 1, "'": 1, 't': 1, 'need': 1, 'to': 1, 'wait': 1, '.': 1}]


In [405]:
# print the number of training samples and the number of features after the split
print(training_set[0], trainData[1], devData[0])
print(len(training_set), len(trainData), len(devData), len(featureDict))

('Someone had fun.', 'SEAN', 'male') ({'it': 2, "'": 1, 's': 1, 'no': 1, 'problem': 1, ',': 2, 'honestly': 1, '.': 3, 'go': 2, 'on': 1, 'and': 1, 'open': 1, 'the': 1, 'launderette': 1, 'leave': 1, 'with': 1, 'me': 1}, 'female') ({'alright': 1, '.': 2, 'i': 1, "'": 1, 'll': 1, 'try': 1, 'and': 1, 'get': 1, 'the': 1, 'time': 1, 'off': 1, 'work': 1}, 'male')
10113 8090 2023 5859


In [406]:
classifier = trainClassifier(trainData)

Training Classifier...


In [407]:
# testing on dev set
results = []

testTrue = [t[1] for t in devData]
testPred = predictLabels(devData, classifier)
precision, recall, fscore, _ = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
results.append((precision, recall, fscore)) # append results obtained for each training set 
results = (np.mean(np.array(results), axis = 0)) # average the cv results for precision, recall and fscore
   # print(cv_results)
    
print("Done training!")
print("Precision %f\nRecall: %f\nF Score:%f" % (results[0], results[:1], results[2]))   


Done training!
Precision 0.570568
Recall: 0.570568
F Score:0.569950


In [408]:
full_trainData = []
testData = []


for (speech, _, gender) in training_set:
    full_trainData.append((toFeatureVector(preProcess(speech)), gender))

for (speech, _, gender) in testing_set:
    testData.append((toFeatureVector(preProcess(speech)), gender))

In [409]:
classifier = trainClassifier(full_trainData)

Training Classifier...


In [410]:
results = []

testTrue = [t[1] for t in testData]
testPred = predictLabels(testData, classifier)
precision, recall, fscore, _ = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
results.append((precision, recall, fscore)) # append results obtained for each training set 
results = (np.mean(np.array(results), axis = 0)) # average the cv results for precision, recall and fscore
   # print(cv_results)
    
print("Done training!")
print("Precision %f\nRecall: %f\nF Score:%f" % (results[0], results[:1], results[2]))   

Done training!
Precision 0.600344
Recall: 0.600344
F Score:0.599151


##################################

# Preprocessing and feature engineering

Now we've established a baseline accuracy, we will implement different preprocessing and feature engineering strategies to seek to better this accuracy score. For now we won't touch the classifier itself. 

Pre-processing strategies we will implement:

- lower case
- removal of white space
- removal of stopwords
- lemmatization

Feature engineering strategies we will implement:

- POS tagging
- Named Entity Recognition
- Polarity

In [413]:
# Input: a string of one  review
def preProcess(text, tagger=None):
    
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    
    # lowercasing
    tokens = [t.lower() for t in tokens]
    
    # stopword removal- benefits are it removes rare words, though bad for bigram relations
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop]
        
    # lemmatisation
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(t) for t in tokens]
         
        
    tokens = [t for t in tokens if t] # ensure no empty space
    
    # if there is no tagger, we just return the unmodified sentence. 
    
    return tokens

In [414]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):

    featureVec = {}

    for w in tokens:
        try:
            featureVec[w] += 1.0/len(tokens)
        except KeyError:
            featureVec[w] = 1.0/len(tokens)
        try:
            featureDict[w] += 1.0/len(tokens)
        except KeyError:
            featureDict[w] = 1.0/len(tokens)
            
    tagged_tokens = nltk.pos_tag(tokens)
    
    pos_tags = [t[1] for t in tagged_tokens]
    tag_set = set(pos_tags)
    tag_dict = {k : pos_tags.count(k)/len(pos_tags) for k in tag_set}
    
    featureVec.update(tag_dict)
    
    return featureVec

To add Named Entity Recognition features to the feature vector, run the 2nd SplitData and the NamedEntity function. If not run the 1st SplitData

In [111]:

def NamedEntity(text):
    doc = named_entity(text)
    named_ents = [(X.text, X.label_) for X in doc.ents]
   
    ents_tags = [t[1] for t in named_ents]
    ents_set = set(ents_tags)
    ents_dict = {k : ents_tags.count(k)/len(ents_tags) for k in ents_set}
    
    return ents_dict

### 1 - SplitData function if not using NER

In [415]:
def splitData(training_set, percentage):
    
    
    dataSamples = len(training_set) #lenght of the dataset
    halfOfData = int(len(training_set)/2)
    trainingSamples = int((percentage*dataSamples)/2) # size of training set based on percentage chosen by user. 
    
    for (speech, _, gender) in training_set[:trainingSamples] + training_set[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(speech)), gender))
        
    for (speech, _, gender) in training_set[trainingSamples:halfOfData] + training_set[halfOfData+trainingSamples:]:
        devData.append((toFeatureVector(preProcess(speech)), gender))
        
    return trainData, devData

### 2 - SplitData function if using NER

In [112]:
def splitData(training_set, percentage):
    

    dataSamples = len(training_set) #lenght of the dataset
    halfOfData = int(len(training_set)/2)
    trainingSamples = int((percentage*dataSamples)/2) # size of training set based on percentage chosen by user. 
    
    for (speech, _, gender) in training_set[:trainingSamples] + training_set[halfOfData:halfOfData+trainingSamples]:
        mydict = toFeatureVector(preProcess(speech))
        mydict.update(NamedEntity(speech))
        trainData.append((mydict,gender))
        
    for (speech, _, gender) in training_set[trainingSamples:halfOfData] + training_set[halfOfData+trainingSamples:]:
        mydict = toFeatureVector(preProcess(speech))
        mydict.update(NamedEntity(speech))
        devData.append((mydict,gender))
        
    return trainData, devData

If using Polarity, run 3rd SplitData and Polarity function.

In [331]:
def Polarity(text):
    
    POL = {}
    sid = SentimentIntensityAnalyzer()
    score = sid.polarity_scores(text)['compound']
   
    POL = {'POL': score}
    
    return POL

### 3 - SplitData for use with Polarity

In [332]:
def splitData(training_set, percentage):
    
    # A method to split the data between trainData and testData 

    dataSamples = len(training_set) #lenght of the dataset
    halfOfData = int(len(training_set)/2)
    trainingSamples = int((percentage*dataSamples)/2) # size of training set based on percentage chosen by user. 
    
    for (speech, _, gender) in training_set[:trainingSamples] + training_set[halfOfData:halfOfData+trainingSamples]:
        mydict = toFeatureVector(preProcess(speech))
        mydict.update(Polarity(speech))
        trainData.append((mydict,gender))
        
    for (speech, _, gender) in training_set[trainingSamples:halfOfData] + training_set[halfOfData+trainingSamples:]:
        mydict = toFeatureVector(preProcess(speech))
        mydict.update(Polarity(speech))
        devData.append((mydict,gender))
        
    return trainData, devData

#############################

In [416]:
# loading reviews
# initialize global lists that will be appended to by the methods below
#rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []        # the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
devData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)

#random.seed(4)
#shuffle(training_set)
trainData, devData = splitData(training_set, 0.8)

In [417]:
# We print the number of training samples and the number of features after the split
print(training_set[0], trainData[2], devData[0])
print(len(training_set), len(trainData), len(devData), len(featureDict))

('Someone had fun.', 'SEAN', 'male') ({'last': 0.16666666666666666, 'night': 0.16666666666666666, 'better': 0.16666666666666666, 'ever': 0.16666666666666666, 'anything': 0.16666666666666666, 'interesting': 0.16666666666666666, 'RB': 0.16666666666666666, 'RBR': 0.16666666666666666, 'JJ': 0.3333333333333333, 'NN': 0.3333333333333333}, 'male') ({'alright': 0.2, 'try': 0.2, 'get': 0.2, 'time': 0.2, 'work': 0.2, 'NN': 0.4, 'JJ': 0.2, 'VB': 0.4}, 'male')
10113 8090 2023 5428


In [418]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(C = 0.1, max_iter=6500))])
    return SklearnClassifier(pipeline).train(trainData)

In [419]:
classifier = trainClassifier(trainData)

Training Classifier...


In [420]:
# testing on dev set
results = []

testTrue = [t[1] for t in devData]
testPred = predictLabels(devData, classifier)
precision, recall, fscore, _ = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
results.append((precision, recall, fscore)) # append results obtained for each training set 
results = (np.mean(np.array(results), axis = 0)) # average the cv results for precision, recall and fscore
   # print(cv_results)
    
print("Done training!")
print("Precision %f\nRecall: %f\nF Score:%f" % (results[0], results[:1], results[2]))   


Done training!
Precision 0.563617
Recall: 0.563617
F Score:0.563482


Finally we will add tf-idf to the training pipeline.

In [421]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('tfidf', TfidfTransformer()), ('svc', LinearSVC(C = 0.1, max_iter=4500))])
    return SklearnClassifier(pipeline).train(trainData)

In [422]:
classifier = trainClassifier(trainData)

Training Classifier...


In [423]:
# testing on dev set
results = []

testTrue = [t[1] for t in devData]
testPred = predictLabels(devData, classifier)
precision, recall, fscore, _ = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
results.append((precision, recall, fscore)) # append results obtained for each training set 
results = (np.mean(np.array(results), axis = 0)) # average the cv results for precision, recall and fscore
   # print(cv_results)
    
print("Done training!")
print("Precision %f\nRecall: %f\nF Score:%f" % (results[0], results[:1], results[2]))


Done training!
Precision 0.569441
Recall: 0.569441
F Score:0.569443


In the following cell we indicate the accuracy scores for different combinations of preprocess and features. The highest score here is Stop Word Removal, with POS tags and TD-IDF. The second highest is Stop Word Removal, with POS tags, Polarity and TD-IDF.

**Dev set accuracy scores**

Lemmatization, POS Tag:
    

- Precision 0.536866
- Recall: 0.536866
- F Score:0.535487

Lemmatization, NER:

- Precision 0.537645
- Recall: 0.537645
- F Score:0.536910

SWR, NER:


- Precision 0.551256
- Recall: 0.551256
- F Score:0.549940
    
SWR, POS tag:


- Precision 0.563617
- Recall: 0.563617
- F Score:0.563482


SWR, Lemmatization, POS tag:


- Precision 0.551689
- Recall: 0.551689
- F Score:0.551656

SWR, Lemmatization, NER:

    
- Precision 0.542277
- Recall: 0.542277
- F Score:0.541064

SWR, Lemmatization, POS tag, NER:


- Precision 0.545692
- Recall: 0.545692
- F Score:0.545637

SWR, Lemmatization, POS tag, NER, TF-IDF:


- Precision 0.553608
- Recall: 0.553608
- F Score:0.553507

SWR, Lemmatization, POS tag, TF-IDF:


- Precision 0.561530
- Recall: 0.561530
- F Score:0.561387

SWR, POS tag, Polarity, TF-IDF:

- Precision 0.566971
- Recall: 0.566971
- F Score:0.566973

SWR, POS tag, TF-IDF:


- Precision 0.569441
- Recall: 0.569441
- F Score:0.569443


######################################

# Classifiers

We will build a pipeline and perform gridsearch using several classifiers and parameters to examine how this might affect our results.  We'll pass our feature dictionary to matricial form for ease of use and check the following classifiers:

- SVC
- SGD
- Random Forest

In [146]:
X_train = [t[0] for t in trainData]
Y_train = [t[1] for t in trainData]

In [147]:
Dict_Vectorizer = DictVectorizer()
X_train_mat = Dict_Vectorizer.fit_transform(X_train)

### SVC

parameters = [{'model__kernel': ['rbf'], 'model__gamma': [1e-3, 1e-4],
                     'model__C': [0.1, 1, 10, 100]},
              {'model__kernel': ['linear'], 'model__C': [0.1, 1, 10, 100]}
                    ]
    
pipeline =  Pipeline([('tfidf', TfidfTransformer()), ('model', SVC())])
    

grid_search = GridSearchCV(pipeline, cv=5, param_grid= parameters, scoring= 'roc_auc')
    
grid_search.fit(X_train_mat, Y_train)


In [160]:
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
grid_search.best_params_

Best score: 0.590
Best parameters set:


{'model__C': 1, 'model__kernel': 'linear'}

### SGD

In [180]:
parameters = [
            {'model__alpha': (1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0),
            'model__penalty': ('l2', 'elasticnet', 'l1')}]

pipeline =  Pipeline([('tfidf', TfidfTransformer()), ('model', SGDClassifier())])
    
grid_search = GridSearchCV(pipeline, cv=10, param_grid= parameters, scoring= 'roc_auc')
    
grid_search.fit(X_train_mat, Y_train)


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tfidf', TfidfTransformer()),
                                       ('model', SGDClassifier())]),
             param_grid=[{'model__alpha': (1e-07, 1e-06, 1e-05, 0.0001, 0.001,
                                           0.01, 0.1, 1.0),
                          'model__penalty': ('l2', 'elasticnet', 'l1')}],
             scoring='roc_auc')

In [181]:
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
grid_search.best_params_

Best score: 0.591
Best parameters set:


{'model__alpha': 0.0001, 'model__penalty': 'elasticnet'}

### Random Forest

In [225]:
random_grid = [{
 'model__max_depth': [2000],
 'model__min_samples_split': [100],
 'model__max_leaf_nodes': [None]
}]

pipeline = Pipeline([('tfidf',TfidfTransformer()), ('model', RandomForestClassifier())])

In [226]:
grid_search = GridSearchCV(pipeline, param_grid= random_grid, cv = 5, scoring= 'roc_auc')
grid_search.fit(X_train_mat, Y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfTransformer()),
                                       ('model', RandomForestClassifier())]),
             param_grid=[{'model__max_depth': [2000],
                          'model__max_leaf_nodes': [None],
                          'model__min_samples_split': [100]}],
             scoring='roc_auc')

In [227]:
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
grid_search.best_params_

Best score: 0.589
Best parameters set:


{'model__max_depth': 2000,
 'model__max_leaf_nodes': None,
 'model__min_samples_split': 100}

The results obtained using grid search and cross validation indicate that the choice between these classifiers does not change the accuracy outcome. Because of that we can use SGD which showed a fractionally higher accuracy score.

############################

# Testing the features and classifiers on the testing set

We will test the two highest scoring combination of pre-processing and features obtained on the development set on the full dataset and test on the testing set. We will use SGD as our classifier. 

- SWR, POS tag, Polarity, TF-IDF
- SWR, POS tag, TF-IDF

In [448]:
# Input: a string of one  review
def preProcess(text, tagger=None):
    
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    
    # lowercasing
    tokens = [t.lower() for t in tokens]
    
    # stopword removal- benefits are it removes rare words, though bad for bigram relations
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop]
         
        
    tokens = [t for t in tokens if t] # ensure no empty space
    
    return tokens

In [449]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):

    featureVec = {}

    for w in tokens:
        try:
            featureVec[w] += 1.0/len(tokens)
        except KeyError:
            featureVec[w] = 1.0/len(tokens)
        try:
            featureDict[w] += 1.0/len(tokens)
        except KeyError:
            featureDict[w] = 1.0/len(tokens)
            
    tagged_tokens = nltk.pos_tag(tokens)
    
    pos_tags = [t[1] for t in tagged_tokens]
    tag_set = set(pos_tags)
    tag_dict = {k : pos_tags.count(k)/len(pos_tags) for k in tag_set}
    
    featureVec.update(tag_dict)
    
    return featureVec

Here we split the data with no added polarity values. 

In [427]:
full_trainData = []
testData = []


for (speech, _, gender) in training_set:
    full_trainData.append((toFeatureVector(preProcess(speech)), gender))

for (speech, _, gender) in testing_set:
    testData.append((toFeatureVector(preProcess(speech)), gender))


Here we split the data if we want to use polarity values.

In [450]:
full_trainData = []
testData = []


for (speech, _, gender) in training_set:
    mydict = toFeatureVector(preProcess(speech))
    mydict.update(Polarity(speech))
    full_trainData.append((mydict, gender))

for (speech, _, gender) in testing_set:
    mydict = toFeatureVector(preProcess(speech))
    mydict.update(Polarity(speech))
    testData.append((mydict, gender))


In [451]:
print(full_trainData[2], testData[3])

({'last': 0.16666666666666666, 'night': 0.16666666666666666, 'better': 0.16666666666666666, 'ever': 0.16666666666666666, 'anything': 0.16666666666666666, 'interesting': 0.16666666666666666, 'RB': 0.16666666666666666, 'RBR': 0.16666666666666666, 'JJ': 0.3333333333333333, 'NN': 0.3333333333333333, 'POL': 0.7149}, 'male') ({'hang': 0.3333333333333333, 'pellets': 0.3333333333333333, 'cupboard': 0.3333333333333333, 'NNS': 0.3333333333333333, 'VBP': 0.3333333333333333, 'NN': 0.3333333333333333, 'POL': 0.0}, 'female')


In [494]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('tfidf', TfidfTransformer()), ('SDG', SGDClassifier(alpha= 0.0001, penalty= 'elasticnet', max_iter= 3000, random_state= 4))])
    return SklearnClassifier(pipeline).train(trainData)

In [495]:
classifier = trainClassifier(full_trainData)

Training Classifier...


In [496]:
results = []

testTrue = [t[1] for t in testData]
testPred = predictLabels(testData, classifier)
precision, recall, fscore, _ = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
results.append((precision, recall, fscore)) # append results obtained for each training set 
results = (np.mean(np.array(results), axis = 0)) # average the cv results for precision, recall and fscore
   # print(cv_results)
    
print("Done training!")
print("Precision %f\nRecall: %f\nF Score:%f" % (results[0], results[:1], results[2]))   
# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results


Done training!
Precision 0.587784
Recall: 0.587784
F Score:0.584851


Results for SWR, POS tag, TF-IDF:

- Precision 0.576786
- Recall: 0.576786
- F Score:0.574224

Results for SWR, POS tag, Polarity, TF-IDF:

- Precision 0.587784
- Recall: 0.587784
- F Score:0.584851

The combination SWR, POS tag, Polarity, TF-IDF despite scoring slightly lower on the dev set, scored higher on the full testing set, with an f-score accuracy value of 0.585. Surprisingly however the very simplest of model run as baseline continues to outperform the complex model above, with an f-score accuracy value of 0.599. This makes us wonder if there is an error in our process, but if so it isn't immediately apparent to us. This being so, strictly speaking we would have to choose the simpler model over the more complex one. 