### INCLUDING THE LIBRARIES NEEDS FOR THE CODE

In [25]:
import os, sys, re, unicodedata, csv, unicodecsv, nltk                             
import numpy as np
import pycrfsuite

from copy                            import deepcopy
from collections                     import Counter
from matplotlib                      import pyplot as plt
from nltk                            import pos_tag, word_tokenize
from nltk.classify                   import SklearnClassifier
from nltk.corpus                     import stopwords
from nltk.tag                        import CRFTagger
from nltk.tokenize                   import RegexpTokenizer
from nltk.stem                       import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from random                          import shuffle
from sklearn                         import metrics
from sklearn.feature_selection       import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics                 import confusion_matrix, precision_recall_fscore_support, classification_report
from sklearn.model_selection         import cross_val_score
from sklearn.svm                     import LinearSVC
from sklearn.pipeline                import Pipeline

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### LOADING DATA AND SPLITTING DATA INTO TRAIN AND TEST
1. TOOK THE COMMA SEPARATED FILE AS INPUT
2. READ THE DOCUMENT LINE BY LINE
3. PARSED LINE BY LINE 
4. SPLIT THE DOCUMENT INTO TWO SETS(TRAINING AND TEST SET)
5. THE PERCENTAGE IS PROVIDED AS INPUT FOR SPLITTING (GENERALLY 80% CONSIDERED TRAINING DATA WHILE SPLITTING)
6. THE PREPROCESS AND THE FEATURE CONVERSION TAKES PLACE DURING SPLITTING (BY CALLING THEIR RESPECTIVE USER DEFINED FUNCTIONS)

In [26]:
def get_raw_data_from_file(fpath, reviewText=None):
    with open(fpath, 'rb') as f:
        reader = unicodecsv.reader(f, delimiter=',')
        next(reader)
        for line in reader:
            (Text, Label) = parseReview(line)
            rawData.append((Text, Label))
    return rawData

In [27]:
def splitData_1(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector_1(preProcess(Text)), Label))
    for (Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector_1(preProcess(Text)), Label))

In [28]:
def splitData_2(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector_2(preProcess(Text)), Label))
    for (Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector_2(preProcess(Text)), Label))

### PARSING DATA:

1. SPLIT INPUT INTO FOLLOWING PARTS:
    * TEXT
    * LABEL
2. THE LABEL FROM INPUT IS ASSIGNED VALUE AS PER THE REQUIRED OUTPUT CLASSES  <b>(<i>male, female</i>)</b>
3. THE FUNCTION RETURNS A TUPLE: <b>(<i>Text, Label</i>)</b>

In [29]:
def parseReview(reviewLine):
    Text  = reviewLine[0]
    Label = labelMap[reviewLine[2]]
    return (Text, Label)

### PREPROCESSING:

6. CREATED TOKENS OF INDIVIDUAL WORDS USING STANDARD FUNCTION <b>(<i>nltk.word_tokenize())</i>))</b>
6. REMOVED THE SPECIAL CHARACTERS USING <b>(<i> RegexpTokenizer(r'\w+')</i>))</b>
8. CONVERTED ALL TOKENS TO LOWERCASE (using regex)
8. LEMMATIZED TOKENS USING STANDARD FUNCTION <b>(<i>nltk.stem.WordNetLemmatizer.lemmatize()</i>))</b>
8. TRIED THE FOLLOWING STEMMING. BUT THE FEATURES REDUCED CONSIDERABLY. SO WITHDRAV MYSELF FROM USING IT. 
    *  <b>(<i>nltk.stem.PorterStemmer.stem()</i>)</b>
    *  <b>(<i>nltk.stem.LancasterStemmer.stem()</i>)</b>
    *  <b>(<i>nltk.stem.SnowballStemmer.stem()</i>)</b>
8. INCLUDED THE STOPWORDS FROM CORPUS TO CHECK AND REMOVE THEM FROM TOKENS DURING LEMMATATION PROCESS 
    *  <b>(<i>nltk.corpus.stopwords.words('english')</i>))</b>
8. CREATED TOKEN POS TAG PAIR USING <b>(<i>pos_tag()</i>))</b>
8. BIGRAM WAS USED TO FURTHER ENHANCE THE FEATURE COUNT nltk.util import ngrams
9. THE FUNCTION RETURNS TOKEN LIST THUS COMPLETING THE TOKENIZATION PROCESS, AND THE PREPROCESSING.


In [30]:
def preProcess(text):
    
    if text == '':
        text = "UNK"
    ###=======================================###
      ###### RegExp Remove Punctuation ###### 
    ###=======================================###
        
    def try_regexToken(text):
        #print("inside regex",text)
        #should return a list of tokens
        #word tokenisation, including punctuation removal'
        tokenizer = RegexpTokenizer(r'\w+')
        text = tokenizer.tokenize(text)      
        return text
    text = try_regexToken(text)
    regexToken = text
    forbi = text
        
    ###=======================================###
        ###### Retain Punctuation ###### 
    ###=======================================###    
    
    def try_tokenize(text):
        text = word_tokenize(text)     
        return text
    #text = try_tokenize(text)
    #print("try_regexToken", text)    

    ###=======================================###
            ###### POS TAG ###### 
    ###=======================================###
    
    def try_pos_tag(text):
        #print("inside postag", text)
        #token is parsed with pos tag 
        text = pos_tag(text)
        return text
    text = try_pos_tag(text)
    #print("try_pos_tag", text)
    pos = text
    ###=======================================###
            ###### LOWERCASE ###### 
    ###=======================================###
    
    #lowercasing
    text = [(t.lower(),p) for t,p in text]
    #print(text)
    
    ###=======================================###
            ###### STOPWORDS ###### 
    ###=======================================###
    
    def try_stopwords(text):
        #stopword removal- benefits are it removes rare words
        #print("inside stop", text)
        stop = set(stopwords.words('english'))
        text = [(t,p) for t,p in text if t not in stop]
        #print("stop", text)
        return text
    text = try_stopwords(text)
    #print("try_stopwords", text)
    stop = text
    ###=======================================###
            ###### LEMMATIZE ###### 
    ###=======================================###
    
    def try_lemmatize(text):
        #print("inside lemma", text)
        #lemmatisation
        lemmatiser = WordNetLemmatizer()
        t = [(lemmatiser.lemmatize(text),pos) for text,pos in text]
        #print("after", t)
        return text
    text = try_lemmatize(text)

    lemma = text
    
    ###=======================================###
            ###### STEMMING ###### 
    ###=======================================###    
    
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    snow = SnowballStemmer("english") 
    stemmed_tokens = []
    
    #stemming PorterStemmer, LancasterStemmer, SnowballStemmer
    def PstemTokens(text):
        for text,pos in text:
            stemmed_tokens.append((porter.stem(text),pos))
        return stemmed_tokens
    
    def LstemTokens(text):
        for text,pos in text:
            stemmed_tokens.append((lancaster.stem(text),pos))
        return stemmed_tokens
    
    def SstemTokens(text):
        for text,pos in text:
            stemmed_tokens.append((snow.stem(text),pos))
        return stemmed_tokens
            
    stemmed_tokens = PstemTokens(text)
    #stemmed_tokens = LstemTokens(text) 
    #stemmed_tokens = SstemTokens(text) 
    text = stemmed_tokens
    
    bigram = [' '.join(l) for l in nltk.bigrams(forbi)] + forbi
    tokens = text, bigram


    #return regexToken
    #return pos
    #return stop
    #return lemma
    #return stemmed_tokens
    return tokens

t = preProcess("Hi, this is gk...better and good getting to see! The striped bats are hanging on their feet for best")
print(t)

([('hi', 'NNP'), ('gk', 'JJ'), ('better', 'RBR'), ('good', 'JJ'), ('get', 'VBG'), ('see', 'VB'), ('stripe', 'JJ'), ('bat', 'NNS'), ('hang', 'VBG'), ('feet', 'NNS'), ('best', 'JJS')], ['Hi this', 'this is', 'is gk', 'gk better', 'better and', 'and good', 'good getting', 'getting to', 'to see', 'see The', 'The striped', 'striped bats', 'bats are', 'are hanging', 'hanging on', 'on their', 'their feet', 'feet for', 'for best', 'Hi', 'this', 'is', 'gk', 'better', 'and', 'good', 'getting', 'to', 'see', 'The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best'])


### CONVERTING THE TOKENS TO FEATURES

1.  THE GLOBAL DICTIONARY IS FILLED WITH THE FEATURES THAT ARE ENCOUNTED DURING THE WHOLE FEATURE VECTOR CREATION PROCESS 
2.  WEIGHTED FEATURE VALUES HAVE BEEN USED FOR EACH POS TAG
3.  THE SECOND FUNCTION USING BIGRAM TOKEN HAVE BEEN CREATED 
4.  BOTH EXECUTED INDEPENDENTLY TO CHECK THE SCORES

In [31]:
featureDict = {} # the global feature dictionary

def toFeatureVector_1(tokens):
    featureVec = {}
    prev_tag = {}
    text_token = tokens[0]
    bigram_token = tokens[1]
    for k, pos in text_token:
        try:
            sum = 0
            featureVec[k] += 1.0/len(tokens)
            #using pos_tags
            if k in prev_tag:
                if pos != prev_tag[k]:
                    featureVec[k] -= 1.0/len(tokens)
                else:
                    featureVec[k] += 1.0/len(tokens)
            prev_tag[k] = pos    
        except KeyError:
            featureVec[k] = 1.0/len(tokens)  
    #print(featureVec)
    return featureVec

In [32]:
featureDict = {} # the global feature dictionary

def toFeatureVector_2(tokens):
    featureVec = {}
    text_token = tokens[0]
    bigram_token = tokens[1]
    for w in bigram_token:
        try:
            featureVec[w] += 1.0/len(bigram_token)
        except KeyError:
            featureVec[w] = 1.0/len(bigram_token)
        try:
            featureDict[w] += 1.0/len(bigram_token)
        except KeyError:
            featureDict[w] = 1.0/len(bigram_token)
    #print(featureVec)
    return featureVec

### TRAINING AND VALIDATING OUR CLASSIFIER

In [33]:
def trainClassifier_1(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(loss='squared_hinge', penalty='l2', random_state=0, tol=1e-04))])
    return SklearnClassifier(pipeline).train(trainData)

In [34]:
def trainClassifier_2(trainData):
    print("Training Classifier...")

    pipeline =  Pipeline([('tfidf', TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)),
                          ('chi2', SelectKBest(chi2, k=1000)),
                          ('svc', LinearSVC(loss='squared_hinge', penalty='l2', random_state=0, tol=1e-04))])    
                          
    return SklearnClassifier(pipeline).train(trainData)

In [35]:
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

### CROSS VALIDATING

1. THE 10 FOLD CROSS VALIDATION IS PERFORMED WHERE THE TRAINING DATA (80% OF THE WHOLE DATA) IS SPLIT IS 10 FOLDS AND EACH FOLD ACT AS TEST DATA AND THE REMAINING 10 FOLD ARE TRAINING DATA. 
2. THIS IS ACCOMPLISHED BY PUTTING THE FOLDS IN LOOP AND ROTATING THE FOLDS TO ACT AS TRAINING AND TEST DATA 
3. THUS IT FOLLOWS THE PATTERN: 
    * TRAIN DATA: [:i]  
    * TEST DATA [i:foldSize+i] 
    * TRAIN DATA [foldSize+i:] 
    COMBINED PROPERLY IN THE CODE WITH CONCATENATION 
4. THE MAIN CODE CALLS THE CROSS VALIDATE WITH 80% OF THE MAIN DATASET AND THE NUMBER OF FOLDS AS PARAMETERS
4. THE CLASSIFIER IS TRAINED USING THE <b>(<i> LinearSVC(), TfidfTransformer() AND SelectKBest() </i>)</b>
5. THE TEST DATA LABELS ARE GATHERED BEFORE PREDICTING ON THE TEST DATA
6. THE GATHERED LABELS AND PREDICTED LABELS ARE PASSED ON TO STANDARD FUNCTIONS TO IDENTIFY THE FOLLOWING:
    * PRECISON <b>(<i>sklearn.metrics.precision_recall_fscore_support()</i>)</b>
    * RECALL <b>(<i>sklearn.metrics.precision_recall_fscore_support()</i>)</b>
    * F-SCORE <b>(<i>sklearn.metrics.precision_recall_fscore_support()</i>)</b>
    * THE ABOVE VALUES ARE APPENDED TO A LIST
7. THE AVERAGE OF EACH SCORE IS TAKEN USING MEAN 
8. FINAL RESULT IS RETURNED TO THE CALLING CODE

In [36]:
def crossValidate_1(dataset, folds):
    shuffle(dataset)
    results = []
    foldSize = int(len(dataset)/folds)
    
    for i in range(0,len(dataset),int(foldSize)):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("Fold start on items %d - %d" % (i, i+foldSize))
        myTestData = dataset[i:i+foldSize]
        myTrainData = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier_1(myTrainData)
        y_true = [x[1] for x in myTestData]
        y_pred = predictLabels(myTestData, classifier)
        print(len(myTestData))
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
    avgResults = [np.mean([x[0] for x in results]),
                   np.mean([x[1] for x in results]),
                   np.mean([x[2] for x in results])
                ]
    return avgResults

In [37]:
def crossValidate_2(dataset, folds):
    shuffle(dataset)
    results = []
    foldSize = int(len(dataset)/folds)
    
    for i in range(0,len(dataset),int(foldSize)):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("Fold start on items %d - %d" % (i, i+foldSize))
        myTestData = dataset[i:i+foldSize]
        myTrainData = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier_2(myTrainData)
        y_true = [x[1] for x in myTestData]
        y_pred = predictLabels(myTestData, classifier)
        print(len(myTestData))
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
    avgResults = [np.mean([x[0] for x in results]),
                   np.mean([x[1] for x in results]),
                   np.mean([x[2] for x in results])
                ]
    return avgResults

In [38]:
rawData = [] # the filtered data from the dataset file (should be 21000 samples)
trainData = [] # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = [] # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
Label1 = 'male'
Label2 = 'female'
labelMap = {'male' : Label1, 'female' : Label2}

# references to the data files

# We parse the dataset and put it in a raw data list
print("Now %d Raw Data, %d Train Data, %d Test Data" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')

rawData = get_raw_data_from_file("training.csv") 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d Train Data, %d Test Data" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData_1(0.8)
print("Now %d rawData, %d Train Data, %d Test Data" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
print("K Fold cross-validation: ")
print('\n')
CV_Results_1_1 = crossValidate_1(trainData, 10)
CV_Results_1_2 = crossValidate_2(trainData, 10)

Now 0 Raw Data, 0 Train Data, 0 Test Data
Preparing the dataset...
Now 10112 rawData, 0 Train Data, 0 Test Data
Preparing training and test data...
Now 10112 rawData, 8088 Train Data, 2024 Test Data
Preparing training and test data...
K Fold cross-validation: 


Fold start on items 0 - 808
Training Classifier...
808
Fold start on items 808 - 1616
Training Classifier...
808
Fold start on items 1616 - 2424
Training Classifier...
808
Fold start on items 2424 - 3232
Training Classifier...
808
Fold start on items 3232 - 4040
Training Classifier...
808
Fold start on items 4040 - 4848
Training Classifier...
808
Fold start on items 4848 - 5656
Training Classifier...
808
Fold start on items 5656 - 6464
Training Classifier...
808
Fold start on items 6464 - 7272
Training Classifier...
808
Fold start on items 7272 - 8080
Training Classifier...
808
Fold start on items 8080 - 8888
Training Classifier...
8
Fold start on items 0 - 808
Training Classifier...
808
Fold start on items 808 - 1616
Training 

### Evaluate on test set

In [39]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0:10])   # have a look at the first test data instance
    classifier = trainClassifier_1(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    POSTagging_1 = classification_report(testTrue, testPred)
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done Testing!")

[({'alright': 0.5, 'tri': 0.5, 'get': 0.5, 'time': 0.5, 'work': 0.5}, 'male'), ({}, 'female'), ({'need': 0.5, 'wait': 0.5}, 'female'), ({'key': 0.5, 'heart': 0.5}, 'female'), ({'listen': 0.5, 'miss': 0.5, 'marpl': 0.5, 'need': 0.5, 'know': 0.5, 'gonna': 0.5, 'meet': 0.5, 'r': 1.0, 'later': 0.5, 'write': 0.5}, 'female'), ({'suppos': 0.5, 'peel': 0.5, 'spud': 0.5, 'sure': 0.5, 'find': 0.5, 'someon': 0.5, 'cover': 0.5}, 'male'), ({'believ': 0.5, 'back': 0.5}, 'female'), ({'well': 0.5, 'invit': 0.5, 'mind': 0.5}, 'male'), ({'poke': 0.5, 'around': 0.5, 'wind': 0.5}, 'female'), ({'deserv': 0.5, 'anyway': 0.5, 'brought': 0.5}, 'female')]
Training Classifier...
Done Testing!


In [40]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0:10])   # have a look at the first test data instance
    classifier = trainClassifier_2(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    POSTagging_2 = classification_report(testTrue, testPred)
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
    print("Done Testing!")

[({'alright': 0.5, 'tri': 0.5, 'get': 0.5, 'time': 0.5, 'work': 0.5}, 'male'), ({}, 'female'), ({'need': 0.5, 'wait': 0.5}, 'female'), ({'key': 0.5, 'heart': 0.5}, 'female'), ({'listen': 0.5, 'miss': 0.5, 'marpl': 0.5, 'need': 0.5, 'know': 0.5, 'gonna': 0.5, 'meet': 0.5, 'r': 1.0, 'later': 0.5, 'write': 0.5}, 'female'), ({'suppos': 0.5, 'peel': 0.5, 'spud': 0.5, 'sure': 0.5, 'find': 0.5, 'someon': 0.5, 'cover': 0.5}, 'male'), ({'believ': 0.5, 'back': 0.5}, 'female'), ({'well': 0.5, 'invit': 0.5, 'mind': 0.5}, 'male'), ({'poke': 0.5, 'around': 0.5, 'wind': 0.5}, 'female'), ({'deserv': 0.5, 'anyway': 0.5, 'brought': 0.5}, 'female')]
Training Classifier...
Done Testing!


In [41]:
print("Now %d rawData, %d Train Data, %d Test Data" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData_2(0.8)
print("Now %d rawData, %d Train Data, %d Test Data" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
print("K Fold cross-validation: ")
print('\n')
CV_Results_2_1 = crossValidate_1(trainData, 10)
CV_Results_2_2 = crossValidate_2(trainData, 10)

Now 10112 rawData, 8088 Train Data, 2024 Test Data
Preparing training and test data...
Now 10112 rawData, 16176 Train Data, 4048 Test Data
Preparing training and test data...
K Fold cross-validation: 


Fold start on items 0 - 1617
Training Classifier...
1617
Fold start on items 1617 - 3234
Training Classifier...
1617
Fold start on items 3234 - 4851
Training Classifier...
1617
Fold start on items 4851 - 6468
Training Classifier...
1617
Fold start on items 6468 - 8085
Training Classifier...
1617
Fold start on items 8085 - 9702
Training Classifier...
1617
Fold start on items 9702 - 11319
Training Classifier...
1617
Fold start on items 11319 - 12936
Training Classifier...
1617
Fold start on items 12936 - 14553
Training Classifier...
1617
Fold start on items 14553 - 16170
Training Classifier...
1617
Fold start on items 16170 - 17787
Training Classifier...
6
Fold start on items 0 - 1617
Training Classifier...
1617
Fold start on items 1617 - 3234
Training Classifier...
1617
Fold start on ite

In [42]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0:10])   # have a look at the first test data instance
    classifier = trainClassifier_1(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    bigram_1 = classification_report(testTrue, testPred)
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate

[({'alright': 0.5, 'tri': 0.5, 'get': 0.5, 'time': 0.5, 'work': 0.5}, 'male'), ({}, 'female'), ({'need': 0.5, 'wait': 0.5}, 'female'), ({'key': 0.5, 'heart': 0.5}, 'female'), ({'listen': 0.5, 'miss': 0.5, 'marpl': 0.5, 'need': 0.5, 'know': 0.5, 'gonna': 0.5, 'meet': 0.5, 'r': 1.0, 'later': 0.5, 'write': 0.5}, 'female'), ({'suppos': 0.5, 'peel': 0.5, 'spud': 0.5, 'sure': 0.5, 'find': 0.5, 'someon': 0.5, 'cover': 0.5}, 'male'), ({'believ': 0.5, 'back': 0.5}, 'female'), ({'well': 0.5, 'invit': 0.5, 'mind': 0.5}, 'male'), ({'poke': 0.5, 'around': 0.5, 'wind': 0.5}, 'female'), ({'deserv': 0.5, 'anyway': 0.5, 'brought': 0.5}, 'female')]
Training Classifier...


In [43]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(testData[0:10])   # have a look at the first test data instance
    classifier = trainClassifier_2(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    bigram_2 = classification_report(testTrue, testPred)
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate

[({'alright': 0.5, 'tri': 0.5, 'get': 0.5, 'time': 0.5, 'work': 0.5}, 'male'), ({}, 'female'), ({'need': 0.5, 'wait': 0.5}, 'female'), ({'key': 0.5, 'heart': 0.5}, 'female'), ({'listen': 0.5, 'miss': 0.5, 'marpl': 0.5, 'need': 0.5, 'know': 0.5, 'gonna': 0.5, 'meet': 0.5, 'r': 1.0, 'later': 0.5, 'write': 0.5}, 'female'), ({'suppos': 0.5, 'peel': 0.5, 'spud': 0.5, 'sure': 0.5, 'find': 0.5, 'someon': 0.5, 'cover': 0.5}, 'male'), ({'believ': 0.5, 'back': 0.5}, 'female'), ({'well': 0.5, 'invit': 0.5, 'mind': 0.5}, 'male'), ({'poke': 0.5, 'around': 0.5, 'wind': 0.5}, 'female'), ({'deserv': 0.5, 'anyway': 0.5, 'brought': 0.5}, 'female')]
Training Classifier...


In [44]:
def get_raw_data_from_file(fpath, reviewText=None):
    with open(fpath, 'rb') as f:
        reader = unicodecsv.reader(f, delimiter=',')
        next(reader)
        for line in reader:
            (Text, Label) = parseReview(line)
            rawData.append((Text, Label))
    return rawData
rawData = get_raw_data_from_file("test.csv")
print(rawData[0:3])

def formatData():
    for (Text, Label) in rawData:
        #testData.append((toFeatureVector_1(preProcess(Text)), Label))
        testData.append((toFeatureVector_2(preProcess(Text)), Label))
formatData()      
print(testData[0:3])

functions_complete = True  
if functions_complete:   
    classifier = trainClassifier_2(trainData)  
    testTrue = [t[1] for t in testData]  
    testPred = predictLabels(testData, classifier) 
    final = classification_report(testTrue, testPred)
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') 
    print("Done Final Testing!")


[("It's no problem, honestly. Go on, go and open the launderette.  Leave it with me.", 'female'), ("Last night was better than ever. What's all this?  Anything interesting?", 'male'), ('Have you checked the answerphone?  Any calls?', 'male')]
[({'alright': 0.5, 'tri': 0.5, 'get': 0.5, 'time': 0.5, 'work': 0.5}, 'male'), ({}, 'female'), ({'need': 0.5, 'wait': 0.5}, 'female')]
Training Classifier...
Done Final Testing!


### SUMMARY

In [45]:
print('\n')
print("------------------RESULTS OF CROSS VALIDATION - TRAINING DATA SET(80/20 SPLIT)----------------")
print('\n')
print("USING POS TAGGING ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LINEAR SVC: ")
print(
      "Precision:", CV_Results_1_1[0],
      ", Recall:", CV_Results_1_1[1],
      ", F Score", CV_Results_1_1[2])
print('\n')
print("USING POS TAGGING ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC, TfidfTransformer AND SelectKBest: ")
print(
      "Precision:", CV_Results_1_2[0],
      ", Recall:", CV_Results_1_2[1],
      ", F Score", CV_Results_1_2[2])
print('\n')
print("USING BIGRAMS ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC: ")
print(
      "Precision:", CV_Results_2_1[0],
      ", Recall:", CV_Results_2_1[1],
      ", F Score", CV_Results_2_1[2])
print('\n')
print("USING BIGRAMS ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC, TfidfTransformer AND SelectKBest: ")
print(
      "Precision:", CV_Results_2_2[0],
      ", Recall:", CV_Results_2_2[1],
      ", F Score", CV_Results_2_2[2])



------------------RESULTS OF CROSS VALIDATION - TRAINING DATA SET(80/20 SPLIT)----------------


USING POS TAGGING ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LINEAR SVC: 
Precision: 0.5645839500020696 , Recall: 0.5630063006300629 , F Score 0.5617520084317323


USING POS TAGGING ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC, TfidfTransformer AND SelectKBest: 
Precision: 0.5796718974568399 , Recall: 0.5586183618361836 , F Score 0.5463174533342594


USING BIGRAMS ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC: 
Precision: 0.596898834256199 , Recall: 0.581435880137179 , F Score 0.581367078556625


USING BIGRAMS ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC, TfidfTransformer AND SelectKBest: 
Precision: 0.5780102463186285 , Recall: 0.5685331984033283 , F Score 0.5587508909586849


In [46]:
print('\n')
print("------------------THE CLASSIFICATION REPORT - TEST DATA SET(80/20 SPLIT)----------------")
print('\n')
print("USING POS TAGGING ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LINEAR SVC: ")
print(POSTagging_1)
print('\n')
print("------------------THE CLASSIFICATION REPORT - TEST DATA SET(80/20 SPLIT)----------------")
print('\n')
print("USING POS TAGGING ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC, TfidfTransformer AND SelectKBest: ")
print(POSTagging_2)
print('\n')
print("------------------THE CLASSIFICATION REPORT - TEST DATA SET(80/20 SPLIT)----------------")
print('\n')
print("USING BIGRAMS ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC: ")
print(bigram_1)
print('\n')
print("------------------THE CLASSIFICATION REPORT - TEST DATA SET(80/20 SPLIT)----------------")
print('\n')
print("USING BIGRAMS ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC, TfidfTransformer AND SelectKBest: ")
print(bigram_2)
print('\n')



------------------THE CLASSIFICATION REPORT - TEST DATA SET(80/20 SPLIT)----------------


USING POS TAGGING ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LINEAR SVC: 
              precision    recall  f1-score   support

      female       0.57      0.61      0.59      1017
        male       0.58      0.53      0.55      1007

    accuracy                           0.57      2024
   macro avg       0.57      0.57      0.57      2024
weighted avg       0.57      0.57      0.57      2024



------------------THE CLASSIFICATION REPORT - TEST DATA SET(80/20 SPLIT)----------------


USING POS TAGGING ALONG WITH REGEX TOKENIZING, LEMMATIZING, STEMMING AND STOPWORDS & LinearSVC, TfidfTransformer AND SelectKBest: 
              precision    recall  f1-score   support

      female       0.55      0.69      0.61      1017
        male       0.58      0.43      0.49      1007

    accuracy                           0.56      2024
   macro avg       0.57      0.56      0

In [48]:
print("------------------THE CLASSIFICATION REPORT - Test.csv ----------------")
print('\n')
print(final)

------------------THE CLASSIFICATION REPORT - Test.csv ----------------


              precision    recall  f1-score   support

      female       0.59      0.74      0.66      7591
        male       0.66      0.50      0.57      7692

    accuracy                           0.62     15283
   macro avg       0.63      0.62      0.62     15283
weighted avg       0.63      0.62      0.62     15283



| macro avg - F SCORE     |       |                  
|:-|:-|
|   POS TAGGING WITH LINEAR SVC|  0.57|
|   POS TAGGING WITH LinearSVC, TfidfTransformer AND SelectKBest| 0.55  |  
|   BIGRAM WITH LINEAR SVC| 0.56  |
|   BIGRAM WITH LinearSVC, TfidfTransformer AND SelectKBest| 0.55 | 
|   TEST.CSV BIGRAM WITH LinearSVC, TfidfTransformer AND SelectKBest   | 0.62 |  