In [1]:
#Just like in P2(a), perform POS Tagging on the Brown corpus. (Like before, train your Logistic Regression model on the
#tagged corpus, and test on the untagged one). 
#Use one vs all logistic regression to perform this exercise. 
#Essentially, given a word, try to classify it with classifiers trained for all pos tags and get most probable one.
#Do NOT use any ML libraries like scipy for coding up the logistic regression. NLTK maybe allowed, but only for 
#getting corpus.



In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
import numpy as np

In [3]:
# --------------------------------------Vocabulary of words------------------------------------------------
def numeric(stri):
    for i in range(len(stri)):
        if stri[i].isdigit():
            return False
    return True

def PreprocessData(data):
    temp = []
    for i in data:
        if i[0] != i[1] and not i[0].isdigit() and numeric(i[0]):
            if '-' in i[1]:
                ll = i[1].split('-')
                brownTag = ll[0]
            elif '+' in i[1]:
                ll = i[1].split('+')
                brownTag = ll[0]
            else:
                brownTag = i[1]
            
            brownTag = brownTag.replace('*','')
            brownTag = brownTag.replace('$','')
#             print((i[0],brownTag))
            temp.append((i[0],brownTag))
        elif i[0] == '.':
            temp.append(i)
        else:
            pass
    return temp

# ----------------------------------- Sentences ----------------------------------------------------------

def form_sentences(pdata):
    global sentences
    sent = []
    #sent.append(tuple(start_sentence1))
    #sent.append(tuple(start_sentence2))
    for i in pdata:
        vocab.add(i[0])
        if i[0] != i[1]:
            sent.append(i)
        else:
            #sent.append(tuple(end_sentence))
            sentences.append(sent)
            sent = []
            #sent.append(tuple(start_sentence1))
            #sent.append(tuple(start_sentence2))



In [4]:
def formTagDictionary(pdata):
    global tags_dict
    for p in pdata:
        tags_dict[start_sentence1[1]] = tags_dict.get(start_sentence1[1],0) + 1
        tags_dict[start_sentence2[1]] = tags_dict.get(start_sentence2[1],0) + 1
        tags_dict[end_sentence[1]] = tags_dict.get(end_sentence[1],0) + 1
        for i in p:
            tags_dict[i[1]] = tags_dict.get(i[1],0) + 1
        
def formTagWordPairDict(pdata):
    global word_tag_pair
    for p in pdata:
        for i in p:
            word_tag_pair[(i[1],i[0])] = word_tag_pair.get((i[1],i[0]),0) + 1
        
def formTagBigrams():
    global training_data
    for s in training_data:
        l = len(s)-2
        for i in range(l):
            tag_bigram[(s[i][1],s[i+1][1])] = tag_bigram.get((s[i][1],s[i+1][1]),0) + 1
        
def formTagTrigrams():
    global training_data
    for s in training_data:
        l = len(s)-3
        for i in range(l):
            tag_trigram[(s[i][1],s[i+1][1],s[i+2][1])]=tag_trigram.get((s[i][1],s[i+1][1],s[i+2][1]),0)+1

In [5]:
######################### FEATURE EXTRACTION #########################
def get_feature(token, token_index, sent):
    global token_feature
    token_feature = {
                    'token'             : token,
                    'is_first'          : token_index == 0,
                    'is_last'           : token_index == len(sent)-1,

                    'is_capitalized'    : token[0].upper() == token[0],
                    'is_all_capitalized': token.upper() == token,
                    'is_capitals_inside': token[1:].lower() != token[1:],
                    'is_numeric'        : token.isdigit(),

                    'prefix-1'          : token[0],
                    'prefix-2'          : '' if len(token) < 2  else token[:1],

                    'suffix-1'          : token[-1],
                    'suffix-2'          : '' if len(token) < 2  else token[-2:],

                    'prev-token'        : '' if token_index == 0     else sent[token_index - 1][0],
                    '2-prev-token'      : '' if token_index <= 1     else sent[token_index - 2][0],

                    'next-token'        : '' if token_index == len(sent) - 1     else sent[token_index + 1][0],
                    '2-next-token'      : '' if token_index >= len(sent) - 2     else sent[token_index + 2][0]
                    }
    return  token_feature

def feature_extraction(pdata):
    features = []
    pos_labels = []
    
    for sent in pdata:
        #print(sent)
        for token_index, token_pair in enumerate(sent):
            features.append(get_feature(token_pair[0],token_index,sent))
            pos_labels.append(token_pair[1])
#     for feat in features:
#         print(feat,"\n")
    return features,pos_labels
    

In [6]:
def costFunction(features, target, weights):
    cost = np.array([],dtype=np.float128)
    scores = np.array([],dtype=np.float128)
    scores = np.dot(features, weights)
    cost = np.sum( target*scores - np.log(1 + np.exp(scores)) )   
    return cost

def sigmoid(scores):
    return 1/(1 + np.exp(-scores))

def logistic_Regression(wordFeature,targetPOS,num_epochs,learning_rate):
    targetPOS = np.array(targetPOS)
    num_classes = len(TAG_SET)
    num_feature = wordFeature.shape[1]
    classifier = np.zeros(shape=(num_classes+1,num_feature),dtype=np.float128)
    
    for c in range(0,num_classes+1):
        print("Training for label: ",c)
        #targetLabel =  (targetPOS==c).astype(int)
        if(targetPOS==c):
            targetLabel=1
        else:
            targetLabel=0
        weights = np.zeros(wordFeature.shape[1],dtype=np.float128)
        for epoch in range(num_epochs):
            scores = np.dot(wordFeature,weights)
            predictions = sigmoid(scores)
            output_error = targetLabel - predictions
            gradient = np.dot(wordFeature.T,output_error)
            weights += (learning_rate * gradient)
            if epoch % 10 == 0:
                print("log_likelihood: ",costFunction(wordFeature, targetLabel, weights))
        classifier[c,:] = weights

    return classifier

def evaluate(weights,testdata,actualPOS):
    actualPOS = np.array(actualPOS)
    scores = np.dot(testdata, weights.T)
    scores = np.round(sigmoid(scores))
    predictions = scores.argmax(axis=1)	
    print ('Accuracy: {0}'.format((predictions == actualPOS).sum().astype(float) / len(predictions)))

In [7]:
# -------------------------------------- MAIN FUNCTION ------------------------------------------------------

import string

sentences = []
vocab = set()

start_sentence1 = ("<sos1>","<ssos1>")
start_sentence2 = ("<sos2>","<ssos2>")
end_sentence = ("<eos>","<eeos>")

raw_data = brown.tagged_words()
processed_data = PreprocessData(raw_data[:400])
form_sentences(processed_data)

training_data = []
testing_data = []

for i in range(len(sentences)):
    if i < 0.8*len(sentences):
        training_data.append(sentences[i])
    else:
        testing_data.append(sentences[i])

tag_trigram = {}
tag_bigram = {}
word_tag_pair = {}
word_word_tag_tri = {}
tags_dict = {}
transition_probs = {}
emission_probs = {}

formTagDictionary(training_data)
# formTagWordPairDict(training_data)
# formTagTrigrams()
# formTagBigrams()
# transition_probability()
# emission_probability()
TAG_SET = list(tags_dict.keys())

# print(tag_trigram)
# print(tag_bigram)
# print(transition_probs)
# print("\n\n")
# print(emission_probs)


# print(tags_dict)
print(len(tags_dict))
# print(word_tag_pair)
print(len(training_data))


42
11


In [10]:
features,pos_labels = feature_extraction(training_data)
v = DictVectorizer(sparse=False)
train_x = v.fit_transform(features)
weights = logistic_Regression(train_x,pos_labels,2,0.1)
print(type(train_x))

Training for label:  0
log_likelihood:  -4.7866974607762764345
Training for label:  1
log_likelihood:  -4.7866974607762764345
Training for label:  2
log_likelihood:  -4.7866974607762764345
Training for label:  3
log_likelihood:  -4.7866974607762764345
Training for label:  4
log_likelihood:  -4.7866974607762764345
Training for label:  5
log_likelihood:  -4.7866974607762764345
Training for label:  6
log_likelihood:  -4.7866974607762764345
Training for label:  7
log_likelihood:  -4.7866974607762764345
Training for label:  8
log_likelihood:  -4.7866974607762764345
Training for label:  9
log_likelihood:  -4.7866974607762764345
Training for label:  10
log_likelihood:  -4.7866974607762764345
Training for label:  11
log_likelihood:  -4.7866974607762764345
Training for label:  12
log_likelihood:  -4.7866974607762764345
Training for label:  13
log_likelihood:  -4.7866974607762764345
Training for label:  14
log_likelihood:  -4.7866974607762764345
Training for label:  15
log_likelihood:  -4.786697



log_likelihood:  -4.7866974607762764345
Training for label:  27
log_likelihood:  -4.7866974607762764345
Training for label:  28
log_likelihood:  -4.7866974607762764345
Training for label:  29
log_likelihood:  -4.7866974607762764345
Training for label:  30
log_likelihood:  -4.7866974607762764345
Training for label:  31
log_likelihood:  -4.7866974607762764345
Training for label:  32
log_likelihood:  -4.7866974607762764345
Training for label:  33
log_likelihood:  -4.7866974607762764345
Training for label:  34
log_likelihood:  -4.7866974607762764345
Training for label:  35
log_likelihood:  -4.7866974607762764345
Training for label:  36
log_likelihood:  -4.7866974607762764345
Training for label:  37
log_likelihood:  -4.7866974607762764345
Training for label:  38
log_likelihood:  -4.7866974607762764345
Training for label:  39
log_likelihood:  -4.7866974607762764345
Training for label:  40
log_likelihood:  -4.7866974607762764345
Training for label:  41
log_likelihood:  -4.7866974607762764345


In [11]:
features1,pos_labels1 = feature_extraction(testing_data)
v = DictVectorizer(sparse=False)
test_x = v.transform(features1)

evaluate(weights,test_x,pos_labels1)

AttributeError: 'DictVectorizer' object has no attribute 'vocabulary_'

In [10]:
from nltk.tokenize import word_tokenize
#one way is using class (optional), advantage being you can create multiple instances of class and train for each pos tag

class logisitic_regression:
    def __init__():
        pass
    def train(data,pos_tag):
        pass

## another crude way make train function for each class
def train_for_class_A(data):
    #train classifier for each pos tag in one vs all; in this particular case one will be class A and 
    #other class is rest
    pass

def train_for_class_B(data):
    pass


def read_corpus(corp):
    #Read the Brown Corpus
    #Take in one sentence at a time
    tokenize_text(sentence)
    pass

#Consider clas as positive class, and the rest as negative, and perform LR for the given token tok
def Logistic(tok, clas):
    pass

def Multi_Logistic():
    read_corpus(corpus)
    #code up one-many Logistic Regression (LR).
    #Feed in the list of tokens to return the list of tags.
    #Essentially take one of the classes as positive, and remaining as negative, and perform the standard LR.
    #Repeat this for all the classes.
    for token in tokens:
        for class1 in classes:
            Logistic(token, class1)
    pass

Task 2 : Predict tag sequence and get accuracy

In [None]:
def pred_tag_sequence(sentence):
    # Given a sentence, Get sequence of tags for it by getting most prefered tag for each word given  
    # Feel free to add helper functions more features ( like say trigram w1w2w3 as features for w2)
    pass