In [None]:
#final code for capturing discriminative features to be written here

In [None]:
#all imports for the notebook go in this block
import gensim
from gensim.models import Word2Vec
from pprint import pprint
import math
import numpy as np

In [None]:
#declaration of global variables

In [None]:
#readDataFiles : reads the file and returns a list of word lists, plus the labels when the data is labeled
#path_var: path of the file to be read, mode: type of data (1 - test, 0 - train)
def readDataFiles(path_var, mode):
    """Read a comma-separated data file into word rows (and labels).

    Every non-blank line is split on ','; each field is stripped of
    surrounding whitespace and the first three fields form one data row.
    Unless ``mode`` is 1, the first character of the last field is also
    collected as the label of that row.

    Args:
        path_var: path of the file to read.
        mode: 1 for unlabeled (test) data; any other value for labeled data.

    Returns:
        ``data`` alone when ``mode`` is 1, otherwise the tuple
        ``(data, label)``. ``data`` is a list of lists of strings and
        ``label`` is a list of single-character strings.

    Raises:
        ValueError: if a labeled line ends with an empty field.
    """
    labeled = (1 != mode)
    data = []
    label = []
    with open(path_var, 'r') as file_obj:
        for line_no, line in enumerate(file_obj, start=1):
            # Strip every field so the last word of a three-column line does
            # not keep its line terminator (it would otherwise become a
            # different vocabulary entry than the same word elsewhere).
            fields = [field.strip() for field in line.split(',')]
            if fields == ['']:
                # Blank line: nothing to record.
                continue
            data.append(fields[:3])
            if labeled:
                if not fields[-1]:
                    raise ValueError(
                        "missing label on line %d of %s" % (line_no, path_var)
                    )
                # Labels are single digits, so the first character suffices.
                label.append(fields[-1][0])
    if labeled:
        return data, label
    return data

In [None]:
#generateModel : generates the word2vec model for the data gathered
def generateModel(train_set, val_set, test_set):
    """Train a Word2Vec model on the concatenation of the three data sets.

    Args:
        train_set, val_set, test_set: lists of word lists; they are joined
            with ``+`` in that order to form the training corpus.

    Returns:
        The ``gensim.models.Word2Vec`` instance built from the corpus.
    """
    corpus = train_set + val_set + test_set
    # NOTE(review): ``size`` is the gensim 3.x keyword; gensim 4 renamed it to
    # ``vector_size`` - confirm against the installed version.
    settings = {"min_count": 1, "size": 30, "window": 5}
    return gensim.models.Word2Vec(corpus, **settings)

In [None]:
def getCosineVector(data, word_model, fv_size):
    """Build the similarity feature matrix for a list of word triples.

    For every row of ``data`` the first two features are the similarity of
    word 0 with word 2 and of word 1 with word 2, as reported by the model's
    ``similarity`` method. Any further columns (``fv_size`` > 2) stay zero.

    Args:
        data: sequence of rows, each holding at least three words.
        word_model: trained embedding model; either an object exposing
            ``wv.similarity`` or one exposing ``similarity`` directly.
        fv_size: number of feature columns; two columns are written, so it
            must be at least 2.

    Returns:
        ``np.matrix`` of shape ``(len(data), fv_size)``.
    """
    # The model-level ``similarity`` shortcut was deprecated in favour of the
    # keyed vectors stored under ``wv``; prefer those and fall back to the
    # object itself when it has no ``wv`` attribute.
    vectors = getattr(word_model, 'wv', word_model)
    X = np.matrix(np.zeros((len(data), fv_size)))
    for i, row in enumerate(data):
        X[i, 0] = vectors.similarity(row[0], row[2])
        X[i, 1] = vectors.similarity(row[1], row[2])
    return X

In [None]:
#class 0 is represented by index 0 and 1 by index 1 in rows of Y matrix 
def getClassVector(data, no_class):
    """One-hot encode class labels.

    Args:
        data: sequence of labels; each is converted with ``int`` and used as
            the column index that is set to 1.
        no_class: number of columns of the result.

    Returns:
        ``np.matrix`` of shape ``(len(data), no_class)`` with a single 1 in
        every row.
    """
    one_hot = np.matrix(np.zeros((len(data), no_class)))
    for row, tag in enumerate(data):
        one_hot[row, int(tag)] = 1
    return one_hot

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)); element-wise for numpy inputs."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
def logisticRegression(train_X, train_Y, epochs, learning_rate):
    """Fit sigmoid output units with one weight update per training row.

    A column of ones is appended to ``train_X`` as the bias input. Weights
    start as ``np.random.rand`` values divided by 1000 and, for every row in
    every epoch, are moved by ``learning_rate * x.T * (sigmoid(x w) - y)``.
    The epoch index is printed (space separated) as training progresses.

    Args:
        train_X: 2-D feature matrix, one row per sample. The row handling
            (``row.T``, matrix product with the error) is written for
            ``np.matrix`` inputs such as those built by ``getCosineVector``.
        train_Y: 2-D target matrix with one column per class.
        epochs: number of passes over the training rows.
        learning_rate: step size of each update.

    Returns:
        Weight matrix of shape ``(train_X.shape[1] + 1, train_Y.shape[1])``.
    """
    n_samples, _ = train_X.shape
    # Bias term: every sample gets a constant 1 as its last input.
    augmented = np.hstack((train_X, np.ones((n_samples, 1))))
    n_inputs = augmented.shape[1]
    n_outputs = train_Y.shape[1]
    w = np.random.rand(n_inputs, n_outputs) / 1000
    for epoch in range(epochs):
        print (epoch, end = " ")
        for row in range(n_samples):
            features = augmented[row]
            activation = sigmoid(np.matmul(features, w))
            error = activation - train_Y[row]
            # Outer product of the input row and the per-class error.
            w = w - learning_rate * np.matmul(features.T, error)
    print()
    return w

In [None]:
def train(train_X, train_Y, model, epochs, learning_rate):
    """Train the classifier selected by ``model`` and return its weights.

    Args:
        train_X: training feature matrix, forwarded to the trainer.
        train_Y: training target matrix, forwarded to the trainer.
        model: classifier selector; only 0 (logistic regression) is
            implemented.
        epochs: number of training epochs, forwarded to the trainer.
        learning_rate: update step size, forwarded to the trainer.

    Returns:
        The weight matrix returned by ``logisticRegression``.

    Raises:
        ValueError: if ``model`` is not a supported selector. Previously the
            function fell through and returned ``None``, which only failed
            later when the weights were used.
    """
    if 0 == model:
        return logisticRegression(train_X, train_Y, epochs, learning_rate)
    raise ValueError(
        "unsupported model selector %r: only 0 (logistic regression) is implemented"
        % (model,)
    )

In [None]:
def predict(inp , w):
    """Return the index of the largest entry of the product ``inp * w``."""
    scores = np.matmul(inp, w)
    winner = np.argmax(scores)
    return winner

In [None]:
def correct_tag(target):
    """Return the index of the largest entry of ``target`` (the set class of a one-hot row)."""
    tag_index = np.argmax(target)
    return tag_index

In [None]:
def validate(data_X, data_Y, w):
    """Return the percentage of rows whose predicted class matches the target.

    Each row of ``data_X`` gets a trailing 1 appended (the bias input) before
    it is passed to ``predict``; the result is compared with
    ``correct_tag`` of the matching row of ``data_Y``.

    Args:
        data_X: feature matrix, one row per sample; rows are expected to be
            1 x k (``np.matrix`` rows) so the bias can be stacked onto them.
        data_Y: target matrix with one row per sample.
        w: weight matrix including the bias row.

    Returns:
        Accuracy in percent. An empty ``data_X`` yields 0.0 instead of
        dividing by zero.
    """
    size = len(data_X)
    if size == 0:
        # No samples to score; avoid ZeroDivisionError below.
        return 0.0
    bias = np.ones((1, 1))
    correct = 0
    for index in range(size):
        inp = np.matrix(np.hstack((data_X[index], bias)))
        if predict(inp , w) == correct_tag(data_Y[index]):
            correct += 1
    return (correct / size)*100

In [None]:
# main function
def main():
    """Run the pipeline: read the data, embed the words, train, and validate.

    Prints the epoch counter during training, then the learned weights, then
    the validation accuracy in percent.
    """
    # Locations of the input files.
    train_path = "./training/train.txt"
    val_path = "./training/validation.txt"
    test_path = "./test/test_triples.txt"

    # Labeled splits are read with mode 0, the unlabeled test split with mode 1.
    train_rows, train_tags = readDataFiles(train_path, 0)
    val_rows, val_tags = readDataFiles(val_path, 0)
    test_rows = readDataFiles(test_path, 1)

    # Word2vec model built from the words of all three splits.
    embeddings = generateModel(train_rows, val_rows, test_rows)

    # Two similarity features per sample, two target classes.
    fv_size = 2
    no_class = 2
    features_train = getCosineVector(train_rows, embeddings, fv_size)
    targets_train = getClassVector(train_tags, no_class)

    # Train on the similarity features.
    # Selector 0 is logistic regression (1 was reserved for naive bayes).
    model = 0
    epochs = 100
    learning_rate = 0.03
    weights = train(features_train, targets_train, model, epochs, learning_rate)
    pprint (weights)

    # Score the learned weights on the validation split.
    features_val = getCosineVector(val_rows, embeddings, fv_size)
    targets_val = getClassVector(val_tags, no_class)
    accuracy = validate(features_val, targets_val, weights)
    print(accuracy)


In [None]:
# Run the pipeline only when this code is executed directly, not when imported.
if '__main__' == __name__:
    main()