In [1]:
from __future__ import division
import csv
import pandas as pd
import numpy as np
import matplotlib as mp
from collections import Counter
import random
import math

In [2]:
def openCSVFile(file):
    """Read a CSV file and return its rows.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.

    Returns
    -------
    list of list of str
        One inner list per CSV record, in file order.
    """
    # csv.reader is the stdlib parser (the csv module has no `csvreader`
    # attribute). newline="" lets the csv module handle line endings itself,
    # and the context manager guarantees the handle is closed.
    with open(file, newline="") as f:
        return list(csv.reader(f))

In [3]:
def tokenizeData(data):
    """Split every string in `data` on single spaces.

    Returns a list with one list of tokens per input string. Consecutive
    spaces yield empty-string tokens, because the split is on " " exactly.
    """
    return [line.split(" ") for line in data]

In [4]:
def cleanData(data,special_characters = [".",",",":",";","(",")","-","_"]):   
    """Strip each string and delete every occurrence of the given substrings.

    Each element of `data` is first stripped of surrounding whitespace, then
    every entry of `special_characters` is removed from it. Returns a new
    list; `data` itself is not modified.
    """
    def _scrub(text):
        # Strip first, then remove the unwanted substrings one at a time.
        text = text.strip()
        for unwanted in special_characters:
            text = text.replace(unwanted, '')
        return text

    return [_scrub(line) for line in data]

In [5]:
def openFile(file,clean=False,tokenize=False):
    """Read a text file and return its lines, optionally cleaned and tokenized.

    Parameters
    ----------
    file : str
        Path of the text file to read.
    clean : bool, optional
        When true, pass the lines through `cleanData`.
    tokenize : bool, optional
        When true, pass the (possibly cleaned) lines through `tokenizeData`,
        so each line becomes a list of tokens.

    Returns
    -------
    list
        The file content split on "\\n". A file that ends with a newline
        therefore yields a final empty string.
    """
    # The context manager closes the handle even if reading fails.
    with open(file) as f:
        raw_data = f.read()
    split_data = raw_data.split("\n")
    if clean:
        split_data = cleanData(split_data)
    if tokenize:
        split_data = tokenizeData(split_data)
    return split_data

In [6]:
def bag_of_words(data):
    """Count the items of each row.

    Returns a list holding one `Counter` per row of `data`, in the same order.
    """
    return [Counter(tokens) for tokens in data]

Reading data

In [7]:
# Path of the training text file; it is handed to openFile(), which opens it as given.
file = "DocumentClassificationTrainingData.txt"

In [8]:
# Load the file with cleaning and tokenizing both enabled: one list of tokens per line.
data = openFile(file,True,True)

In [9]:
# The two class labels (as strings) kept for the binary problem.
target = "1"
other = "2"
# Drop the first row (presumably a header line - TODO confirm against the data file).
# NOTE: not idempotent - re-running this cell drops one more row each time.
data = data[1:]

Creating features

In [10]:
# Keep only the rows whose first token is one of the two chosen labels.
# The first token becomes the label; the remaining tokens become the example.
binary_data = []
y = []
for row in data:
    if row[0]== target or row[0] == other:
        binary_data.append(row[1:])
        y.append(row[0])

In [11]:
# Training set: the first 100 filtered examples and their labels.
trainingSetHx,trainingSetYx = binary_data[:100],y[:100]

In [88]:
# Number of tokens in each training row ("lenghts" is a misspelling of "lengths";
# the name is kept because a later cell reads it).
lenghts = [len(row) for row in trainingSetHx]

In [12]:
# One token-count Counter per training row.
b_words = bag_of_words(trainingSetHx)

In [13]:
# One DataFrame row per training example, one column per distinct token.
Df = pd.DataFrame(b_words)

In [86]:
# Replace missing values (tokens that do not occur in a row) with 0.
Df = Df.fillna(0)

In [89]:
# Add each row's token count as an extra feature column.
Df["lenght"] = lenghts

In [90]:
# Feature matrix as a NumPy array, one row per training example.
hx = Df.values
# Labels converted from strings to integers.
yx = np.asarray([int(y) for y in trainingSetYx])

Defining classifier functions

In [16]:
def initializeW(hx):
    """Create a random initial weight vector.

    The vector has one entry per element of the first row of `hx`, drawn by
    `np.random.rand` (uniform on [0, 1)).
    """
    n_weights = len(hx[0])
    return np.random.rand(n_weights)

In [22]:
def logistic(x):
    """Return the logistic (sigmoid) function 1 / (1 + e^-x) of a scalar.

    The value is computed in a numerically stable way, so very large negative
    or positive inputs return 0.0 or 1.0 instead of raising OverflowError.
    """
    if x >= 0:
        # exp(-x) <= 1 here, so it cannot overflow.
        return 1.0/(1.0 + math.exp(-x))
    # For negative x, exp(-x) may overflow; use the equivalent e^x / (1 + e^x),
    # where exp(x) < 1.
    exp_x = math.exp(x)
    return exp_x/(1.0 + exp_x)

In [23]:
def getProbabilities(hx,W):
    """Return the predicted probability of each row of `hx`.

    Each row's probability is `logistic` applied to the dot product of the
    row with the weight vector `W`. The result is a NumPy array.
    """
    probabilities = []
    for features in hx:
        score = np.dot(features, W)
        probabilities.append(logistic(score))
    return np.asarray(probabilities)

In [24]:
def getGradient_i(error,hx_j):    
    """Return the dot product of the error vector with one feature column."""
    return np.dot(error, hx_j)

In [26]:
def getLogLikelihood_i(y_i,yh_i):
    """Return the log-likelihood contribution of a single example.

    This is log(yh_i) when the label `y_i` equals 1 and log(1 - yh_i)
    otherwise. If that computation raises (for instance the logarithm of
    zero), log(1e-10) is returned instead.
    """
    try:
        likelihood = yh_i if y_i == 1 else 1 - yh_i
        return math.log(likelihood)
    except Exception:
        return math.log(0.0000000001)

In [69]:
def compute_log_likelihood(hx, yx, W):
    """Return the log-likelihood of logistic-regression weights `W`.

    Parameters
    ----------
    hx : array-like
        Feature matrix, one row per example.
    yx : array-like
        Labels; entries equal to +1 are the positive class, every other
        value is treated as the negative class.
    W : array-like
        Weight vector, one entry per feature column.

    Returns
    -------
    float
        sum_i [ (1[y_i == +1] - 1) * s_i - log(1 + exp(-s_i)) ], where
        s_i is the dot product of row i with W.
    """
    indicator = (np.asarray(yx) == +1)
    scores = np.dot(hx, W)

    # log(1 + exp(-s)) computed as logaddexp(0, -s): stable for large |s|,
    # so no overflow warning and no after-the-fact patching of inf values.
    logexp = np.logaddexp(0, -scores)

    lp = np.sum((indicator-1)*scores - logexp)
    return lp

In [27]:
def getLogLikelihood(yx,yh):
    """Return the total log-likelihood of predictions `yh` for labels `yx`.

    The per-example terms come from `getLogLikelihood_i`; labels and
    predictions are paired in order, stopping at the shorter sequence.
    """
    total = 0
    for y_i, yh_i in zip(yx, yh):
        total += getLogLikelihood_i(y_i, yh_i)
    return total

In [28]:
def setClass(yh):
    """Round each value of `yh` with the built-in `round`.

    Returns the rounded values as a NumPy array.
    """
    labels = []
    for probability in yh:
        labels.append(round(probability))
    return np.asarray(labels)

In [108]:
def getPrecision(yx,yh):
    """Return the precision of predictions `yh` against true labels `yx`.

    A prediction counts as positive when it equals 1. It is a true positive
    when the true label also equals 1, and a false positive otherwise.

    Returns
    -------
    float
        true positives / (true positives + false positives), or 0.0 when
        nothing was predicted positive (instead of raising ZeroDivisionError).
    """
    truePositives = 0
    falsePositives = 0
    for yi,yhi in zip(yx,yh):
        if yi == 1 and yhi == 1:
            truePositives+=1
        elif yi != 1 and yhi == 1:
            falsePositives+=1
    predictedPositives = truePositives + falsePositives
    if predictedPositives == 0:
        # No positive predictions: precision is undefined, report 0.0.
        return 0.0
    return truePositives / float(predictedPositives)

In [105]:
def getRecall(yx,yh):
    """Return the recall of predictions `yh` against true labels `yx`.

    An example with true label 1 is a true positive when its prediction
    equals 1 and a false negative when its prediction equals 0.

    Returns
    -------
    float
        true positives / (true positives + false negatives), or 0.0 when
        no such examples were counted (instead of raising ZeroDivisionError).
    """
    truePositives = 0
    falseNegatives = 0
    for yi,yhi in zip(yx,yh):
        if yi == 1 and yhi == 1:
            truePositives+=1
        elif yi == 1 and yhi == 0:
            falseNegatives+=1
    actualPositives = truePositives + falseNegatives
    if actualPositives == 0:
        # No positive examples counted: recall is undefined, report 0.0.
        return 0.0
    return truePositives / float(actualPositives)

Training and evaluating the classifier

In [91]:
# Random starting weights, one per feature column of hx.
W = initializeW(hx)

In [94]:
# Batch gradient ascent on the log-likelihood: 1000 passes, fixed step size 0.0001.
# NOTE(review): the recorded log-likelihood oscillates between passes, which suggests
# the step is too large for these unscaled count features - confirm and tune.
for x in range(1000):
    # Predicted probabilities under the current weights.
    yh = getProbabilities(hx,W)
    # True where the label equals 1; every other label counts as 0.
    indicator = (yx==1)
    error = (indicator-yh)
    # Update each weight from its own feature column; `error` stays fixed for
    # the whole pass, so all weights are updated from the same predictions.
    for j,_ in enumerate(W):
        gradient_j = getGradient_i(error,hx[:,j])
        W[j] = W[j] + 0.0001*gradient_j
    #ll = compute_log_likelihood(hx,yx,W)
    # Log-likelihood of `yh`, i.e. of the weights as they were before this pass's update.
    ll = getLogLikelihood(yx,yh)
    print('iteration ' + str(x) + ' ll =' + str(ll))         
    

iteration 0 ll =-735.4132917096227
iteration 1 ll =-704.0331168858949
iteration 2 ll =-677.1743225288158
iteration 3 ll =-696.1176350778385
iteration 4 ll =-914.1410234810098
iteration 5 ll =-661.5635404856667
iteration 6 ll =-1130.028806021778
iteration 7 ll =-584.9129352926503
iteration 8 ll =-1327.4739758187925
iteration 9 ll =-511.36252913471117
iteration 10 ll =-1498.7833512151003
iteration 11 ll =-444.7813492809786
iteration 12 ll =-1631.551278175281
iteration 13 ll =-413.23667240294634
iteration 14 ll =-1708.8709278441158
iteration 15 ll =-341.9062388723794
iteration 16 ll =-1722.8131847265136
iteration 17 ll =-322.09638993814707
iteration 18 ll =-1719.1389273446725
iteration 19 ll =-305.96404352158754
iteration 20 ll =-1717.2919056498572
iteration 21 ll =-288.2583519385584
iteration 22 ll =-1712.8675860364742
iteration 23 ll =-273.41432251056995
iteration 24 ll =-1707.5425988211405
iteration 25 ll =-259.733413031412
iteration 26 ll =-1700.8360682754037
iteration 27 ll =-247.681

In [99]:
# Turn the probabilities from the last training pass into class labels by rounding.
yh_c = setClass(yh)

In [110]:
# Precision measured on the same rows that were used for training.
precision = getPrecision(yx,yh_c)
print(precision)

0.9855072463768116


In [111]:
# Recall measured on the same rows that were used for training.
recall = getRecall(yx,yh_c)
print(recall)

1.0
