In [1]:
from __future__ import division
import csv
import pandas as pd
import numpy as np
import matplotlib as mp
from collections import Counter
import random
import math

In [2]:
def openCSVFile(file):
    """
    Read a CSV file and return all of its rows.
    :param file: path of the CSV file to read
    :return: list of rows, each row being a list of string fields
    """
    # The csv module exposes `reader` (there is no `csvreader` attribute);
    # the with-block guarantees the handle is closed once the rows are read.
    with open(file) as f:
        return list(csv.reader(f))

In [3]:
def tokenizeData(data):
    """
    Split every row on single space characters.
    :param data: iterable of strings
    :return: list with one list of tokens per input row
    """
    return [line.split(" ") for line in data]

In [4]:
def cleanData(data,special_characters = [".",",",":",";","(",")","-","_"]):
    """
    Strip surrounding whitespace from each row and delete special characters.
    :param data: iterable of strings
    :param special_characters: substrings removed from every row (the default
        list is only read, never modified)
    :return: list of cleaned strings, in the same order as the input
    """
    def _scrub(text):
        # Whitespace is trimmed first, then each special substring is removed.
        text = text.strip()
        for token in special_characters:
            text = text.replace(token, '')
        return text

    return [_scrub(line) for line in data]

In [5]:
def openFile(file,clean=False,tokenize=False):
    """
    Read a text file and return its lines, optionally cleaned and tokenized.
    :param file: path of the text file to read
    :param clean: when True, pass the lines through cleanData
    :param tokenize: when True, pass the lines through tokenizeData
    :return: list of lines (strings), or list of token lists when tokenize is True
    """
    # Read everything inside a with-block so the file handle is always closed.
    with open(file) as f:
        raw_data = f.read()
    split_data = raw_data.split("\n")
    if clean:
        split_data = cleanData(split_data)
    if tokenize:
        split_data = tokenizeData(split_data)
    return split_data

In [6]:
def bag_of_words(data):
    """
    Count the occurrences of every token in each row.
    :param data: iterable of token sequences
    :return: list with one Counter per input row
    """
    return [Counter(tokens) for tokens in data]

Reading data

In [7]:
# Path of the training data file (relative path, resolved by open() in openFile).
file = "DocumentClassificationTrainingData.txt"

In [8]:
# Load the file as cleaned, space-tokenized rows.
data = openFile(file, clean=True, tokenize=True)

In [9]:
# Drop the first row -- presumably a header or blank line; TODO confirm against the file.
# NOTE: not idempotent; re-running this cell drops one more row each time.
data = data[1:]

Defining classifier functions for the sample

In [56]:
def getFeatures(data_):
    """
    Build the feature matrix: per-token counts plus the row length.
    :param data_: list of token lists
    :return: 2-D array with one row per input row; tokens missing from a row
        count as 0 and the row length is stored in the 'lenghts1' column
    """
    row_sizes = [len(tokens) for tokens in data_]
    counts = pd.DataFrame(bag_of_words(data_)).fillna(0)
    counts['lenghts1'] = row_sizes
    return counts.values

In [10]:
def binarizeLabels(target,other,data_):
    """
    Binarize labels: 1 for rows of the target class, 0 for rows of the other classes.
    Rows whose label (row[0]) is neither the target nor in `other` are skipped.
    :param target: label of the target class, is a string
    :param other: labels of the other classes, this is a list of strings
    :param data_: rows to process; the label is read from row[0]
    :return: binarized labels, true labels
    """
    kept = [row[0] for row in data_ if row[0] == target or row[0] in other]
    flags = [1 if label == target else 0 for label in kept]
    return flags,kept

In [47]:
def binarizeSampleData(target,other,data_):
    """
    Select the rows of the target/other classes and binarize their labels.
    Rows whose label (row[0]) is neither the target nor in `other` are skipped.
    :param target: label of the target class, is a string
    :param other: labels of the other classes, this is a list of strings
    :param data_: rows to process; row[0] is the label, row[1:] the sample
    :return: selected samples, binarized labels (1 = target, 0 = other), true labels
    """
    selected = [row for row in data_ if row[0] == target or row[0] in other]
    samples = [row[1:] for row in selected]
    labels = [row[0] for row in selected]
    flags = [1 if label == target else 0 for label in labels]
    return samples,flags,labels

In [12]:
def initializeW(hx):
    """
    Draw a random initial weight vector, one weight per feature.
    :param hx: feature matrix; the number of weights is len(hx[0])
    :return: 1-D array of uniform random values in [0, 1)
    """
    n_features = len(hx[0])
    return np.array(np.random.rand(n_features))

In [13]:
def logistic(x):
    """
    Numerically stable logistic (sigmoid) function 1 / (1 + e^-x).
    :param x: a real number
    :return: value in [0, 1]
    """
    if x < 0:
        # exp(x) <= 1 here, so it cannot overflow. Dividing instead of computing
        # 1 - 1/(1 + exp(x)) avoids the cancellation that rounded small
        # probabilities to exactly 0.0.
        z = math.exp(x)
        return z / (1.0 + z)
    # For x >= 0, exp(-x) <= 1, so this form cannot overflow either.
    return 1.0 / (1.0 + math.exp(-x))

In [14]:
def getProbabilities(hx,W):
    """
    Predicted probability of the positive class for every sample.
    :param hx: feature matrix, iterated row by row
    :param W: weight vector
    :return: array with logistic(row . W) for each row of hx
    """
    probs = []
    for sample in hx:
        probs.append(logistic(np.dot(sample, W)))
    return np.asarray(probs)

In [15]:
def getGradient_i(error,hx_j):
    """
    Gradient contribution of one feature: dot product of errors and feature values.
    :param error: vector of prediction errors
    :param hx_j: values of the feature, aligned with error
    :return: np.dot(error, hx_j)
    """
    return np.dot(error,hx_j)

In [16]:
def getLogLikelihood_i(y_i,yh_i):
    """
    Log likelihood of a single sample.
    :param y_i: true label; 1 means positive class, anything else negative
    :param yh_i: predicted probability of the positive class
    :return: log(yh_i) for a positive sample, log(1 - yh_i) otherwise; when that
        probability is not strictly positive, log(1e-10) is returned instead
    """
    floor = 0.0000000001  # stand-in probability used when the log is undefined
    prob = yh_i if y_i == 1 else 1 - yh_i
    try:
        return math.log(prob)
    except ValueError:
        # math.log raises ValueError for arguments <= 0. Only that case is
        # absorbed; type errors from bad inputs now surface instead of
        # silently becoming log(1e-10).
        return math.log(floor)

In [17]:
def compute_log_likelihood(hx, yx, W):
    """
    Log likelihood of the data under a logistic model, computed in vectorized form.
    :param hx: feature matrix, one row per sample
    :param yx: true labels (list or array); entries equal to +1 are positives
    :param W: weight vector
    :return: sum over samples of (indicator - 1) * score - log(1 + exp(-score))
    """
    # Convert first: comparing a plain list with +1 yields a single False
    # rather than an element-wise mask.
    indicator = (np.asarray(yx) == +1)
    scores = np.dot(hx, W)
    # logaddexp(0, -s) == log(1 + exp(-s)) without overflowing for very
    # negative scores, so no inf-patching step is needed afterwards.
    logexp = np.logaddexp(0, -scores)
    return np.sum((indicator - 1) * scores - logexp)

In [18]:
def getLogLikelihood(yx,yh):
    """
    Total log likelihood: sum of the per-sample log likelihoods.
    :param yx: true labels
    :param yh: predicted probabilities, aligned with yx
    :return: sum of getLogLikelihood_i over the paired entries
    """
    return sum(getLogLikelihood_i(label, prob) for label, prob in zip(yx, yh))

In [19]:
def setClass(Probs):
    """
    Turn probabilities into class labels by rounding each one with round().
    :param Probs: iterable of probabilities
    :return: list with the rounded value of every probability
    """
    return list(map(round, Probs))

In [20]:
def setMajorityClass(ProbsTable):
    """
    Assign to each row the class with the highest probability.
    :param ProbsTable: matrix with the probabilities for each row to each class
    :return: list with the 1-based class label of each row; ties go to the
        first maximum and an empty row gets label 1
    """
    def _winner(probs):
        best_pos, best = 0, None
        for pos, prob in enumerate(probs):
            # Skip anything that does not strictly beat the current best.
            if best is not None and not (best < prob):
                continue
            best_pos, best = pos, prob
        return best_pos + 1

    return [_winner(row) for row in ProbsTable]

In [164]:
def LogisticRegression(maxIter,hx,yx,text=True,learning_rate=0.0001):
    """
    Fit a logistic regression by batch gradient ascent on the log likelihood.
    :param maxIter: number of gradient-ascent iterations
    :param hx: features matrix, one row per sample (converted to a NumPy array)
    :param yx: true labels; entries equal to 1 are the positive class
    :param text: when True, print the log likelihood at every iteration
    :param learning_rate: step size of each weight update; defaults to the
        previously hard-coded 0.0001
    :return: probabilities (yh), logLikelihood (ll), both under the final weights
    """
    hx = np.asarray(hx)  # column products below need array semantics
    yx = np.asarray(yx)
    indicator = (yx==1)
    W = initializeW(hx)
    for x in range(maxIter):
        yh = getProbabilities(hx,W)
        error = indicator-yh
        # `error` is fixed within an iteration, so all weights can be updated
        # at once: dot(error, hx) holds every per-feature gradient.
        W += learning_rate*getGradient_i(error,hx)
        # Reported value is the likelihood of the probabilities that drove
        # this update (i.e. before the step), as in the iteration log.
        ll = getLogLikelihood(yx,yh)
        if(text):
            print('iteration ' + str(x) + ' ll =' + str(ll))
    # Re-evaluate with the final weights so the returned values match the
    # trained model and are defined even when maxIter is 0.
    yh = getProbabilities(hx,W)
    ll = getLogLikelihood(yx,yh)
    return yh,ll

In [22]:
def getPrecision(yx,yh):
    """
    Precision of binary predictions: true positives / predicted positives.
    :param yx: true labels (1 = positive)
    :param yh: predicted labels (1 = positive), aligned with yx
    :return: precision, or 0.0 when no sample was predicted positive
    """
    realPositives = 0
    falsePositives = 0
    for yi,yhi in zip(yx,yh):
        if yi == 1 and yhi == 1:
            realPositives+=1
        elif yi != 1 and yhi == 1:
            falsePositives+=1
    predictedPositives = realPositives + falsePositives
    if predictedPositives == 0:
        # No positive predictions: avoid ZeroDivisionError.
        return 0.0
    return realPositives / predictedPositives

In [23]:
def getRecall(yx,yh):
    """
    Recall of binary predictions: true positives / (true positives + false negatives).
    :param yx: true labels (1 = positive)
    :param yh: predicted labels (1 = positive, 0 = negative), aligned with yx
    :return: recall, or 0.0 when no true positive or false negative was counted
    """
    realPositives = 0
    falseNegatives = 0
    for yi,yhi in zip(yx,yh):
        if yi == 1 and yhi == 1:
            realPositives+=1
        elif yi == 1 and yhi == 0:
            falseNegatives+=1
    actualPositives = realPositives + falseNegatives
    if actualPositives == 0:
        # No positives to recall: avoid ZeroDivisionError.
        return 0.0
    return realPositives / actualPositives

Creating table to save probabilities for each target:

In [154]:
# Labels of the two classes compared in this binary experiment; both are
# strings because they are matched against row[0] of the tokenized rows.
target = '1'
other = '2'

In [155]:
# Keep only the rows labelled `target` or `other`, separating the label
# (row[0]) from the tokens (row[1:]).
binary_data = []
y = []
for row in data:
    if row[0] in (target, other):
        y.append(row[0])
        binary_data.append(row[1:])

In [156]:
# The first 100 filtered rows form the training sample.
trainingSetHx = binary_data[:100]
trainingSetYx = y[:100]

In [157]:
# Token-count table for the training sample (missing tokens count as 0),
# with the number of tokens per row appended as the 'lenghts' column.
lenghts = [len(tokens) for tokens in trainingSetHx]
b_words = bag_of_words(trainingSetHx)
Df = pd.DataFrame(b_words).fillna(0)
Df['lenghts'] = lenghts

In [158]:
# Feature matrix as a NumPy array, and the string labels converted to integers.
hx = Df.values
yx2 = np.asarray(list(map(int, trainingSetYx)))

In [159]:
# Train on the labels built in the previous cell (yx2). The bare name `yx` is
# only assigned further down the notebook, so using it here fails on a fresh
# kernel.
yh,ll = LogisticRegression(1000,hx,yx2,True)

iteration 0 ll =-725.9176158109316
iteration 1 ll =-726.750543007403
iteration 2 ll =-693.1780501997994
iteration 3 ll =-342.1899067419255
iteration 4 ll =-713.1502961947745
iteration 5 ll =-682.9140601167718
iteration 6 ll =-697.1778161403608
iteration 7 ll =-849.3698953575367
iteration 8 ll =-624.5082349240688
iteration 9 ll =-944.057512837912
iteration 10 ll =-548.5626504840673
iteration 11 ll =-1045.7770126711573
iteration 12 ll =-461.0481052739916
iteration 13 ll =-1148.3643542387251
iteration 14 ll =-343.51935108250876
iteration 15 ll =-1191.0430220468681
iteration 16 ll =-208.69480983108247
iteration 17 ll =-1071.6711135309235
iteration 18 ll =-415.8678382868238
iteration 19 ll =-1153.2238576167967
iteration 20 ll =-220.64674183745785
iteration 21 ll =-1153.230816390588
iteration 22 ll =-193.37483769591438
iteration 23 ll =-960.6948061509682
iteration 24 ll =-472.0101912029022
iteration 25 ll =-1065.0995331842228
iteration 26 ll =-331.6572197770391
iteration 27 ll =-1130.3057951

In [161]:
# Label array for the manual training loop below. Built from yx2, because
# `yx` is not assigned until a later cell.
yx_ = np.asarray(yx2)

In [162]:
# Random starting weights, one per column of hx, for the manual loop below.
W = initializeW(hx)

In [163]:
# Cell-level copy of the gradient-ascent loop inside LogisticRegression().
for x in range(1000):
    yh = getProbabilities(hx,W)
    indicator = (yx_==1)
    error = (indicator-yh)
    for j,_ in enumerate(W):
        gradient_j = getGradient_i(error,hx[:,j])
        W[j] = W[j] + 0.0001*gradient_j
    # Score against yx_, the same labels the update above is computed from;
    # the bare name `yx` is not assigned until a later cell.
    ll = getLogLikelihood(yx_,yh)
    print('iteration ' + str(x) + ' ll =' + str(ll))

iteration 0 ll =-741.4343712919531
iteration 1 ll =-680.5576978686058
iteration 2 ll =-246.21957309510128
iteration 3 ll =-679.6204962383571
iteration 4 ll =-651.195028218994
iteration 5 ll =-652.8151662308499
iteration 6 ll =-810.0976972511938
iteration 7 ll =-570.0441214687778
iteration 8 ll =-901.527252259717
iteration 9 ll =-477.2052815891355
iteration 10 ll =-1011.2034400538984
iteration 11 ll =-379.6118041321562
iteration 12 ll =-1104.6580298337205
iteration 13 ll =-289.60156548706647
iteration 14 ll =-1131.9985167717332
iteration 15 ll =-248.6841062944815
iteration 16 ll =-1091.4704081756418
iteration 17 ll =-271.5320644450271
iteration 18 ll =-1108.2765890921078
iteration 19 ll =-239.86373194261174
iteration 20 ll =-1067.53610048954
iteration 21 ll =-279.7655721898195
iteration 22 ll =-1085.7672285123888
iteration 23 ll =-229.4880917139254
iteration 24 ll =-1039.5805044636604
iteration 25 ll =-280.4512492172867
iteration 26 ll =-1063.7752893523718
iteration 27 ll =-218.69806208

Using the functions

In [165]:
# Same experiment through the helper functions. `others` is a list because
# binarizeSampleData tests membership with `row[0] in other`.
target = '1'
others = ['2']

In [166]:
# Filter `data` to the target/others classes: token lists, 0/1 labels and the
# original labels. Note that this rebinds `yx` and `y`.
binarized_data, yx,y = binarizeSampleData(target,others,data)

In [167]:
# First 100 filtered rows as the sample; labels are coerced to int.
HXsample = binarized_data[:100]
YXsample = yx[:100]
yx = list(map(int, YXsample))

In [168]:
# Feature matrix for the sample: token counts plus a row-length column.
hx = getFeatures(HXsample)

In [169]:
# Train for 1000 iterations, printing the log likelihood at every iteration.
y_h,ll = LogisticRegression(1000,hx,yx,True)

iteration 0 ll =-742.4877133733559
iteration 1 ll =-715.2856382726836
iteration 2 ll =-354.8521411828229
iteration 3 ll =-1198.8094232464202
iteration 4 ll =-399.64098625036297
iteration 5 ll =-1211.5545700197852
iteration 6 ll =-372.3119060892929
iteration 7 ll =-1207.8725498495762
iteration 8 ll =-353.7088607007291
iteration 9 ll =-1202.668850124149
iteration 10 ll =-348.95536963096475
iteration 11 ll =-1196.228350492097
iteration 12 ll =-332.0604867404738
iteration 13 ll =-1188.523216035317
iteration 14 ll =-329.91367725018574
iteration 15 ll =-1180.3252365299202
iteration 16 ll =-314.71227155957706
iteration 17 ll =-1171.4896224661682
iteration 18 ll =-301.4417838884745
iteration 19 ll =-1161.7910274664098
iteration 20 ll =-290.0471356808265
iteration 21 ll =-1151.710309057709
iteration 22 ll =-280.21228981113694
iteration 23 ll =-1141.128169987752
iteration 24 ll =-271.8655472720127
iteration 25 ll =-1130.1138507083742
iteration 26 ll =-264.69189133343457
iteration 27 ll =-1118.69