In [201]:
from __future__ import division
import csv
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import random
import math
import time

In [2]:
def openCSVFile(file):
    f = open(file)
    csvreader = csv.csvreader(f)
    file_data = list(csvreader)
    return file_data

In [3]:
def tokenizeData(data):
    split_data = []
    for row in data:
        split_data.append(row.split(" "))
    return split_data    

In [4]:
def cleanData(data,special_characters = [".",",",":",";","(",")","-","_"]):   
    data_ =[]
    for row in data:
        row= row.strip()
        for sc in special_characters:
            row = row.replace(sc,'')
        data_.append(row)
    return data_

In [5]:
def openFile(file,clean=False,tokenize=False):
    f = open(file)
    raw_data = f.read()        
    split_data = raw_data.split("\n")
    if(clean):
        split_data = cleanData(split_data)
    if(tokenize):
        split_data = tokenizeData(split_data)
    return split_data

In [6]:
def bag_of_words(data):
    bag_of_words = []
    for row in data:
        bag_of_words.append(Counter(row))
    return bag_of_words

Reading data

In [7]:
file = "DocumentClassificationTrainingData.txt"

In [8]:
data = openFile(file,True,True)

In [9]:
data = data[1:]

Defining Classificator functions for sample

In [56]:
def getFeatures(data_):
    lenghts = [len(row)for row in data_]
    bag = bag_of_words(data_)
    Df = pd.DataFrame(bag)
    Df = Df.fillna(0)
    Df['lenghts1'] = lenghts
    return Df.values

In [10]:
def binarizeLabels(target,other,data_):
    """
Function to binarize data (set 1 or 0 if it's in the target class)
    :param target: label of the target class, is a string
    :param other: label of the other class, this is a list of strings
    :param data: data to be binarized
    :return: binarized labels, true labels
    """
    yx = []
    y = []
    for row in data_:
        if row[0] == target or row[0] in other:
            if(row[0]== target):
                yx.append(1)
            else:
                yx.append(0)
            y.append(row[0])
    return yx,y

In [47]:
def binarizeSampleData(target,other,data_):
    """
Function to binarize data (set 1 or 0 if it's in the target class)
    :param target: label of the target class, is a string
    :param other: label of the other class, this is a list of strings
    :param data: data to be binarized
    :return: binarized data,binarized labels, true labels
    """
    hx = []
    yx = []
    y = []
    for row in data_:
        if row[0] == target or row[0] in other:
            if(row[0]== target):
                yx.append(1)
            else:
                yx.append(0)
            y.append(row[0])
            hx.append(row[1:])
    return hx,yx,y

In [12]:
def initializeW(hx):
    W = np.random.rand(len(hx[0]))
    return np.array(W)

In [13]:
def logistic(x):
    if(x < 0):
        return 1.0 - (1.0/(1+math.exp(x)))
    else:
        return (1.0/(1+math.exp(-x)))

In [14]:
def getProbabilities(hx,W):
    return np.asarray([logistic(np.dot(hx_i,W)) for hx_i in hx])

In [15]:
def getGradient_i(error,hx_j):    
    gradient = np.dot(error,hx_j)
    return gradient

In [16]:
def getLogLikelihood_i(y_i,yh_i):
    try:
        if(y_i==1):
            return math.log(yh_i)
        else:
            return math.log(1-yh_i)
    except Exception:
        return math.log(0.0000000001)

In [17]:
def compute_log_likelihood(hx, yx, W):
    indicator = (yx==+1)
    scores = np.dot(hx, W)
    logexp = np.log(1. + np.exp(-scores))
    
    # Simple check to prevent overflow
    mask = np.isinf(logexp)
    logexp[mask] = -scores[mask]
    
    lp = np.sum((indicator-1)*scores - logexp)
    return lp

In [18]:
def getLogLikelihood(yx,yh):
    return sum([getLogLikelihood_i(y_i,yh_i) for y_i,yh_i in list(zip(yx,yh))])

In [19]:
def setClass(Probs):
    return [round(yx) for yx in Probs]

In [221]:
def setMajorityClass(ProbsTable,headers):
    """
Function to set the predominant class to data rows.
    :param ProbsTable: matrix with the probabilities for each row to each class
    :param headers: array with column headers from the ProbsTable
    :return: Vector with class label for each row
    """
    headers = headers
    rowCls = []
    for row in ProbsTable:
        columnCls = 0
        maxProb = None
        for i,clsProb in enumerate(row):
            if maxProb is None or maxProb < clsProb:
                maxProb = clsProb
                columnCls = i
        rowCls.append(headers[columnCls])
    return rowCls

In [164]:
def LogisticRegression(maxIter,hx,yx,text=True):
    """
Function to implement Logistic Regression, takes feature matrix, true labels array (must be an array)
    :param maxIter: Max number of iterations
    :param hx: features matrix
    :param yx: true labels array
    :text: active text notifications to watch log likelihood trought iterations
    :return: probabilities (yh), logLikelihood(ll)
    """
    yx = np.asarray(yx)
    W = initializeW(hx)
    for x in range(maxIter):
        yh = getProbabilities(hx,W)
        indicator = (yx==1)
        error = indicator-yh
        for j,_ in enumerate(W):
            gradient_j = getGradient_i(error,hx[:,j])
            W[j] = W[j] + 0.0001*gradient_j
        #ll = compute_log_likelihood(hx,yx,W)
        ll = getLogLikelihood(yx,yh)
        if(text):
            print('iteration ' + str(x) + ' ll =' + str(ll))
    return yh,ll

In [228]:
def getPrecision(yx,yh):
    realPositives = 0
    falsePositives = 0
    for yi,yhi in zip(yx,yh):
        if yi == 1 and yhi == 1:
            realPositives+=1
        elif yi != 1 and yhi == 1:
            falsePositives+=1
    try:
        return realPositives / (realPositives + falsePositives)
    except Exception:
        return 0

In [229]:
def getRecall(yx,yh):
    realPositives = 0
    falseNegatives = 0
    for yi,yhi in zip(yx,yh):
        if yi == 1 and yhi == 1:
            realPositives+=1
        elif yi == 1 and yhi == 0:
            falseNegatives+=1
    try:
        return realPositives / (realPositives + falseNegatives)
    except Exception:
        return 0

In [236]:
def getClassificationError(C_h,y):
    C_h_ = np.asarray(C_h)
    error = (C_h_!=y)
    return error

Creating table to save probabilities for each target:

In [242]:
ProbPerClass = pd.DataFrame()
AllClasses = set(['1','2','3','4','5','6','7','8'])
for target in AllClasses:
    others = (AllClasses - set(target))
    binarized_data, yx_,y = binarizeSampleData(target,others,data)
    HXsample,YXsample = binarized_data[:5000],yx_[:5000]
    hx = getFeatures(HXsample)
    yx = YXsample
    y_h,ll = LogisticRegression(1500,hx,yx,False)
    ProbPerClass[target] = y_h
    y_hc = setClass(y_h)
    recall = getRecall(y_hc,yx)
    presicion = getPrecision(y_hc,yx)
    print('-------------------')
    print('Target : ' + str(target))
    print('Precision : ' + str(presicion))
    print('Recall : ' + str(recall))

-------------------
Target : 3
Precision : 0.9298245614035088
Recall : 0.8514056224899599
-------------------
Target : 2
Precision : 0.9806228373702423
Recall : 0.7319214876033058


KeyboardInterrupt: 

In [231]:
ProbPerClass['Label'] = setMajorityClass(ProbPerClass.values,ProbPerClass.columns)

In [232]:
ProbPerClass

Unnamed: 0,3,2,4,7,1,8,5,6,Label
0,0.000000e+00,1.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,8.881784e-16,0.000000e+00,2
1,0.000000e+00,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2
2,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,4.174439e-13,0.000000e+00,1
3,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1
4,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,7.549517e-15,0.000000e+00,1
5,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1
6,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1
7,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1
8,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1
9,0.000000e+00,0.000000,6.090547e-07,7.105427e-15,1.000000e+00,0.000000e+00,6.263070e-06,0.000000e+00,1


In [238]:
error = getClassificationError(ProbPerClass['Label'].values,y[:4000])

In [240]:
sum(error)

590

In [241]:
590/4000

0.1475