In [1]:
import numpy as np
import random
from gurobipy import *

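# Parameters of robust_model(location, cc, ci):
#   location - path to the comma-separated data file
#   cc       - index of the column that holds the class label
#   ci       - class identifier to separate from all the other classes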

def _load_clean_data(location):
    """Load a comma-separated file and return its complete rows as float64.

    Rows containing the missing-value marker '?' in any column are dropped.
    """
    # ndmin=2 keeps a single-row file two-dimensional
    raw = np.loadtxt(location, delimiter=',', dtype=str, ndmin=2)

    # If breast cancer data, delete id column
    if 'breast' in location:
        raw = raw[:, 1:]

    # A row is dropped once, no matter how many '?' it contains
    complete = ~np.any(raw == '?', axis=1)
    return raw[complete].astype('float64')


def _split_by_class(samples, class_col, ci):
    """Split samples into (rows whose class equals ci, all remaining rows)."""
    is_target = samples[:, class_col] == ci
    return samples[is_target], samples[~is_target]


def robust_model(location, cc, ci):
    """Train and test a one-vs-rest linear separator with a Gurobi LP.

    The data is shuffled and split 8 : 2 into a train and a test set. A
    hyperplane ``w . x = r`` is fitted on the train set by minimising the
    sum of the mean violations of class ``ci`` and of the other classes,
    then the share of correctly classified test samples is printed.

    Parameters:
        location: path to the comma-separated data file.
        cc: index of the class column (negative indices are allowed).
        ci: class identifier to separate from all the other classes.

    Returns:
        The success rate on the test set, in percent.

    Raises:
        IndexError: if ``cc`` is not a valid column index.
        ValueError: if the test split is empty, or the train split lacks
            samples of class ``ci`` or of the other classes.
        RuntimeError: if Gurobi does not reach an optimal solution.
    """
    ## Pretreat data
    data = _load_clean_data(location)

    # Normalise cc (it may be negative) and use every other column as a feature,
    # so the class label itself is never fed to the classifier
    n_cols = data.shape[1]
    class_col = range(n_cols)[cc]
    features = [j for j in range(n_cols) if j != class_col]
    l = len(features)  # Dimension of data

    # Divide the data to train set and test set by 8 : 2
    np.random.shuffle(data)
    n_train = int(len(data) * .8)
    train, test = data[:n_train], data[n_train:]
    if len(test) == 0:
        raise ValueError('Not enough complete samples in {} to build a test set'.format(location))

    ## Start train
    # Divide train set into class ci and the others
    Atrain, Btrain = _split_by_class(train, class_col, ci)
    lA, lB = len(Atrain), len(Btrain)  # the number of samples of Atrain and Btrain
    if lA == 0 or lB == 0:
        # The objective averages over both groups, so neither may be empty
        raise ValueError(
            'Training split must contain samples of both class {} and the other classes'.format(ci))

    # Declare model with gurobi and silence the solver log
    m = Model()
    m.setParam(GRB.Param.OutputFlag, 0)

    # Threshold of the hyperplane; Gurobi's default lower bound is 0, so it
    # is made free explicitly to allow negative thresholds
    r = m.addVar(vtype=GRB.CONTINUOUS, name='r', lb=-GRB.INFINITY)

    # Violation (error) variables of each group and the weight vector
    y = [m.addVar(vtype=GRB.CONTINUOUS, name='y[{}]'.format(i), lb=0) for i in range(lA)]
    z = [m.addVar(vtype=GRB.CONTINUOUS, name='z[{}]'.format(i), lb=0) for i in range(lB)]
    w = [m.addVar(vtype=GRB.CONTINUOUS, name='w[{}]'.format(i), lb=-100, ub=100) for i in range(l)]
    m.update()

    # Object function (Minimize sum of mean of errors)
    m.setObjective(sum(y) / lA + sum(z) / lB, GRB.MINIMIZE)

    # Margin that keeps the two groups strictly apart
    smallnum = 1.e-3

    # y constraint: class ci should lie above the hyperplane
    for i in range(lA):
        m.addConstr(sum(Atrain[i, j] * w[k] for k, j in enumerate(features)) - r + y[i] >= smallnum,
                    name='cty[{}]'.format(i))

    # z constraint: the other classes should lie below the hyperplane
    for i in range(lB):
        m.addConstr(sum(Btrain[i, j] * w[k] for k, j in enumerate(features)) - r - z[i] <= -smallnum,
                    name='ctz[{}]'.format(i))

    # Update and Optimize model
    m.update()
    m.optimize()
    if m.status != GRB.OPTIMAL:
        raise RuntimeError('Gurobi did not find an optimal solution (status {})'.format(m.status))

    ## Start test
    weights = np.array([var.x for var in w])
    threshold = r.x

    Atest, Btest = _split_by_class(test, class_col, ci)

    # A class-ci sample fails below the threshold, any other sample fails above it
    fail = int(np.count_nonzero(np.dot(Atest[:, features], weights) < threshold))
    fail += int(np.count_nonzero(np.dot(Btest[:, features], weights) > threshold))

    success_rate = (len(test) - fail) * 100 / len(test)
    print()
    print('The success rate of classification by class identifier', ci, 'and the others:', success_rate, '%')
    return success_rate

# Breast cancer: separate each of the two class identifiers from the rest
location = 'data/breast-cancer-wisconsin.data'
for class_id in (2, 4):
    robust_model(location, -1, class_id)

# Wine: the class label is the first column
location = 'data/wine.data'
for class_id in (1, 2, 3):
    robust_model(location, 0, class_id)

# Heart disease: one run per class identifier 0..4
location = 'data/processed.cleveland.data'
for class_id in range(5):
    robust_model(location, -1, class_id)

Academic license - for non-commercial use only

The success rate of classification by class identifier 2 and the others: 87.5912408759124 %

The success rate of classification by class identifier 4 and the others: 95.62043795620438 %

The success rate of classification by class identifier 1 and the others: 94.44444444444444 %

The success rate of classification by class identifier 2 and the others: 83.33333333333333 %

The success rate of classification by class identifier 3 and the others: 97.22222222222223 %

The success rate of classification by class identifier 0 and the others: 81.66666666666667 %

The success rate of classification by class identifier 1 and the others: 61.666666666666664 %

The success rate of classification by class identifier 2 and the others: 63.333333333333336 %

The success rate of classification by class identifier 3 and the others: 76.66666666666667 %

The success rate of classification by class identifier 4 and the others: 88.33333333333333 %
