In [181]:
import numpy as np
import pandas as pd
import cvxpy as cp

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

from sklearn.base import BaseEstimator, ClassifierMixin

In [182]:
import warnings
warnings.filterwarnings('ignore')

### HR


In [239]:
data = pd.read_csv('dataHR.csv')

target_column = 'Attrition'
columns_to_dummies = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus', ]
data['OverTime'] = (data['OverTime'] == 'Yes').astype(int)
data['Gender'] = (data['Gender'] == 'Female').astype(int)
data['Over18'] = (data['Over18'] == 'Y').astype(int)

y = data[target_column]
X = data.loc[:, data.columns != target_column]
X = pd.get_dummies(X, dtype=float, columns=columns_to_dummies, drop_first=True)


y.replace('Yes', 1, inplace=True)
y.replace('No', -1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [240]:
# logreg
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.5384615384615384
0.9021739130434783


In [185]:
class SLIM(BaseEstimator, ClassifierMixin):

    def __init__(self, C0=0.05, C1=None, values = np.array([-5, -1, 1, 5]), intercept_values=None, w_pos=1.):
        self.C0 = C0
        self.C1 = C1
        self.values = values
        self.w_pos = w_pos
        if intercept_values is not None:
            self.intercept_values = intercept_values
        else:
            self.intercept_values = values


    def fit(self, X, y, **solver_flags):
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
            y = y.to_numpy()

        N = len(y)
        S = X * y[:, None]

        w = np.ones(len(y))
        w[y == 1] *= self.w_pos

        L = 100

        lamb = cp.Variable((X.shape[1], len(self.values)), boolean=True)
        intercept_lamb = cp.Variable(len(self.intercept_values), boolean=True)
        #beta = cp.Variable(X.shape[1], boolean=True)

        cons = [lamb.sum(1) <= 1, intercept_lamb.sum() <= 1]

        loss = cp.sum(cp.multiply(w, cp.pos(1-S @ (lamb @ self.values + intercept_lamb @ self.intercept_values))))

        if self.C1:
            gamma = cp.Variable(X.shape[1], nonneg=True)
            reg = self.C0 * cp.sum(lamb) + self.C1 * cp.sum(gamma)
            cons.extend([lamb @ self.values <= gamma, lamb @ self.values >= -gamma])
        else:
            reg = self.C0 * cp.sum(lamb)



        obj = cp.Minimize(loss / w.sum() + reg)
        problem = cp.Problem(obj, cons)
        problem.solve(**solver_flags)
        self.params_ = np.rint(lamb.value @ self.values)
        self.intercept_ = np.rint(intercept_lamb.value @ self.intercept_values)


    def predict(self,X):
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        y = np.ones(len(X))
        y[X @ self.params_ + self.intercept_ > 0] = 1
        y[X @ self.params_ + self.intercept_ <= 0] = -1
        return y

In [None]:
ub = 1 /  (X_train.shape[0] * X_train.shape[1])
values = np.array([1, 2, 3])
values = np.hstack([-values, values])
intercept_values = np.arange(1, 10)
intercept_values = np.hstack([-intercept_values, intercept_values])

model = SLIM(C0=ub, values=values, intercept_values=intercept_values,w_pos=4)
model.fit(X_train, y_train, solver='SCIP', verbose=True)

In [230]:
def print_meaningful_coefs(names, coefs):
    for name, coef in zip(names, coefs):
        if coef != 0.:
            print(f"{name} = {coef}")

In [231]:
print_meaningful_coefs(X_train.columns, model.params_)

Over18 = -3.0
OverTime = 1.0
BusinessTravel_Travel_Frequently = 1.0
BusinessTravel_Travel_Rarely = 1.0
Department_Research & Development = 2.0
Department_Sales = 2.0
EducationField_Life Sciences = -2.0
EducationField_Marketing = -2.0
EducationField_Medical = -2.0
EducationField_Other = -2.0
EducationField_Technical Degree = -1.0
JobRole_Human Resources = 3.0
JobRole_Laboratory Technician = 2.0
JobRole_Research Scientist = 1.0
JobRole_Sales Executive = 1.0
JobRole_Sales Representative = 2.0
MaritalStatus_Single = 1.0


In [233]:
y_pred = model.predict(X_test)

print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.39316239316239315
0.8070652173913043


### Students

In [242]:
data = pd.read_csv('dataStudents.csv')
columns_to_dummies = ['Ethnicity']
target_column = 'GradeClass'

y = data[target_column]
X = data.loc[:, data.columns != target_column]
X = pd.get_dummies(X, dtype=float, columns=columns_to_dummies, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [244]:
# logreg
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.8930390492359933
0.8946488294314381


In [None]:
ub = 1 /  (X_train.shape[0] * X_train.shape[1])
values = np.array([1, 2, 3])
values = np.hstack([-values, values])
intercept_values = np.arange(1, 10)
intercept_values = np.hstack([-intercept_values, intercept_values])

model = SLIM(C0=ub/5, values=values, intercept_values=intercept_values,w_pos=1.)
model.fit(X_train, y_train, solver='SCIP', verbose=True)

In [246]:
print_meaningful_coefs(X_train.columns, model.params_)

Age = -1.0
Gender = 1.0
ParentalEducation = 1.0
Absences = 1.0
Tutoring = -2.0
Extracurricular = -1.0
Sports = -1.0
Volunteering = 2.0
Ethnicity_1 = 1.0


In [220]:
y_pred = model.predict(X_test)

print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.6576879910213244
0.4899665551839465


### Breast Cancer

In [295]:
data = pd.read_csv('dataCancer.csv')
target_column = 'class'
y = data[target_column]
X = data.loc[:, data.columns != target_column]
y = 1 * (y == 4) + (-1) * (y == 2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [296]:
# logreg
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.9384615384615386
0.9532163742690059


In [None]:
ub = 1 /  (X_train.shape[0] * X_train.shape[1])
values = np.array([1, 2, 3, 4, 5])
values = np.hstack([-values, values])
intercept_values = np.arange(1, 20)
intercept_values = np.hstack([-intercept_values, intercept_values])

model = SLIM(C0=ub, values=values, intercept_values=intercept_values,w_pos=1.)
model.fit(X_train, y_train, solver='SCIP', verbose=True)

In [310]:
print_meaningful_coefs(X_train.columns, model.params_)

size_uniformity = 1.0
epithelial_size = -1.0
bare_nucleoli = 1.0
bland_chromatin = -1.0
normal_nucleoli = 1.0
mitoses = -1.0


In [311]:
y_pred = model.predict(X_test)

print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.8372093023255813
0.8771929824561403
