In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import time
from scipy.optimize import minimize, differential_evolution
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_table('wilt/training.csv', sep=',')

In [3]:
df.head()

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,w,120.362774,205.5,119.395349,416.581395,20.676318
1,w,124.739583,202.8,115.333333,354.333333,16.707151
2,w,134.691964,199.285714,116.857143,477.857143,22.496712
3,w,127.946309,178.368421,92.368421,278.473684,14.977453
4,w,135.431548,197.0,112.690476,532.952381,17.604193


In [4]:
df['class'].value_counts()

n    4265
w      74
Name: class, dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder
for col in ['class']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [6]:
df.head()

Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,1,120.362774,205.5,119.395349,416.581395,20.676318
1,1,124.739583,202.8,115.333333,354.333333,16.707151
2,1,134.691964,199.285714,116.857143,477.857143,22.496712
3,1,127.946309,178.368421,92.368421,278.473684,14.977453
4,1,135.431548,197.0,112.690476,532.952381,17.604193


In [7]:
X = df.drop(columns=['class']).values
y = df['class'].values

In [8]:
for i in range(X.shape[1]):
    X[:,i]=(X[:,i]-X[:,i].min())/(X[:,i].max()-X[:,i].min())

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [10]:
X_train.shape

(3037, 5)

In [11]:
y_train.sum() / len(y_train)

0.017122160026341784

In [30]:
from Seniority_committee import Seniority_committee
sc = Seniority_committee(N=30)
sc.fit(X_train, y_train, optim_method='TNC', verbose=2)

Making hyperplane number 1
X_train.shape[0] = 3037
k = 10
L = 1024
Optimizing hyperplane for class 1
Optimization is started
First approximation is obtained
The minimum of the loss function: 0.0
Time taken for optimization: 0.38399529457092285
Optimizing hyperplane for class 0
Optimization is started
First approximation is obtained
The minimum of the loss function: -2606.969473774156
Time taken for optimization: 4.487003564834595
X_1.shape[0] = 0
X_0.shape[0] = 2818

Making hyperplane number 2
X_train.shape[0] = 219
k = 10
L = 1024
Optimizing hyperplane for class 1
Optimization is started
First approximation is obtained
The minimum of the loss function: 0.0
Time taken for optimization: 0.15400147438049316
Optimizing hyperplane for class 0
Optimization is started
First approximation is obtained
The minimum of the loss function: -12.727588919152458
Time taken for optimization: 0.81103515625
X_1.shape[0] = 0
X_0.shape[0] = 15
Cutted data shape is not enough

k = 9
L = 512
Optimizing hyper

In [31]:
sc.weights_hp

{1: (0,
  array([  1.4007776 ,  87.06844956, -58.89102182,   0.62848354,
           2.19159692,  -1.99625411]),
  0.0,
  0.2374429223744292),
 2: (0,
  array([ 0.96033278, 11.77191693,  5.71801112, -7.13669345, -2.89667295,
          0.12175999]),
  0.0,
  0.2857142857142857),
 3: (0,
  array([ -4.3606384 ,   5.63077474, -40.44188719,  -1.10189328,
           7.37122404,   4.37621621]),
  0.07142857142857142,
  0.7678571428571429)}

In [32]:
train_proba = sc.predict_proba(X_train)
test_proba = sc.predict_proba(X_test)
train_preds = sc.predict(X_train)
test_preds = sc.predict(X_test)
print('Train Gini = {}'.format(2 * roc_auc_score(y_train, train_proba) - 1))
print('Test Gini = {}'.format(2 * roc_auc_score(y_test, test_proba) - 1))
print('Train F1 = {}'.format(f1_score(y_train, train_preds)))
print('Test F1 = {}'.format(f1_score(y_test, test_preds)))
print('Train accuracy = {}'.format(accuracy_score(y_train, train_preds)))
print('Test accuracy = {}'.format(accuracy_score(y_test, test_preds)))

Time taken to predict the targets: 1.9399940967559814
Time taken to predict the targets: 0.7860074043273926
Time taken to predict the targets: 1.7759959697723389
Time taken to predict the targets: 0.7940027713775635
Train Gini = 0.988107202680067
Test Gini = 0.9468394886363638
Train F1 = 0.7962962962962962
Test F1 = 0.8510638297872342
Train accuracy = 0.9927560092196246
Test accuracy = 0.9946236559139785


In [14]:
tree = DecisionTreeClassifier(max_depth=17)
tree.fit(X_train, y_train)
train_proba = tree.predict_proba(X_train)
test_proba = tree.predict_proba(X_test)
train_preds = tree.predict(X_train)
test_preds = tree.predict(X_test)
print('Train Gini = {}'.format(2 * roc_auc_score(y_train, train_proba[:, 1]) - 1))
print('Test Gini = {}'.format(2 * roc_auc_score(y_test, test_proba[:, 1]) - 1))
print('Train F1 = {}'.format(f1_score(y_train, train_preds)))
print('Test F1 = {}'.format(f1_score(y_test, test_preds)))
print('Train Accuracy = {}'.format(accuracy_score(y_train, train_preds)))
print('Test Accuracy = {}'.format(accuracy_score(y_test, test_preds)))

Train Gini = 1.0
Test Gini = 0.49765625000000013
Train F1 = 1.0
Test F1 = 0.6111111111111112
Train Accuracy = 1.0
Test Accuracy = 0.989247311827957


In [27]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
train_proba = logreg.predict_proba(X_train)
test_proba = logreg.predict_proba(X_test)
train_preds = logreg.predict(X_train)
test_preds = logreg.predict(X_test)
print('Train Gini = {}'.format(2 * roc_auc_score(y_train, train_proba[:, 1]) - 1))
print('Test Gini = {}'.format(2 * roc_auc_score(y_test, test_proba[:, 1]) - 1))
print('Train F1 = {}'.format(f1_score(y_train, train_preds)))
print('Test F1 = {}'.format(f1_score(y_test, test_preds)))
print('Train Accuracy = {}'.format(accuracy_score(y_train, train_preds)))
print('Test Accuracy = {}'.format(accuracy_score(y_test, test_preds)))

Train Gini = 0.529364772580853
Test Gini = 0.4875710227272727
Train F1 = 0.0
Test F1 = 0.0
Train Accuracy = 0.9828778399736582
Test Accuracy = 0.9831029185867896


In [28]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
train_proba = rf.predict_proba(X_train)
test_proba = rf.predict_proba(X_test)
train_preds = rf.predict(X_train)
test_preds = rf.predict(X_test)
print('Train Gini = {}'.format(2 * roc_auc_score(y_train, train_proba[:, 1]) - 1))
print('Test Gini = {}'.format(2 * roc_auc_score(y_test, test_proba[:, 1]) - 1))
print('Train F1 = {}'.format(f1_score(y_train, train_preds)))
print('Test F1 = {}'.format(f1_score(y_test, test_preds)))
print('Train Accuracy = {}'.format(accuracy_score(y_train, train_preds)))
print('Test Accuracy = {}'.format(accuracy_score(y_test, test_preds)))

Train Gini = 1.0
Test Gini = 0.9919034090909093
Train F1 = 1.0
Test F1 = 0.5806451612903226
Train Accuracy = 1.0
Test Accuracy = 0.9900153609831029


In [29]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)
train_proba = lgbm.predict_proba(X_train)
test_proba = lgbm.predict_proba(X_test)
train_preds = lgbm.predict(X_train)
test_preds = lgbm.predict(X_test)
print('Train Gini = {}'.format(2 * roc_auc_score(y_train, train_proba[:, 1]) - 1))
print('Test Gini = {}'.format(2 * roc_auc_score(y_test, test_proba[:, 1]) - 1))
print('Train F1 = {}'.format(f1_score(y_train, train_preds)))
print('Test F1 = {}'.format(f1_score(y_test, test_preds)))
print('Train Accuracy = {}'.format(accuracy_score(y_train, train_preds)))
print('Test Accuracy = {}'.format(accuracy_score(y_test, test_preds)))

Train Gini = 1.0
Test Gini = 0.993465909090909
Train F1 = 1.0
Test F1 = 0.7567567567567568
Train Accuracy = 1.0
Test Accuracy = 0.9930875576036866
