In [76]:
import random

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.preprocessing
from sklearn.svm import SVC

# Train an SVC on the fine-grained labels and report macro-F1, macro one-vs-rest
# AUC, a classification report and a confusion matrix on the validation and
# test splits.

# Seed both RNGs before fitting. With probability=True and no random_state,
# SVC's probability calibration is driven by NumPy's global RNG (see the SVC
# `random_state` documentation), so the NumPy seed is what makes the fit
# repeatable.
random.seed(42)
np.random.seed(42)

# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
train_df = pd.read_pickle("train.pkl")
test_df = pd.read_pickle("test.pkl")
validation_df = pd.read_pickle("validation.pkl")
onehot = sklearn.preprocessing.OneHotEncoder()


# np.stack joins the per-row entries of the "feature" column into one array
# with samples along axis 0; "meta" supplies the class labels.
X_train = np.stack(train_df["feature"].values)
Y_train = train_df["meta"].values
X_val = np.stack(validation_df["feature"].values)
Y_val = validation_df["meta"].values
X_test = np.stack(test_df["feature"].values)
Y_test = test_df["meta"].values


clf = SVC(probability=True)
clf.fit(X_train, Y_train)

# Fit the encoder once, on the training labels. Its columns then follow the
# sorted training classes -- the same order as clf.classes_ and therefore as
# the predict_proba columns -- instead of depending on which labels happen to
# occur in each evaluation split.
onehot.fit(Y_train.reshape(-1, 1))

# --- validation split ---
result_val = clf.predict_proba(X_val)
# argmax yields a column index; translate it to the actual class label rather
# than assuming the labels are exactly 0..n-1.
pred_val = clf.classes_[result_val.argmax(axis=1)]
Y_val_onehot = onehot.transform(Y_val.reshape(-1, 1)).toarray()
f1_val = sklearn.metrics.f1_score(Y_val, pred_val, average="macro")
auc_val = sklearn.metrics.roc_auc_score(
    Y_val_onehot, result_val, average="macro", multi_class="ovr"
)
print(f"f1_val: {f1_val:.4f}, auc_val: {auc_val:.4f}")
classification_report = sklearn.metrics.classification_report(Y_val, pred_val)
print(classification_report)
confusion_matrix = sklearn.metrics.confusion_matrix(Y_val, pred_val)
print(confusion_matrix)

# --- test split ---
# result_test, Y_test and onehot stay in the kernel namespace afterwards.
result_test = clf.predict_proba(X_test)
pred_test = clf.classes_[result_test.argmax(axis=1)]
Y_test_onehot = onehot.transform(Y_test.reshape(-1, 1)).toarray()
f1_test = sklearn.metrics.f1_score(Y_test, pred_test, average="macro")
auc_test = sklearn.metrics.roc_auc_score(
    Y_test_onehot, result_test, average="macro", multi_class="ovr"
)
print(f"f1_test: {f1_test:.4f}, auc_test: {auc_test:.4f}")

classification_report = sklearn.metrics.classification_report(Y_test, pred_test)
print(classification_report)
confusion_matrix = sklearn.metrics.confusion_matrix(Y_test, pred_test)
# repr() keeps the "array([...])" form of the matrix in the printed output.
print(repr(confusion_matrix))






f1_val: 0.4832, auc_val: 0.8820
              precision    recall  f1-score   support

           0       0.45      0.15      0.22        96
           1       0.44      0.50      0.47        96
           2       0.56      0.88      0.69        96
           3       0.64      0.60      0.62        96
           4       0.77      0.93      0.84        96
           5       0.25      0.03      0.06        96
           6       0.39      0.67      0.49        96

    accuracy                           0.54       672
   macro avg       0.50      0.54      0.48       672
weighted avg       0.50      0.54      0.48       672

[[14 27  8  6  0  4 37]
 [ 0 48 44  3  0  0  1]
 [ 1 11 84  0  0  0  0]
 [ 0  0  1 58 26  4  7]
 [ 0  0  0  7 89  0  0]
 [ 9 11  6 11  1  3 55]
 [ 7 12  6  6  0  1 64]]
f1_test: 0.4115, auc_test: 0.8559
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       120
           1       0.40      0.88      0.55       120
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
# Fine-grained label -> merged label: the seven original classes collapse
# onto four groups.
mapping_dict = dict(zip(range(7), (0, 1, 1, 2, 2, 3, 3)))

# Reload the test split and collapse only its labels. The training labels are
# deliberately not remapped in this cell, and no new predictions are computed.
test_df = pd.read_pickle("test.pkl")
test_df["meta"] = test_df["meta"].map(mapping_dict)

# Rebuild the test arrays from the relabelled frame.
X_test = np.stack(test_df["feature"].values)
Y_test = test_df["meta"].values

# Per-class sample counts after merging.
print(test_df["meta"].value_counts())



1    240
2    240
3    240
0    120
Name: meta, dtype: int64


In [67]:

# Collapse the per-class probabilities in result_test onto the merged classes:
# each fine-grained column is added to the merged column named by mapping_dict.
# Assumes column k of result_test belongs to fine-grained label k -- TODO
# confirm against the classifier's classes_.
merged_shape = (result_test.shape[0], 4)
new_result = np.zeros(merged_shape)
for fine_label in range(7):
    merged_label = mapping_dict[fine_label]
    print(fine_label, merged_label)
    new_result[:, merged_label] = (
        new_result[:, merged_label] + result_test[:, fine_label]
    )

# Column j of new_result is merged class j, so argmax gives the merged label.
merged_pred = new_result.argmax(axis=1)

# One-hot ground truth for the AUC computation (encoder re-fitted on Y_test).
Y_test_onehot = onehot.fit_transform(Y_test.reshape(-1, 1)).toarray()
print(Y_test_onehot)
print(Y_test_onehot.sum(axis=0))

f1_test = sklearn.metrics.f1_score(Y_test, merged_pred, average="macro")
auc_test = sklearn.metrics.roc_auc_score(
    Y_test_onehot, new_result, average="macro", multi_class="ovr"
)
print(f"4 classes f1_test: {f1_test:.4f}, auc_test: {auc_test:.4f}")

classification_report = sklearn.metrics.classification_report(Y_test, merged_pred)
print(classification_report)

confusion_matrix = sklearn.metrics.confusion_matrix(Y_test, merged_pred)
print(repr(confusion_matrix))

0 0
1 1
2 1
3 2
4 2
5 3
6 3
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]]
[120. 240. 240. 240.]
4 classes f1_test: 0.5544, auc_test: 0.8747
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       120
           1       0.81      0.97      0.89       240
           2       0.97      0.54      0.70       240
           3       0.50      0.88      0.64       240

    accuracy                           0.68       840
   macro avg       0.57      0.60      0.55       840
weighted avg       0.65      0.68      0.63       840

array([[  0,  20,   0, 100],
       [  0, 233,   0,   7],
       [  0,   7, 130, 103],
       [  0,  26,   4, 210]], dtype=int64)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
# Hard-coded 7x7 matrix of counts to be collapsed onto the 4 merged classes.
array = np.array(
    [
        [0, 38, 4, 6, 0, 47, 25],
        [0, 106, 10, 0, 0, 4, 0],
        [0, 37, 83, 0, 0, 0, 0],
        [0, 5, 0, 50, 0, 60, 5],
        [0, 1, 6, 44, 68, 0, 1],
        [0, 28, 4, 2, 0, 61, 25],
        [0, 32, 5, 4, 0, 57, 22],
    ],
    dtype=np.int64,
)

# membership[f, m] == 1 exactly when fine-grained class f maps to merged class m.
membership = np.zeros((7, 4))
for fine_label in range(7):
    membership[fine_label, mapping_dict[fine_label]] = 1

# Step 1: add up the columns that share a merged class -> (7, 4).
array2 = array @ membership
# Step 2: add up the rows that share a merged class -> (4, 4).
new_result = membership.T @ array2
print(repr(new_result))


array([[  0.,  42.,   6.,  72.],
       [  0., 236.,   0.,   4.],
       [  0.,  12., 162.,  66.],
       [  0.,  69.,   6., 165.]])


In [47]:
import random

import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing
from sklearn.svm import SVC

# Train an SVC directly on the merged (4-class) labels and report macro-F1,
# macro one-vs-rest AUC, a classification report and a confusion matrix on the
# validation and test splits.

# Seed NumPy as well as `random`, matching the 7-class cell. With
# probability=True and no random_state, SVC's probability calibration is
# driven by NumPy's global RNG (see the SVC `random_state` documentation);
# seeding only `random` leaves the fit non-repeatable.
random.seed(42)
np.random.seed(42)

# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
train_df = pd.read_pickle("train.pkl")
test_df = pd.read_pickle("test.pkl")
validation_df = pd.read_pickle("validation.pkl")
onehot = sklearn.preprocessing.OneHotEncoder()

# collapse data: fine-grained label -> merged label
mapping_dict = {
    0: 0,
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 3,
    6: 3,
}
train_df["meta"] = train_df["meta"].map(mapping_dict)
test_df["meta"] = test_df["meta"].map(mapping_dict)
validation_df["meta"] = validation_df["meta"].map(mapping_dict)

# np.stack joins the per-row entries of the "feature" column into one array
# with samples along axis 0; "meta" now holds the merged class labels.
X_train = np.stack(train_df["feature"].values)
Y_train = train_df["meta"].values
X_val = np.stack(validation_df["feature"].values)
Y_val = validation_df["meta"].values
X_test = np.stack(test_df["feature"].values)
Y_test = test_df["meta"].values

clf = SVC(probability=True)
clf.fit(X_train, Y_train)

# Fit the encoder once, on the training labels. Its columns then follow the
# sorted training classes -- the same order as clf.classes_ and therefore as
# the predict_proba columns -- instead of depending on which labels happen to
# occur in each evaluation split.
onehot.fit(Y_train.reshape(-1, 1))

# --- validation split ---
result_val = clf.predict_proba(X_val)
# argmax yields a column index; translate it to the actual class label rather
# than assuming the labels are exactly 0..n-1.
pred_val = clf.classes_[result_val.argmax(axis=1)]
Y_val_onehot = onehot.transform(Y_val.reshape(-1, 1)).toarray()
f1_val = sklearn.metrics.f1_score(Y_val, pred_val, average="macro")
auc_val = sklearn.metrics.roc_auc_score(
    Y_val_onehot, result_val, average="macro", multi_class="ovr"
)
print(f"f1_val: {f1_val:.4f}, auc_val: {auc_val:.4f}")
classification_report = sklearn.metrics.classification_report(Y_val, pred_val)
print(classification_report)
confusion_matrix = sklearn.metrics.confusion_matrix(Y_val, pred_val)
print(confusion_matrix)

# --- test split ---
# After this cell result_test has one column per merged class.
result_test = clf.predict_proba(X_test)
pred_test = clf.classes_[result_test.argmax(axis=1)]
Y_test_onehot = onehot.transform(Y_test.reshape(-1, 1)).toarray()
f1_test = sklearn.metrics.f1_score(Y_test, pred_test, average="macro")
auc_test = sklearn.metrics.roc_auc_score(
    Y_test_onehot, result_test, average="macro", multi_class="ovr"
)
print(f"f1_test: {f1_test:.4f}, auc_test: {auc_test:.4f}")

classification_report = sklearn.metrics.classification_report(Y_test, pred_test)
print(classification_report)
confusion_matrix = sklearn.metrics.confusion_matrix(Y_test, pred_test)
# repr() keeps the "array([...])" form of the matrix in the printed output.
print(repr(confusion_matrix))


f1_val: 0.6302, auc_val: 0.9189
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        96
           1       0.82      0.98      0.90       192
           2       0.98      0.84      0.91       192
           3       0.61      0.88      0.72       192

    accuracy                           0.77       672
   macro avg       0.60      0.68      0.63       672
weighted avg       0.69      0.77      0.72       672

[[  0  22   0  74]
 [  0 188   0   4]
 [  0   0 162  30]
 [  1  18   4 169]]
f1_test: 0.5643, auc_test: 0.8846
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       120
           1       0.85      0.96      0.90       240
           2       0.97      0.55      0.70       240
           3       0.51      0.91      0.65       240

    accuracy                           0.69       840
   macro avg       0.58      0.61      0.56       840
weighted avg       0.67      0.69      0.6

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
# Argmax-then-merge evaluation: take the most probable column of result_test
# per sample, express it as a merged label, and score it against Y_test.
#
# result_test is whatever probability matrix is currently live in the kernel,
# so check its width before deciding how to interpret the argmax indices.
# Blindly applying mapping_dict to indices that are already merged labels
# would remap them a second time and silently corrupt the metrics.
n_columns = result_test.shape[1]
n_fine = len(mapping_dict)
n_merged = len(set(mapping_dict.values()))
top_columns = result_test.argmax(axis=1)

if n_columns == n_fine:
    # Fine-grained probabilities: collapse each winning class via mapping_dict.
    # Assumes column k belongs to fine-grained label k -- TODO confirm against
    # the classifier's classes_.
    result_merged = np.array([mapping_dict[i] for i in top_columns])
elif n_columns == n_merged:
    # Probabilities are already over the merged classes; the argmax index is
    # taken as the merged label (assumes merged labels are 0..n_merged-1).
    result_merged = top_columns
else:
    raise ValueError(
        f"result_test has {n_columns} columns; expected {n_fine} (fine-grained "
        f"classes) or {n_merged} (merged classes)"
    )

f1_test = sklearn.metrics.f1_score(Y_test, result_merged, average="macro")
print(f"f1_test: {f1_test:.4f}")
classification_report = sklearn.metrics.classification_report(
    Y_test, result_merged, digits=4
)
print(classification_report)


f1_test: 0.5435
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       120
           1     0.6574    0.9833    0.7880       240
           2     0.9310    0.6750    0.7826       240
           3     0.5375    0.6875    0.6033       240

    accuracy                         0.6702       840
   macro avg     0.5315    0.5865    0.5435       840
weighted avg     0.6074    0.6702    0.6211       840



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
