In this notebook, we explain the ML strategy used to classify the users. We use the problem-transformation method `Classifier Chains` with LightGBM, a gradient-boosted decision-tree algorithm, as the base classifier.

In [1]:
# Scenario configuration shared by the cells below.
N = 4 # number of antennas at the base station (BS)
K = 10 # number of users (K >= N); also the number of 'User<i>' label columns
X0 = 0.500 # parameter of the uniform distribution of the user on the x-axis
Y0 = 0.500 # parameter of the uniform distribution of the user on the y-axis
nsize = 10000 # size of the dataset (number of samples)

In [2]:
from itertools import product, combinations
import numpy as np
from numpy.linalg import inv
import h5py
import pandas as pd
from PIL import Image
from numpy import linalg as LA
import os
from sklearn.model_selection import train_test_split
from PIL import Image
from matplotlib.pyplot import imshow
import seaborn as sns
from sklearn import metrics
from IPython.display import clear_output
%matplotlib inline
np.random.seed(42) # fix the seed for reproducibility

## Pre-processing

In [3]:
# Training table: the 'H' and 'P' columns hold HDF5 file names, the 'User<i>' columns hold the labels.
df = pd.read_csv('data.csv')

In [4]:
def read_matrices_sizes(matrix_name, data_dir='Data/'):
    """Load one matrix stored in an HDF5 file.

    The dataset inside the file is looked up under the file's base name,
    e.g. ``H_1.h5`` is expected to contain a dataset called ``H_1``.

    Parameters
    ----------
    matrix_name : str
        File name (including its extension), relative to ``data_dir``.
    data_dir : str, optional
        Directory that contains the file. Defaults to ``'Data/'``.

    Returns
    -------
    numpy.ndarray
        The full content of the dataset.
    """
    # Strip the extension explicitly instead of assuming it is exactly
    # three characters long ('.h5').
    dataset_key = os.path.splitext(matrix_name)[0]
    with h5py.File(os.path.join(data_dir, matrix_name), 'r') as hf:
        return hf[dataset_key][:]

In [5]:
# Sanity check on the first sample: load H and P, form the product H . sqrt(P)
# and look at its imaginary part.
H = read_matrices_sizes(df.loc[0, 'H'])
matrix_name = df.loc[0, 'P']
P = read_matrices_sizes(matrix_name)
mat = np.dot(H, np.sqrt(P))
print('The shape of the matrix is', mat.shape)
mat.imag.flatten()

The shape of the matrix is (4, 10)


array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  4.50685925e-01, -7.99110965e-03,
       -9.15351277e-03, -2.19030685e+00, -4.84237626e+00, -7.31735971e+00,
        3.91558025e+00, -7.86676350e+00, -2.75607383e+00, -3.00867086e+00,
        8.67232976e-01,  1.59822045e-02,  1.83069974e-02,  4.05950376e+00,
        7.56442256e+00, -1.03855771e+01,  3.92323412e+00, -2.27582610e-01,
       -2.13146035e+00, -4.76236947e+00,  1.21808825e+00, -2.39732699e-02,
       -2.74604259e-02, -5.33355712e+00, -6.97423720e+00, -7.42295873e+00,
        1.53226998e-02,  7.86017962e+00,  1.10766980e+00, -4.52959570e+00])

In [6]:
def transform_data(df, data_dir='Data/'):
    """Build the real-valued feature tensor of one sample.

    Meant to be used row-wise, e.g. ``frame.apply(transform_data, axis=1)``;
    another directory can be selected with
    ``frame.apply(transform_data, axis=1, data_dir='test_Data/')``.

    Parameters
    ----------
    df : mapping (a DataFrame row)
        Must provide the HDF5 file names under the keys ``'H'`` and ``'P'``.
        Each file is expected to contain a dataset named after the file
        without its extension.
    data_dir : str, optional
        Directory that contains the HDF5 files. Defaults to ``'Data/'``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(N, K, 2)`` where ``(N, K)`` is the shape of
        ``H``: channel 0 is the real part and channel 1 the imaginary part
        of ``H . sqrt(P)`` (``sqrt`` taken element-wise).
    """
    h_file, p_file = df['H'], df['P']
    with h5py.File(os.path.join(data_dir, h_file), 'r') as hf:
        H = hf[os.path.splitext(h_file)[0]][:]
    with h5py.File(os.path.join(data_dir, p_file), 'r') as hf:
        P = hf[os.path.splitext(p_file)[0]][:]
    mat = H.dot(np.sqrt(P))
    # Stack real and imaginary parts along a trailing axis of size 2.
    return np.stack((mat.real, mat.imag), axis=-1).astype(np.float64, copy=False)

In [7]:
# Row-wise: read the H and P files of every sample and store the resulting feature tensor in 'Matrix'.
df['Matrix'] = df.apply(transform_data,axis=1)

In [8]:
# Sanity check: shape of the feature tensor of the first sample.
df['Matrix'][0].shape

(4, 10, 2)

In [9]:
def encoder(df):
    """Split a table of samples into model inputs and multilabel targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Matrix'`` column holding equally-shaped arrays
        (one per row) and the label columns ``'User1'`` ... ``'User<K>'``,
        where ``K`` is the size of the second axis of each matrix.

    Returns
    -------
    X : numpy.ndarray
        All matrices stacked along a new leading axis.
    y : numpy.ndarray
        Values of the ``'User<i>'`` columns, shape ``(len(df), K)``.
    """
    X = np.stack(df['Matrix'].values)
    # Read the number of users from the data itself rather than from the
    # notebook-level K, so the function does not depend on hidden global state.
    n_users = X.shape[2]
    cols = ['User' + str(i + 1) for i in range(n_users)]
    y = df[cols].values
    return X, y

In [10]:
valid_size = 0.2 # fraction of the samples held out for validation
random_state = 42 # seed of the train/validation split

In [11]:
# Random hold-out split of the table into training and validation rows.
train_df, valid_df = train_test_split(df, test_size=valid_size, random_state=random_state)

In [12]:
# Stack the feature tensors and extract the 'User<i>' label matrix of each split.
X_train, y_train = encoder(train_df)
X_valid, y_valid = encoder(valid_df)

In [13]:
# Sanity check: shapes of the training inputs and labels.
X_train.shape, y_train.shape

((8000, 4, 10, 2), (8000, 10))

In [14]:
# Flatten every (N, K, 2) sample into one feature row laid out as
# [all real parts (row-major), all imaginary parts (row-major)].
# Sizes are taken from X_train itself rather than from the notebook-level N and K.
X_tr = np.moveaxis(X_train, -1, 1).reshape(
    X_train.shape[0], 2 * X_train.shape[1] * X_train.shape[2]
)

In [15]:
# Flatten every (N, K, 2) validation sample into one feature row laid out as
# [all real parts (row-major), all imaginary parts (row-major)].
# Sizes are taken from X_valid itself rather than from the notebook-level N and K.
X_val = np.moveaxis(X_valid, -1, 1).reshape(
    X_valid.shape[0], 2 * X_valid.shape[1] * X_valid.shape[2]
)

In [16]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean/variance on the training features only, then
# apply the same standardisation to the training and validation features.
sc = StandardScaler().fit(X_tr)
X_tr = sc.transform(X_tr)
X_val = sc.transform(X_val)

## Test

In [17]:
# Test table; it is processed below with the same 'H', 'P' and 'User<i>' columns as the training table.
test_df = pd.read_csv('test_data.csv')

In [18]:
# Sanity check: number of test rows and columns.
test_df.shape

(2000, 12)

In [19]:
def transf_data(df):
    """Build the (N, K, 2) real/imaginary feature tensor of one test sample.

    Same computation as ``transform_data``, except that the HDF5 files are
    read from ``'test_Data/'``. Meant to be applied row-wise.

    Parameters
    ----------
    df : mapping (a DataFrame row)
        Must provide the HDF5 file names under the keys ``'H'`` and ``'P'``;
        the dataset inside each file is looked up under the file name with
        its last three characters removed.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(N, K, 2)``, with ``(N, K)`` the shape of
        ``H``: real part of ``H . sqrt(P)`` in channel 0, imaginary part in
        channel 1.
    """
    h_file = df['H']
    p_file = df['P']
    with h5py.File('test_Data/' + h_file, 'r') as hf:
        H = hf[h_file[:-3]][:]
    with h5py.File('test_Data/' + p_file, 'r') as hf:
        P = hf[p_file[:-3]][:]
    scaled = H.dot(np.sqrt(P))
    n_rows, n_cols = H.shape[0], H.shape[1]
    x = np.zeros([n_rows, n_cols, 2])
    x[..., 0] = scaled.real
    x[..., 1] = scaled.imag
    return x

In [20]:
# Row-wise: read the H and P files of every test sample and store the resulting feature tensor in 'Matrix'.
test_df['Matrix'] = test_df.apply(transf_data,axis=1)

In [21]:
# Stack the test feature tensors and extract the 'User<i>' label matrix.
X_test, y_test = encoder(test_df)

In [22]:
# Sanity check: shapes of the test inputs and labels.
X_test.shape, y_test.shape

((2000, 4, 10, 2), (2000, 10))

In [23]:
# Flatten every (N, K, 2) test sample into one feature row laid out as
# [all real parts (row-major), all imaginary parts (row-major)].
# Sizes are taken from X_test itself rather than from the notebook-level N and K.
X_te = np.moveaxis(X_test, -1, 1).reshape(
    X_test.shape[0], 2 * X_test.shape[1] * X_test.shape[2]
)

In [24]:
# Reuse the scaler fitted on the training features; it is not re-fitted on the test set.
X_te = sc.transform(X_te)

# Evaluation Metrics

In traditional classification, the standard evaluation criterion is the accuracy. In multilabel classification, a direct extension of the accuracy is the exact match ratio, which considers an instance as correct if and only if all of its associated labels are correctly predicted. However, this ratio may not be the most suitable performance measure, as it does not count partial matches. The metrics can be divided into two categories:

-  Example-Based Evaluation Methods: exact match ratio, Hamming loss, Jaccard similarity index, etc.
-  Label-Based Evaluation Methods: $F_1$ score, precision, recall, etc.

**1. Exact Match Ratio**

Assume there are $L$ testing instances. Let $y^i$ be the true label vector of the $i^{th}$ instance and $\hat{y}^i$ be the predicted label vector, then the exact match ratio (EMR) is defined as
$$ EMR = \frac{1}{L} \sum_{i=1}^{L}{\mathbb{1}\left[\hat{y}^i = y^i\right]} $$

**2. Micro and Macro $F_1$ score, precision, and recall**

One of the most used performance measures for information retrieval systems is the F-measure, which is the harmonic mean of precision (P) and recall (R):
$$ F = \frac{2 P R}{P + R} $$

Precision is the measure of how much the method is immune to Type I error, i.e., falsely classifying negative cases as positives: false positives or FP. It is the fraction of correctly positively-classified cases (i.e., true positives) to all positively-classified cases.

Recall is the measure of how much the method is immune to the Type II error, i.e., falsely classifying positive cases as negatives: false negatives or FN. It is the fraction of correctly positively-classified cases (i.e., true positives) to all actually positive cases (i.e., true positives plus false negatives).

Because the labels are naturally distributed non-uniformly among the input objects of any test set, two averaging techniques exist:

- Micro-averaging gives equal weight to every individual object–label decision: the true/false positives/negatives are first aggregated globally over all objects and labels, and the measure is then computed once from the pooled counts,
- Macro-averaging, the measure is first calculated per label, then averaged over the number of labels. Macro averaging thus gives equal weight to each label, regardless of how often the label appears.

**3. Hamming loss**

Hamming loss is a label-wise decomposable function counting the fraction of labels that were misclassified.

**4. Jaccard Similarity Index**

Jaccard similarity measures the overlap between the predicted label set and the ground-truth label set: it is the cardinality of their intersection divided by the cardinality of their union. In other words, it is the fraction of the labels assigned by either the prediction or the ground truth that are assigned by both.

# Models

In [25]:
from skmultilearn.problem_transform import LabelPowerset, ClassifierChain
from sklearn.metrics import accuracy_score, matthews_corrcoef, hamming_loss, jaccard_score, f1_score
from skmultilearn.problem_transform import BinaryRelevance

The `find_best_threshold` function finds, for each label, the decision threshold that maximizes a chosen criterion (one of the evaluation metrics); it is meant to be tuned on the validation set.

In [26]:
def find_best_threshold(classifier, X, y, criterion=None):
    """Pick, for every label, the probability threshold that maximizes a criterion.

    Candidate thresholds are ``np.arange(0.1, 0.9, 0.1)``. For each label
    (column of the predicted probabilities) the predictions obtained with
    every candidate are scored against the corresponding column of ``y``;
    the first candidate reaching the best score is kept.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``predict_proba(X)``. The result may be a
        dense array or a SciPy-style sparse matrix (anything with
        ``toarray()``).
    X : array-like
        Inputs forwarded to ``classifier.predict_proba``.
    y : numpy.ndarray
        True binary labels, shape ``(n_samples, n_labels)``.
    criterion : callable, optional
        ``criterion(y_true, y_pred) -> float``, larger is better. Defaults
        to ``accuracy_score`` imported at notebook level.

    Returns
    -------
    out
        The probabilities exactly as returned by ``predict_proba``.
    best_threshold : numpy.ndarray
        One threshold per label, shape ``(n_labels,)``.
    """
    if criterion is None:
        criterion = accuracy_score

    out = classifier.predict_proba(X)
    # Work on a dense copy so that column slicing and comparisons are plain
    # NumPy operations; the caller still receives the untouched `out`.
    proba = out.toarray() if hasattr(out, 'toarray') else np.asarray(out)

    threshold = np.arange(0.1, 0.9, 0.1)
    best_threshold = np.zeros(proba.shape[1])
    for i in range(proba.shape[1]):
        y_prob = proba[:, i]
        acc = np.array([criterion(y[:, i], (y_prob >= t).astype(int))
                        for t in threshold])
        # argmax returns the first maximum, i.e. the smallest best threshold.
        best_threshold[i] = threshold[np.argmax(acc)]
    return out, best_threshold

In [27]:
def scores(y1, y2):
    """Evaluate multilabel predictions with the four notebook metrics.

    Parameters
    ----------
    y1 : array-like
        True label matrix.
    y2 : array-like
        Predicted label matrix.

    Returns
    -------
    list
        ``[accuracy, hamming loss, Jaccard score, F1 score]`` where the
        Jaccard and F1 scores use ``average='samples'``. For label matrices
        scikit-learn's ``accuracy_score`` is the exact match ratio.
    """
    return [
        accuracy_score(y1, y2),
        hamming_loss(y1, y2),
        jaccard_score(y1, y2, average='samples'),
        f1_score(y1, y2, average='samples'),
    ]

In [28]:
# Two independent, initially empty result tables (validation and test) sharing one layout.
df_scores_valid, df_scores_test = (
    pd.DataFrame(columns=['Method', 'Accuracy', 'Hamming Loss', 'Jaccard Score', 'F_1 Score'])
    for _ in range(2)
)

In [29]:
def hermetian(H):
    """Return the conjugate (Hermitian) transpose of the array ``H``."""
    return H.T.conj()

In [30]:
def SNR1(H, P, sigma, i):
    """Signal-to-noise-plus-interference ratio of user ``i`` with all other users interfering.

    Evaluates ``P[i] * h_i^H (sum_{j != i} P[j] h_j h_j^H + sigma^2 I)^{-1} h_i``
    where ``h_j`` is column ``j`` of ``H``.

    Parameters
    ----------
    H : numpy.ndarray
        Matrix with one column per user, shape ``(N, K)``.
    P : numpy.ndarray
        1-D array with one power value per user.
    sigma : float
        Noise standard deviation (it enters as ``sigma**2``).
    i : int
        Index of the user of interest.

    Returns
    -------
    numpy.ndarray
        Real part of the ratio, as a ``(1, 1)`` array.
    """
    n_antennas = H.shape[0]
    n_users = H.shape[1]
    own_power = P[i]
    own_channel = H[:, i][:, np.newaxis]  # column vector (N, 1)
    # Every user except i acts as an interferer.
    interferers = np.delete(np.arange(n_users), i)
    interferer_powers = np.delete(P, i)
    H_int = H[:, interferers]
    covariance = H_int.dot(np.diag(interferer_powers)).dot(hermetian(H_int)) + sigma**2*np.identity(n_antennas)
    ratio = own_power*hermetian(own_channel).dot(inv(covariance)).dot(own_channel)
    return ratio.real

In [31]:
def SNR2(H, P, sigma, i, set_I1):
    """Signal-to-noise-plus-interference ratio of user ``i`` once the users in ``set_I1`` are removed.

    Same expression as ``SNR1``, but the interference sum only runs over the
    users that are neither ``i`` nor members of ``set_I1``.

    Parameters
    ----------
    H : numpy.ndarray
        Matrix with one column per user, shape ``(N, K)``.
    P : numpy.ndarray
        1-D array with one power value per user.
    sigma : float
        Noise standard deviation (it enters as ``sigma**2``).
    i : int
        Index of the user of interest.
    set_I1 : list of int
        Indices of the users excluded from the interference.

    Returns
    -------
    numpy.ndarray
        Real part of the ratio, as a ``(1, 1)`` array.
    """
    n_antennas = H.shape[0]
    n_users = H.shape[1]
    own_power = P[i]
    excluded = set_I1 + [i]
    own_channel = H[:, i][:, np.newaxis]  # column vector (N, 1)
    remaining_powers = np.delete(P, excluded)
    remaining = [j for j in range(n_users) if j not in excluded]
    H_rem = H[:, remaining]
    covariance = H_rem.dot(np.diag(remaining_powers)).dot(hermetian(H_rem)) + sigma**2*np.identity(n_antennas)
    ratio = own_power*hermetian(own_channel).dot(inv(covariance)).dot(own_channel)
    return ratio.real

In [32]:
def sum_rate(df, sigma=1, data_dir='test_Data/'):
    """Sum rate achieved by the user configuration stored in one table row.

    Users labelled 0 form the set I1 and are evaluated with ``SNR1`` (all
    other users interfere). Users labelled 1 form the set I2 and are
    evaluated with ``SNR2`` (the I1 users are excluded from the
    interference). The rate of a user is ``log(1 + SNR)`` with the natural
    logarithm.

    Meant to be applied row-wise, e.g. ``frame.apply(sum_rate, axis=1)``;
    extra keyword arguments can be forwarded through ``apply``.

    Parameters
    ----------
    df : mapping (a DataFrame row)
        Must provide the HDF5 file names under ``'H'`` and ``'P'`` and the
        labels under ``'User1'`` ... ``'User<K>'``, ``K`` being the number of
        columns of ``H``.
    sigma : float, optional
        Noise standard deviation passed to ``SNR1`` / ``SNR2``. Defaults to 1.
    data_dir : str, optional
        Directory that contains the HDF5 files. Defaults to ``'test_Data/'``.

    Returns
    -------
    float
        Sum of the rates of all users.
    """
    h_file, p_file = df['H'], df['P']
    with h5py.File(os.path.join(data_dir, h_file), 'r') as hf:
        H = hf[os.path.splitext(h_file)[0]][:]
    with h5py.File(os.path.join(data_dir, p_file), 'r') as hf:
        P = hf[os.path.splitext(p_file)[0]][:]
    P_diag = P.diagonal()  # the SNR helpers expect one power value per user
    K = H.shape[1]
    cols = ['User'+str(i+1) for i in range(K)]
    conf = df[cols].values
    set_I1 = np.where(conf == 0)[0].tolist()
    set_I2 = np.where(conf == 1)[0].tolist()
    SNR_1 = [SNR1(H, P_diag, sigma, s) for s in set_I1] # SNR_i^(1) for i in I_1
    SNR_2 = [SNR2(H, P_diag, sigma, s, set_I1) for s in set_I2] # SNR_i^(2) for i in I_2
    R_1 = np.sum(np.log(1+np.array(SNR_1)))
    R_2 = np.sum(np.log(1+np.array(SNR_2)))
    return R_1 + R_2

In [33]:
# Table for the predicted configuration: same columns as test_df, with the file
# names and feature tensors copied over. The 'User<i>' columns stay empty here
# and are filled with the model's predictions further down.
pred_test_df = pd.DataFrame(columns=test_df.columns)
pred_test_df['H'] = test_df['H']
pred_test_df['P'] = test_df['P']
pred_test_df['Matrix'] = test_df['Matrix']

In [34]:
# Names of the K label columns: 'User1' ... 'User<K>'.
cols = ['User' + str(i+1) for i in range(K)]

In [37]:
import lightgbm as lgb
# Classifier chain over the user labels with LightGBM as the base binary
# classifier (100 boosting rounds, learning rate 0.1).
classifier = ClassifierChain(
    classifier = lgb.LGBMClassifier(n_estimators = 100, learning_rate = 0.1),
    require_dense = [False, True]
)

# Trailing ';' suppresses the repr of the fitted estimator in the cell output.
classifier.fit(X_tr, y_train);

Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations
Finished loading model, total used 100 iterations


In [38]:
out = classifier.predict_proba(X_te)
# Threshold the per-label probabilities at 0.5 in one vectorised step
# (densify first if the chain returned a sparse matrix).
out_dense = out.toarray() if hasattr(out, 'toarray') else np.asarray(out)
y_pred = (out_dense[:len(y_test), :y_test.shape[1]] >= 0.5).astype(int)

In [39]:
# Append one result row for this model (re-running the cell appends another copy).
df_scores_test.loc[len(df_scores_test)] = ['CC-LGBM', *scores(y_test, y_pred)]
df_scores_test

Unnamed: 0,Method,Accuracy,Hamming Loss,Jaccard Score,F_1 Score
0,CC-LGBM,0.062,0.29095,0.465306,0.605706


In [40]:
# Store the predicted 0/1 configuration in the 'User<i>' columns of the prediction table.
pred_test_df[cols] = y_pred
pred_test_df.head()

Unnamed: 0,H,P,User1,User2,User3,User4,User5,User6,User7,User8,User9,User10,Matrix
0,H_1.h5,P_1.h5,0,1,0,0,1,0,0,0,1,1,"[[[1.516401437104227, 0.0], [129.2895136471787..."
1,H_2.h5,P_2.h5,1,0,0,0,1,0,1,1,0,0,"[[[15.807212199907328, 0.0], [5.22225822717810..."
2,H_3.h5,P_3.h5,0,0,1,0,0,1,1,0,0,1,"[[[5.437638959647273, 0.0], [5.193951816534471..."
3,H_4.h5,P_4.h5,0,1,0,0,1,1,0,0,0,0,"[[[3.198971794472364, 0.0], [8.187352466856316..."
4,H_5.h5,P_5.h5,0,1,0,0,0,0,1,1,0,1,"[[[2.720206087097921, 0.0], [7.87050437793912,..."


In [41]:
# Sum rate of the true configuration (test_df) and of the predicted one
# (pred_test_df), both computed row by row on the same H and P files.
test_df['sum_rate'] = test_df.apply(sum_rate, axis = 1)
pred_test_df['sum_rate'] = pred_test_df.apply(sum_rate, axis = 1)

In [42]:
from sklearn.metrics import mean_squared_error
# Error of the sum rate obtained with the predicted configuration relative to
# the one obtained with the true configuration.
true_rate = test_df['sum_rate']
pred_rate = pred_test_df['sum_rate']
rmse = np.sqrt(mean_squared_error(true_rate, pred_rate))
mape = np.mean(np.abs((true_rate - pred_rate) / true_rate)) * 100
print('RMSE of the sum rate on the test set is:', round(rmse,2))
print('MAPE of the sum rate on the test set is:', round(mape,2))

RMSE of the sum rate on the test set is: 5.68
MAPE of the sum rate on the test set is: 17.74


Let's time the prediction of one instance using the proposed ML model.

In [43]:
import time
# Time the prediction of a single test instance over 10 runs.
# - The sample is kept 2-D, shape (1, n_features), the usual estimator input layout.
# - perf_counter is a monotonic high-resolution clock intended for measuring short durations.
# - The prediction is not stored, so the test-set `y_pred` computed earlier is left intact.
sample = X_te[1:2, :]
total = []
for i in range(10):
    t0 = time.perf_counter()
    classifier.predict(sample)
    t1 = time.perf_counter()
    total.append(t1 - t0)

In [44]:
# Average prediction time per instance, in seconds.
sum(total) / len(total)

0.024399924278259277