# Identifiation system basing on behavioral biometrics: typing errors

## Configuration
### Load dependencies

In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from classifiers import build_tuned_nn, build_tuned_rfc, param_grid
from sklearn.neural_network import MLPClassifier
# from create_model import create_dataset, user_names
from sklearn.svm import SVC
from sklearn.utils import shuffle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import ClassifierMixin
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import resample
from tensorflow import config
from cv import run_cv

[]


#### Configure GPUs

In [4]:
gpus = config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    config.experimental.set_memory_growth(gpu, True)
    print(gpu)

## Define classifiers

In [5]:
CLASSIFIERS = [
    (RandomForestClassifier(), param_grid['Random Forest'], 'Random Forest'),

    (KNeighborsClassifier(), param_grid['K-Nearest Neighbors'], 'K-Nearest Neighbors'),
    (SVC(probability=True), param_grid['SVC'], 'SVC'),

        (GradientBoostingClassifier(),
         param_grid['Gradient Boosting'],
         'Gradient Boosting'),

        (MLPClassifier(), param_grid['MLP Classifier'], 'MLP Classifier'),
]

## Load dataset

In [6]:
X_train, y_train, X_test, y_test = create_dataset(test_ratio=0.5)
# X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=365)

2023-07-29 17:49:05,990 — logger — DEBUG — ----- load_data: start -----
2023-07-29 17:49:06,763 — logger — DEBUG — ----- load_data: finished in 0.77 seconds)-----
2023-07-29 17:49:06,763 — logger — DEBUG — ----- create_labeled_test_and_train_buckets: start -----
2023-07-29 17:49:06,765 — logger — DEBUG — ----- create_labeled_test_and_train_buckets: finished in 0.00 seconds)-----
2023-07-29 17:49:06,766 — logger — DEBUG — ----- scale_data: start -----
2023-07-29 17:49:06,770 — logger — DEBUG — ----- scale_data: finished in 0.00 seconds)-----
2023-07-29 17:49:06,770 — logger — DEBUG — ----- choose_features: start -----
2023-07-29 17:49:06,773 — logger — DEBUG — ----- choose_features: finished in 0.00 seconds)-----
2023-07-29 17:49:06,773 — logger — DEBUG — ----- create_users_ngrams: start -----
2023-07-29 17:49:06,775 — logger — DEBUG — ----- _create_an_user_ngram: start -----
2023-07-29 17:49:13,056 — logger — DEBUG — ----- _create_an_user_ngram: finished in 6.28 seconds)-----
2023-07-2

## Define train loop

In [7]:
def train_loop(X_train, y_train, X_test, y_test, model, params, name):
    results = pd.DataFrame(columns=['clf', 'accuracy', 'f1', 'precision', 'recall', 'user'])
    for user in np.unique(y_train):
        # Create balanced datasets for user and non-user
        user_mask_train = (y_train == user)
        non_user_mask_train = (y_train != user)

        user_mask_test = (y_test == user)
        non_user_mask_test = (y_test != user)

        # Concatenate user and non-user samples (equal numbers)
        X_balanced_train = np.concatenate([X_train[user_mask_train],
                                           resample(X_train[non_user_mask_train],
                                                    n_samples=user_mask_train.sum(),
                                                    random_state=42)])

        y_balanced_train = np.concatenate([np.ones(user_mask_train.sum()),
                                           np.zeros(user_mask_train.sum())])  # 1 for user, 0 for non-user

        X_balanced_test = np.concatenate([X_test[user_mask_test],
                                          resample(X_test[non_user_mask_test],
                                                   n_samples=user_mask_test.sum(),
                                                   random_state=42)])

        y_balanced_test = np.concatenate([np.ones(user_mask_test.sum()),
                                          np.zeros(user_mask_test.sum())])  # 1 for user, 0 for non-user

        # Shuffling the balanced train and test sets
        X_balanced_train, y_balanced_train = shuffle(X_balanced_train, y_balanced_train, random_state=42)
        X_balanced_test, y_balanced_test = shuffle(X_balanced_test, y_balanced_test, random_state=42)
        if 'neural network' in name.lower():
            res = run_cv(model, params,  X_balanced_train, y_balanced_train,  X_balanced_test, y_balanced_test, predef_model=True)
            
        else: 
            res = run_cv(model, params, X_balanced_train, y_balanced_train,  X_balanced_test, y_balanced_test,  predef_model=False)
        res['user'] = user
        results = pd.merge(results, res, how='outer')
    
        

### tmp

In [8]:
nn = KerasNNClf(X_train.shape[1], len(user_names.keys()), True).create_neural_network()
      

In [9]:
def train_loop_test(X_train, y_train, X_test, y_test, model, params):
    results = pd.DataFrame(columns=['clf', 'accuracy', 'f1', 'precision', 'recall', 'user'])
    for user in np.unique(y_train):
        # Create balanced datasets for user and non-user
        user_mask_train = (y_train == user)
        non_user_mask_train = (y_train != user)

        user_mask_test = (y_test == user)
        non_user_mask_test = (y_test != user)

        # Concatenate user and non-user samples (equal numbers)
        X_balanced_train = np.concatenate([X_train[user_mask_train],
                                           resample(X_train[non_user_mask_train],
                                                    n_samples=user_mask_train.sum(),
                                                    random_state=42)])

        y_balanced_train = np.concatenate([np.ones(user_mask_train.sum()),
                                           np.zeros(user_mask_train.sum())])  # 1 for user, 0 for non-user

        X_balanced_test = np.concatenate([X_test[user_mask_test],
                                          resample(X_test[non_user_mask_test],
                                                   n_samples=user_mask_test.sum(),
                                                   random_state=42)])

        y_balanced_test = np.concatenate([np.ones(user_mask_test.sum()),
                                          np.zeros(user_mask_test.sum())])  # 1 for user, 0 for non-user

        # Shuffling the balanced train and test sets
        X_balanced_train, y_balanced_train = shuffle(X_balanced_train, y_balanced_train, random_state=42)
        X_balanced_test, y_balanced_test = shuffle(X_balanced_test, y_balanced_test, random_state=42)
        if  isinstance(model, ClassifierMixin):
            res = run_cv(model, None,  X_balanced_train, y_balanced_train,  X_balanced_test, y_balanced_test, predef_model=True)
            
        else: 
            res = run_cv(model, params, X_balanced_train, y_balanced_train,  X_balanced_test, y_balanced_test, predef_model=False)
        res['user'] = user
        results = pd.merge(results, res, how='outer')
    return results

In [5]:
train_loop_test(X_train, y_train, X_test, y_test, nn, None)

NameError: name 'train_loop_test' is not defined

tmp #2 

In [1]:
from create_model import create_dataset, user_names
X_train, y_train, X_test, y_test = create_dataset(test_ratio=0.5, if_separate_words=False)

2023-07-29 19:16:30.260320: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-29 19:16:30.310043: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-29 19:16:30.310885: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  0
2023-07-29 19:16:31,617 — logger — DEBUG — ----- load_data: start -----
2023-07-29 19:16:32,252 — logger — DEBUG — ----- load_data: finished in 0.63 seconds)-----
2023-07-29 19:16:32,252 — logger — DEBUG — ----- create_labeled_test_and_train_buckets: start -----
2023-07-29 19:16:32,253 — logger — DEBUG — ----- create_labeled_test_and_train_buckets: finished in 0.00 seconds)-----
2023-07-29 19:16:32,253 — logger — DEBUG — ----- scale_data: start -----
2023-07-29 19:16:32,256 — logger — DEBUG — ----- scale_data: finished in 0.00 seconds)-----
2023-07-29 19:16:32,257 — logger — DEBUG — ----- choose_features: start -----
2023-07-29 19:16:32,259 — logger — DEBUG — ----- choose_features: finished in 0.00 seconds)-----
2023-07-29 19:16:32,259 — logger — DEBUG — ----- create_users_ngrams: start -----
2023-07-29 19:16:32,261 — logger — DEBUG — ----- _create_an_user_ngram: start -----
2023-07-29 19:16:38,431 — logger — DEBUG — ----- _create_an_user_ngram: finished in 6.17 

In [4]:
history_dict = {}
for user in np.unique(y_train):
        # Create balanced datasets for user and non-user
        user_mask_train = (y_train == user)
        non_user_mask_train = (y_train != user)

        user_mask_test = (y_test == user)
        non_user_mask_test = (y_test != user)

        # Concatenate user and non-user samples (equal numbers)
        X_balanced_train = np.concatenate([X_train[user_mask_train],
                                           resample(X_train[non_user_mask_train],
                                                    n_samples=user_mask_train.sum(),
                                                    random_state=42)])

        y_balanced_train = np.concatenate([np.ones(user_mask_train.sum()),
                                           np.zeros(user_mask_train.sum())])  # 1 for user, 0 for non-user

        X_balanced_test = np.concatenate([X_test[user_mask_test],
                                          resample(X_test[non_user_mask_test],
                                                   n_samples=user_mask_test.sum(),
                                                   random_state=42)])

        y_balanced_test = np.concatenate([np.ones(user_mask_test.sum()),
                                          np.zeros(user_mask_test.sum())])  # 1 for user, 0 for non-user

        # Shuffling the balanced train and test sets
        X_balanced_train, y_balanced_train = shuffle(X_balanced_train, y_balanced_train, random_state=42)
        X_balanced_test, y_balanced_test = shuffle(X_balanced_test, y_balanced_test, random_state=42)

        # Create and compile the model
        model = KerasNNClf(X_train.shape[1], len(user_names.keys()), True).create_neural_network()

        # Train the model with fewer epochs and a larger batch size
        history = model.fit(X_balanced_train, y_balanced_train, epochs=100, batch_size=64, callbacks=[model.earlystopping, model.logger])
        history_dict[user] = history
        # Evaluate the model
        loss, accuracy = model.evaluate(X_balanced_test, y_balanced_test)

        print('Test loss:', loss)
        print('Test accuracy:', accuracy)

        # Generate and print the confusion matrix
        y_pred = model.predict(X_balanced_test).ravel()

        y_pred_class = [1 if prob >= 0.5 else 0 for prob in y_pred]
        print(confusion_matrix(y_balanced_test, y_pred_class))

InvalidParameterError: The 'n_samples' parameter of resample must be an int in the range [1, inf) or None. Got 0 instead.