# Identification system based on behavioral biometrics: typing errors

## Configuration
### Load dependencies

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample, shuffle
from tensorflow import config

from classifiers import build_tuned_nn, build_tuned_rfc, param_grid
from create_model import create_dataset, NUMBER_OF_FEATURES, N_GRAM_SIZE
from cv import run_cv, run_cv_neural_network

2023-08-22 06:55:52.189958: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-22 06:55:52.230315: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-22 06:55:52.230947: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  0
[]
2023-08-22 06:55:53,738 — matplotlib — DEBUG — matplotlib data path: /home/bartek/PycharmProjects/Biometric_system_based_on_behavioral_biometrics_typing_errors_in_English/venv/lib/python3.11/site-packages/matplotlib/mpl-data
2023-08-22 06:55:53,743 — matplotlib — DEBUG — CONFIGDIR=/home/bartek/.config/matplotlib
2023-08-22 06:55:53,745 — matplotlib — DEBUG — interactive is False
2023-08-22 06:55:53,745 — matplotlib — DEBUG — platform is linux
2023-08-22 06:55:53,797 — matplotlib — DEBUG — CACHEDIR=/home/bartek/.cache/matplotlib
2023-08-22 06:55:53,800 — matplotlib.font_manager — DEBUG — Using fontManager instance from /home/bartek/.cache/matplotlib/fontlist-v330.json


#### Configure GPUs

In [2]:
# List the GPUs TensorFlow can see, turn on memory growth for each of them
# (so TensorFlow is asked not to reserve all device memory up front), and
# print every device that was configured. Nothing is printed without a GPU.
gpus = config.experimental.list_physical_devices('GPU')
for physical_gpu in gpus:
    config.experimental.set_memory_growth(physical_gpu, True)
    print(physical_gpu)

## Define classifiers

In [3]:
# Every entry of CLASSIFIERS is a tuple (estimator, hyper-parameter grid, name).
# The grid is looked up in ``param_grid`` under the classifier's display name,
# so each name only has to be spelled once below.
CLASSIFIERS = [
    (estimator, param_grid[label], label)
    for label, estimator in [
        (
            'Random Forest',
            RandomForestClassifier(
                max_depth=None,               # Allow trees to grow as deep as they can
                min_impurity_decrease=0.001,  # Avoid splits that don't provide at least 0.001 impurity decrease
            ),
        ),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('SVC', SVC(probability=True)),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('MLP Classifier', MLPClassifier()),
    ]
]

## Load dataset

In [4]:
# Load the train/test features and labels plus the feature column names.
# NOTE(review): test_ratio=0.5 presumably holds out half of the data and
# ``scaler`` is presumably applied to the features - confirm in
# create_model.create_dataset.
X, y, X_test, y_test, cols = create_dataset(
    if_separate_words=True,
    test_ratio=0.5,
    verbose_mode=True,
    scaler=Normalizer(),
)

2023-08-22 06:55:54,105 — logger — DEBUG — ----- load_data: start -----
2023-08-22 06:55:54,107 — logger — DEBUG — ----- get_misspelled_words_df_from_json: start -----
2023-08-22 06:55:54,202 — logger — DEBUG — ----- get_misspelled_words_df_from_json: finished in 0.10 seconds)-----
2023-08-22 06:55:54,203 — logger — DEBUG — ----- get_misspelled_words_df_from_json: start -----
2023-08-22 06:55:54,377 — logger — DEBUG — ----- get_misspelled_words_df_from_json: finished in 0.17 seconds)-----
2023-08-22 06:55:54,378 — logger — DEBUG — ----- get_misspelled_words_df_from_json: start -----
2023-08-22 06:55:54,467 — logger — DEBUG — ----- get_misspelled_words_df_from_json: finished in 0.09 seconds)-----
2023-08-22 06:55:54,467 — logger — DEBUG — ----- get_misspelled_words_df_from_json: start -----
2023-08-22 06:55:54,543 — logger — DEBUG — ----- get_misspelled_words_df_from_json: finished in 0.07 seconds)-----
2023-08-22 06:55:54,543 — logger — DEBUG — ----- get_misspelled_words_df_from_json: 

## Define train loop

In [5]:
def train_loop(model, params, X_train, y_train, X_test, y_test, name):
    """Run a user-vs-rest verification experiment for every user in ``y_train``.

    For each distinct label in ``y_train`` a balanced binary problem is built
    separately on the train and on the test split: all samples of that user
    (label 1) followed by an equally sized resample of the other users'
    samples (label 0). Both balanced sets are shuffled and passed to
    ``run_cv_neural_network`` when ``name`` contains "neural network"
    (case-insensitive), otherwise to ``run_cv``.

    Args:
        model: Estimator forwarded unchanged to the cross-validation helper.
        params: Hyper-parameter grid forwarded unchanged to the helper.
        X_train, y_train: Training features and user labels.
        X_test, y_test: Test features and user labels.
        name: Classifier display name; only used to pick the helper.

    Returns:
        pandas.DataFrame holding the helper's result rows for every user, in
        ``np.unique(y_train)`` order, with a ``user`` column added. The
        columns ``clf, accuracy, f1, precision, recall, user`` always come
        first (filled with NaN if the helper did not report them), followed
        by any extra columns the helper returned.

    Note:
        The helpers are expected to return a pandas DataFrame (inferred from
        how their result is used here - confirm in the ``cv`` module).
    """
    base_columns = ['clf', 'accuracy', 'f1', 'precision', 'recall', 'user']

    def _balanced_binary_set(X_part, y_part, user):
        # 1 for user, 0 for non-user; both classes get the same sample count.
        user_mask = (y_part == user)
        n_user = user_mask.sum()
        X_balanced = np.concatenate([X_part[user_mask],
                                     resample(X_part[y_part != user],
                                              n_samples=n_user,
                                              random_state=42)])
        y_balanced = np.concatenate([np.ones(n_user), np.zeros(n_user)])
        return shuffle(X_balanced, y_balanced, random_state=42)

    use_nn_cv = 'neural network' in name.lower()
    per_user_results = []
    for user in np.unique(y_train):
        X_balanced_train, y_balanced_train = _balanced_binary_set(X_train, y_train, user)
        X_balanced_test, y_balanced_test = _balanced_binary_set(X_test, y_test, user)

        if use_nn_cv:
            res = run_cv_neural_network(model, params,
                                        X_balanced_train, y_balanced_train,
                                        X_balanced_test, y_balanced_test,
                                        is_multiclass=False)
        else:
            res = run_cv(model, params,
                         X_balanced_train, y_balanced_train,
                         X_balanced_test, y_balanced_test,
                         is_multiclass=False, predef_model=False)

        # assign() returns a new frame, so the collected entries never alias
        # an object the helper might hand out again.
        per_user_results.append(res.assign(user=user))

    if not per_user_results:
        return pd.DataFrame(columns=base_columns)

    # Concatenate once instead of growing a frame inside the loop: no
    # repeated copying of earlier rows and no join on every shared column.
    combined = pd.concat(per_user_results, ignore_index=True)
    extra_columns = [col for col in combined.columns if col not in base_columns]
    return combined.reindex(columns=base_columns + extra_columns)
    
        

In [6]:
# Run the per-user verification experiment for every configured classifier
# and append its results to one CSV file per classifier.
RESULTS_DIR = Path('results_verification')
RESULTS_DIR.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing directories

for clf in CLASSIFIERS:
    estimator, grid, clf_name = clf
    print(estimator, grid, clf_name)
    res = train_loop(estimator, grid, X, y, X_test, y_test, name=clf_name)
    res['number of features'] = NUMBER_OF_FEATURES
    res['ngram size'] = N_GRAM_SIZE
    res['columns'] = str(cols)

    results_path = RESULTS_DIR / f'{clf_name}.csv'
    # Results are appended so that earlier runs are kept. The header row is
    # written only when the file is new or empty; otherwise every re-run
    # would insert another header line in the middle of the data.
    write_header = not results_path.exists() or results_path.stat().st_size == 0
    res.to_csv(results_path, mode='a', header=write_header)

RandomForestClassifier(min_impurity_decrease=0.001) {'n_estimators': [10, 50, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'max_features': ['log2', 'sqrt', None]} Random Forest


AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'

### tmp

In [None]:
def train_loop_test(X_train, y_train, X_test, y_test, model, params):
    """Scratch variant of the per-user verification loop that always uses ``run_cv``.

    For each distinct label in ``y_train`` a balanced binary problem is built
    separately on the train and on the test split: all samples of that user
    (label 1) followed by an equally sized resample of the other users'
    samples (label 0). Both balanced sets are shuffled and passed to
    ``run_cv`` with ``predef_model=False`` and
    ``plot_path='verification_results/'``.

    Args:
        X_train, y_train: Training features and user labels.
        X_test, y_test: Test features and user labels.
        model: Estimator forwarded unchanged to ``run_cv``.
        params: Hyper-parameter grid forwarded unchanged to ``run_cv``.

    Returns:
        pandas.DataFrame holding ``run_cv``'s result rows for every user, in
        ``np.unique(y_train)`` order, with a ``user`` column added. The
        columns ``clf, accuracy, f1, precision, recall, user`` always come
        first (filled with NaN if ``run_cv`` did not report them), followed
        by any extra columns it returned.

    Note:
        ``run_cv`` is expected to return a pandas DataFrame (inferred from
        how its result is used here - confirm in the ``cv`` module).
    """
    base_columns = ['clf', 'accuracy', 'f1', 'precision', 'recall', 'user']

    def _balanced_binary_set(X_part, y_part, user):
        # 1 for user, 0 for non-user; both classes get the same sample count.
        user_mask = (y_part == user)
        n_user = user_mask.sum()
        X_balanced = np.concatenate([X_part[user_mask],
                                     resample(X_part[y_part != user],
                                              n_samples=n_user,
                                              random_state=42)])
        y_balanced = np.concatenate([np.ones(n_user), np.zeros(n_user)])
        return shuffle(X_balanced, y_balanced, random_state=42)

    per_user_results = []
    for user in np.unique(y_train):
        X_balanced_train, y_balanced_train = _balanced_binary_set(X_train, y_train, user)
        X_balanced_test, y_balanced_test = _balanced_binary_set(X_test, y_test, user)

        # NOTE(review): plots go to 'verification_results/' while the result
        # CSVs elsewhere in this notebook are written to
        # 'results_verification/' - confirm which directory is intended.
        res = run_cv(model, params,
                     X_balanced_train, y_balanced_train,
                     X_balanced_test, y_balanced_test,
                     predef_model=False, plot_path='verification_results/')

        # assign() returns a new frame, so the collected entries never alias
        # an object run_cv might hand out again.
        per_user_results.append(res.assign(user=user))

    if not per_user_results:
        return pd.DataFrame(columns=base_columns)

    # Concatenate once instead of growing a frame inside the loop: no
    # repeated copying of earlier rows and no join on every shared column.
    combined = pd.concat(per_user_results, ignore_index=True)
    extra_columns = [col for col in combined.columns if col not in base_columns]
    return combined.reindex(columns=base_columns + extra_columns)