# Identification system based on behavioral biometrics: typing errors - dataset analysis

## Configuration
### Load dependencies

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from classifiers import build_tuned_nn, build_tuned_rfc, param_grid
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from tensorflow import config
from cv import run_cv_neural_network
import pandas as pd


#### Configure GPUs

In [None]:
# Ask TensorFlow for the visible GPU devices, switch memory growth on for each
# one, and echo the device so the configured hardware shows up in the output.
gpus = config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
    config.experimental.set_memory_growth(device=gpu, enable=True)
    print(gpu)

## Define classifiers

In [None]:
# Each entry is (estimator instance, hyper-parameter grid, display name).
# The display name doubles as the lookup key into `param_grid`.
CLASSIFIERS = [
    (estimator, param_grid[label], label)
    for label, estimator in (
        ('Random Forest', RandomForestClassifier()),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('SVC', SVC(probability=True)),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('MLP Classifier', MLPClassifier()),
    )
]

## Load dataset

In [None]:
from create_model import create_dataset
from sklearn.preprocessing import Normalizer
# Build the four splits with the project helper, passing a scikit-learn
# Normalizer instance as the scaler.
X_train, y_train, X_test, y_test = create_dataset(
    test_ratio=0.5,
    if_separate_words=True,
    scaler=Normalizer(),
)

In [None]:
# Tabulate how often each distinct row of y_test occurs
# (presumably one label per sample, i.e. a class-balance check - confirm).
pd.DataFrame(y_test).value_counts()


In [None]:
# t-SNE scatter plot of the training set. Currently disabled: the TSNE import,
# the embedding and the plotting calls below are all commented out.
# from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
#
# data_tsne = TSNE().fit_transform(X_train)
# plt.scatter(data_tsne[:, 0], data_tsne[:, 1], y_train)
# plt.show()
# The only active statement evaluates y_train (shown as the cell output when
# this runs as a notebook cell).
y_train

In [None]:
from sklearn.decomposition import PCA
import numpy as np

def determine_optimal_features(dataset, thresh=0.95):
    """Return how many principal components are needed to exceed ``thresh``.

    A PCA with default settings is fitted on ``dataset`` and the
    explained-variance ratios are accumulated in component order.

    Args:
        dataset: Data accepted by ``PCA.fit``.
        thresh: Cumulative explained-variance ratio that has to be exceeded.
            Defaults to 0.95.

    Returns:
        int: The smallest number of leading components whose cumulative
        explained-variance ratio is strictly greater than ``thresh``. When no
        prefix exceeds the threshold (for example ``thresh >= 1``), the total
        number of fitted components is returned.
    """
    pca = PCA()
    pca.fit(dataset)

    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    exceeds = cumulative_variance > thresh

    # np.argmax on an all-False mask returns 0, which would wrongly report that
    # a single component is enough. In that case every component is needed.
    if not exceeds.any():
        return len(cumulative_variance)

    # argmax gives the 0-based index of the first True; +1 turns it into a count.
    return int(np.argmax(exceeds)) + 1

In [None]:
# Sweep over candidate values and record, per value, how many principal
# components determine_optimal_features reports for the training split.
# The inner sweep over NUMBER_OF_FEATURES is disabled, so it stays fixed at 15.
# NOTE(review): the loop variable is only stored in the result row; it is not
# passed to create_dataset here, so every iteration requests the dataset with
# identical arguments. Confirm how AMOUNT_OF_N_GRAMS_PERS_USER is supposed to
# reach the dataset builder.
results = []
possible_values = [*range(2, 11)]
num_of_features = [*range(2, 11)]
NUMBER_OF_FEATURES = 15
for AMOUNT_OF_N_GRAMS_PERS_USER in possible_values:
    X_train, y_train, X_test, y_test = create_dataset(
        test_ratio=0.5,
        if_separate_words=True,
        scaler=Normalizer(),
    )
    n_components_needed = determine_optimal_features(X_train)
    results.append(dict(
        AMOUNT_OF_N_GRAMS_PERS_USER=AMOUNT_OF_N_GRAMS_PERS_USER,
        NUMBER_OF_FEATURES=NUMBER_OF_FEATURES,
        n_components_needed=n_components_needed,
    ))

In [None]:
# Tabulate the sweep: one row per dict collected in `results`.
pd.DataFrame(results)

In [None]:
from sklearn.manifold import TSNE

# 2-D density histogram of the t-SNE embedding of the training set.
# `data_tsne` is not assigned by any earlier active statement (its only
# assignment is commented out), so the embedding is computed here; otherwise
# this cell fails with a NameError.
data_tsne = TSNE().fit_transform(X_train)
plt.hist2d(data_tsne[:, 0], data_tsne[:, 1], bins=30, cmap='Blues')
plt.colorbar()
plt.show()

In [None]:
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')  # create a 3D axis object
# ax.scatter(data_tsne[:, 0], data_tsne[:, 1], y_train,  c=y_train, marker='.')  # scatter plot
# plt.show()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Project the training set with a default PCA and scatter the first two
# projected columns against each other, coloured by y_train.
pca_data = PCA().fit_transform(X_train)
plt.scatter(x=pca_data[:, 0], y=pca_data[:, 1], c=y_train)
plt.show()

In [None]:
# 3-D view of the training projection: first two PCA columns on x/y and the
# label on z. The argument order matches the test-set 3-D scatter so the two
# figures can be compared directly (the label used to be on the x axis here).
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')  # create a 3D axis object
ax.scatter(pca_data[:, 0], pca_data[:, 1], y_train, c=y_train, marker='.')  # scatter plot
plt.show()

In [None]:
# Fit a PCA that keeps every component of the training set.
# The default n_components keeps min(n_samples, n_features) components, so
# this also works when there are fewer samples than features, where
# n_components=X_train.shape[1] is rejected by scikit-learn. It matches the
# other PCA() calls in this notebook.
pca = PCA()
pca.fit(X_train)

In [None]:
# Scree plot: eigenvalue (explained variance) of each principal component.
plt.ylabel('Eigenvalues')
plt.xlabel('Number of features')
# Explicit 1-based x values: without them matplotlib uses the 0-based array
# index, so the first component would be drawn at x = 0.
plt.plot(np.arange(1, len(pca.explained_variance_) + 1), pca.explained_variance_)

In [None]:
# Show the explained-variance ratio of every fitted component as a one-column table.
pd.DataFrame(pca.explained_variance_ratio_)

In [None]:
# Print every input feature index with its weight in the first principal
# component, ordered from the largest absolute weight to the smallest.
for idx in np.flip(np.argsort(np.abs(pca.components_[0]))):
    print(f"Feature {idx}, Weight: {pca.components_[0][idx]:.4f}")


In [None]:
# Cumulative explained variance, in percent, as a function of how many
# leading components are kept.
variance = pca.explained_variance_ratio_
var = np.cumsum(variance) * 100
plt.ylabel('% variance explained')
plt.xlabel('Number of features')
# Explicit 1-based x values: var[0] is the share explained by ONE component,
# so plotting against the implicit 0-based index would shift the curve by one.
plt.plot(np.arange(1, len(var) + 1), var)

In [None]:
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')  # create a 3D axis object
# ax.scatter(data_tsne[:, 0], data_tsne[:, 1], y_train, c=y_train, marker='.')  # scatter plot
# plt.show()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Same 2-D scatter as for the training set, but for the test split.
# NOTE(review): a fresh PCA is fitted on X_test here (and `pca_data` is
# overwritten), so the axes are not the ones of a PCA fitted on X_train.
# Confirm that comparing the two scatter plots this way is intended.
pca_data = PCA().fit_transform(X_test)
plt.scatter(x=pca_data[:, 0], y=pca_data[:, 1], c=y_test)
plt.show()

In [None]:
# 3-D view of the test projection: first two PCA columns on x/y, label on z,
# points coloured by label.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(pca_data[:, 0], pca_data[:, 1], zs=y_test, c=y_test, marker='o')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Baseline: fit a default random forest on the training split and print a
# classification report for the training split first, then the test split.
clf = RandomForestClassifier()
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.5)
clf.fit(X_train, y_train)
for eval_X, eval_y in ((X_train, y_train), (X_test, y_test)):
    print(classification_report(eval_y, clf.predict(eval_X)))
# print(classification_report(y_train, clf.predict(X_train)))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

clf = RandomForestClassifier()
# Rebuild the dataset with test_ratio=0.01, then split what create_dataset
# returns as the training part in half: (X, y) for fitting and
# (X_test, y_test) as hold-out. The original X_test / y_test are overwritten.
X_train, y_train, X_test, y_test = create_dataset(test_ratio=0.01, if_separate_words=True, scaler=Normalizer())
X, X_test, y, y_test = train_test_split(X_train, y_train, test_size=0.5)
# Fit on the fitting half only: X_train still contains the rows that were just
# moved into X_test, so fitting on it would leak the hold-out into the model.
clf.fit(X, y)
# Pass the hold-out labels that belong to X_test. The previous call passed
# y_train, which has a different length than X_test.
# NOTE(review): assumes the signature is (X_train, y_train, X_test, y_test) - confirm in cv.py.
run_cv_neural_network(X, y, X_test, y_test)
# print(classification_report(y_train, clf.predict(X_train)))

In [None]:
from scipy.stats import ttest_ind
# Two-sample t-test (scipy's ttest_ind with default settings) of every column
# of `df` between the rows labelled is_malware == 1 and is_malware == 0.
# NOTE(review): `df` is not defined in the visible part of this notebook -
# confirm where it is loaded and that it has an `is_malware` column.
malware = df[df['is_malware'] == 1]
non_malware = df[df['is_malware'] == 0]

# Collect one row per tested column and build the frame once, instead of
# growing it with pd.concat on every iteration.
ttest_rows = []
for column in df.columns:
    # The label column is not a feature. The row append used to sit outside
    # this check, so the label column received a stale (or undefined) result.
    if column == 'is_malware':
        continue
    t_stat, p_val = ttest_ind(malware[column], non_malware[column])
    ttest_rows.append({'Feature': column, 'T-stat': t_stat, 'P-value': p_val})

ttest_results = pd.DataFrame(ttest_rows, columns=['Feature', 'T-stat', 'P-value'])

# One column per feature, rows 'T-stat' and 'P-value'. transpose must be
# called: without the parentheses the bound method itself was assigned.
ttest_results = ttest_results.set_index('Feature').transpose()

# Display the DataFrame
ttest_results