In [1]:
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# read the keystroke training set (Train_keystroke.csv) into a DataFrame
train_csv = 'ThreatFabric_MLOPS_Challenge/Train_keystroke.csv'
df_original = pd.read_csv(train_csv)
df_original

Unnamed: 0,user,press-0,release-0,press-1,release-1,press-2,release-2,press-3,release-3,press-4,...,press-8,release-8,press-9,release-9,press-10,release-10,press-11,release-11,press-12,release-12
0,1,0,120,216,312,424,496,592,664,808,...,1712,1760,1992,2064,2376,2448,2584,2632,2752,2824
1,1,0,95,168,265,360,455,527,599,736,...,1423,1471,1664,1711,1880,1952,2039,2111,2231,2279
2,1,0,71,143,231,783,903,1087,1159,1351,...,2039,2111,2271,2343,2487,2559,2679,2751,2871,2926
3,1,0,95,144,263,353,431,760,832,1159,...,3151,3223,3415,3463,3631,3703,3815,3887,3983,4055
4,1,0,70,166,238,310,406,526,598,710,...,1310,1382,1543,1605,1734,1806,1926,1998,2086,2182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,110,0,80,225,306,444,465,575,672,689,...,1638,1735,1785,1884,1928,2040,2057,2186,2284,2504
876,110,0,81,217,289,387,473,954,1057,1091,...,1755,1866,1921,2019,2161,2265,2351,2457,2561,2704
877,110,0,79,216,300,393,519,692,807,807,...,1591,1724,1763,1866,1879,2046,2048,2192,2303,2449
878,110,0,84,200,290,384,476,782,930,943,...,1537,1658,1760,1848,1831,1940,1944,2082,2174,2319


In [3]:
# extract the relevant columns
# Drop the label column by name instead of slicing by position: iloc[:, 1:] is only
# correct while 'user' happens to be the first column, and would silently leak the
# label into the features (and discard a timing column) if the CSV layout changed.
# The remaining press-*/release-* columns keep their original order.
df = df_original.drop(columns='user')
df

Unnamed: 0,press-0,release-0,press-1,release-1,press-2,release-2,press-3,release-3,press-4,release-4,...,press-8,release-8,press-9,release-9,press-10,release-10,press-11,release-11,press-12,release-12
0,0,120,216,312,424,496,592,664,808,856,...,1712,1760,1992,2064,2376,2448,2584,2632,2752,2824
1,0,95,168,265,360,455,527,599,736,807,...,1423,1471,1664,1711,1880,1952,2039,2111,2231,2279
2,0,71,143,231,783,903,1087,1159,1351,1454,...,2039,2111,2271,2343,2487,2559,2679,2751,2871,2926
3,0,95,144,263,353,431,760,832,1159,1207,...,3151,3223,3415,3463,3631,3703,3815,3887,3983,4055
4,0,70,166,238,310,406,526,598,710,758,...,1310,1382,1543,1605,1734,1806,1926,1998,2086,2182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,0,80,225,306,444,465,575,672,689,784,...,1638,1735,1785,1884,1928,2040,2057,2186,2284,2504
876,0,81,217,289,387,473,954,1057,1091,1189,...,1755,1866,1921,2019,2161,2265,2351,2457,2561,2704
877,0,79,216,300,393,519,692,807,807,894,...,1591,1724,1763,1866,1879,2046,2048,2192,2303,2449
878,0,84,200,290,384,476,782,930,943,978,...,1537,1658,1760,1848,1831,1940,1944,2082,2174,2319


In [4]:
# create a function to extract the four features (Hold Time, Press-Press time, Release-Release Time, Release-Press time)
def extract_features(df):
    """Compute the four keystroke timing features for a single typing sample.

    Parameters
    ----------
    df : array-like
        One row of timestamps laid out as press-0, release-0, press-1,
        release-1, ... Despite the parameter name this is a single row
        (pandas Series, list or NumPy array), not a whole DataFrame.

    Returns
    -------
    tuple of four numpy.ndarray
        ``(hold_time, press_press_time, release_release_time, release_press_time)``.
        ``hold_time`` has one entry per key (release - press of the same key);
        the other three have one entry per pair of consecutive keys:
        press - previous press, release - previous release, and
        press - previous release.

    Raises
    ------
    ValueError
        If the row holds an odd number of timestamps, i.e. a press without
        its matching release.
    """
    # Work on a plain array so every access is positional. Indexing a Series
    # with integer keys is label-based when the index holds integers and is a
    # deprecated fallback when it holds strings such as 'press-0'.
    timestamps = np.asarray(df)
    if timestamps.size % 2:
        raise ValueError(
            "expected an even number of timestamps (press/release pairs), got %d"
            % timestamps.size
        )

    presses = timestamps[0::2]   # even positions: key-down times
    releases = timestamps[1::2]  # odd positions: key-up times

    hold_time = releases - presses
    press_press_time = presses[1:] - presses[:-1]
    release_release_time = releases[1:] - releases[:-1]
    release_press_time = presses[1:] - releases[:-1]
    return hold_time, press_press_time, release_release_time, release_press_time

In [5]:
# start with four empty containers, one per timing feature, to be filled row by row
hold_time, press_press_time = [], []
release_release_time, release_press_time = [], []

In [6]:
# loop through each row of the dataset and extract the features
# The four lists are rebuilt from scratch in this cell, so running it more than once
# cannot append a second copy of every row and leave the features out of step with
# the labels. Rows are passed as plain NumPy arrays so the integer indexing inside
# extract_features is always positional rather than a pandas label lookup.
per_row = [extract_features(row) for row in df.to_numpy()]
hold_time = [ht for ht, _, _, _ in per_row]
press_press_time = [ppt for _, ppt, _, _ in per_row]
release_release_time = [rrt for _, _, rrt, _ in per_row]
release_press_time = [rpt for _, _, _, rpt in per_row]
len(hold_time)

880

In [7]:
# summarise every sample: one mean and one standard deviation per timing feature
# (np.std is called with its defaults, as before)
hold_time_mean = list(map(np.mean, hold_time))
press_press_time_mean = list(map(np.mean, press_press_time))
release_release_time_mean = list(map(np.mean, release_release_time))
release_press_time_mean = list(map(np.mean, release_press_time))

hold_time_std = list(map(np.std, hold_time))
press_press_time_std = list(map(np.std, press_press_time))
release_release_time_std = list(map(np.std, release_release_time))
release_press_time_std = list(map(np.std, release_press_time))
len(hold_time_mean)

880

In [14]:
# gather the eight per-sample statistics into a single feature table (one row per sample)
features = pd.DataFrame(
    dict(
        hold_time_mean=hold_time_mean,
        hold_time_std=hold_time_std,
        press_press_time_mean=press_press_time_mean,
        press_press_time_std=press_press_time_std,
        release_release_time_mean=release_release_time_mean,
        release_release_time_std=release_release_time_std,
        release_press_time_mean=release_press_time_mean,
        release_press_time_std=release_press_time_std,
    )
)
features

Unnamed: 0,hold_time_mean,hold_time_std,press_press_time_mean,press_press_time_std,release_release_time_mean,release_release_time_std,release_press_time_mean,release_press_time_std
0,72.000000,21.049392,229.333333,60.428102,225.333333,69.958718,157.333333,65.019655
1,70.384615,16.836492,185.916667,34.862727,182.000000,38.464269,113.666667,44.548351
2,80.307692,16.498879,239.250000,130.555943,237.916667,139.135700,156.833333,127.846805
3,72.000000,20.373437,331.916667,352.649604,330.000000,338.389568,259.916667,347.130249
4,74.769231,13.325244,173.833333,45.487788,176.000000,38.177218,100.833333,45.586609
...,...,...,...,...,...,...,...,...
875,111.307692,44.551413,190.333333,85.660506,202.000000,97.465378,88.083333,86.111806
876,103.076923,22.434388,213.416667,111.654570,218.583333,123.011828,113.666667,120.459767
877,123.615385,33.403221,191.916667,52.506283,197.500000,70.785474,70.166667,63.806783
878,108.230769,36.077169,181.166667,80.955578,186.250000,101.725222,76.000000,85.815694


In [9]:
# the target class of every sample is its user ID
class_label = df_original.loc[:, 'user'].values
class_label

array([  1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,
         2,   2,   2,   3,   3,   3,   3,   3,   3,   3,   3,   4,   4,
         4,   4,   4,   4,   4,   4,   5,   5,   5,   5,   5,   5,   5,
         5,   6,   6,   6,   6,   6,   6,   6,   6,   7,   7,   7,   7,
         7,   7,   7,   7,   8,   8,   8,   8,   8,   8,   8,   8,   9,
         9,   9,   9,   9,   9,   9,   9,  10,  10,  10,  10,  10,  10,
        10,  10,  11,  11,  11,  11,  11,  11,  11,  11,  12,  12,  12,
        12,  12,  12,  12,  12,  13,  13,  13,  13,  13,  13,  13,  13,
        14,  14,  14,  14,  14,  14,  14,  14,  15,  15,  15,  15,  15,
        15,  15,  15,  16,  16,  16,  16,  16,  16,  16,  16,  17,  17,
        17,  17,  17,  17,  17,  17,  18,  18,  18,  18,  18,  18,  18,
        18,  19,  19,  19,  19,  19,  19,  19,  19,  20,  20,  20,  20,
        20,  20,  20,  20,  21,  21,  21,  21,  21,  21,  21,  21,  22,
        22,  22,  22,  22,  22,  22,  22,  23,  23,  23,  23,  2

In [10]:
# split the data into training and testing sets
# random_state pins the shuffle so the split (and the model saved from it) is the same
# on every Restart & Run All; stratify keeps the per-user proportions equal in the
# training and test partitions so no user is missing from training.
# NOTE(review): stratify needs at least two samples per user - the displayed labels
# show eight per user; confirm this holds for the whole file.
X_train, X_test, y_train, y_test = train_test_split(
    features, class_label, test_size=0.2, random_state=42, stratify=class_label
)
# train an SVM classifier
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

In [11]:
# split the data into training and testing sets
# random_state pins the shuffle so the split is reproducible; stratify keeps the
# per-user proportions equal in the training and test partitions.
# NOTE(review): stratify needs at least two samples per user - the displayed labels
# show eight per user; confirm this holds for the whole file.
X_train, X_test, y_train, y_test = train_test_split(
    features, class_label, test_size=0.2, random_state=42, stratify=class_label
)
# train a Random Forest classifier
# (seeded as well: the forest's bootstrap and feature sampling are random)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

In [12]:
# encode the user IDs as consecutive integers starting at 0 for XGBoost
# The encoder is fitted on the full label array *before* splitting, so y_train and
# y_test share one mapping (previously only y_train was encoded and y_test kept the
# raw user IDs).
le = LabelEncoder()
class_label_encoded = le.fit_transform(class_label)

# split the data into training and testing sets
# random_state makes the split reproducible; stratify keeps every encoded class in
# the training partition, so the training labels stay a gap-free 0..n_classes-1 range.
# NOTE(review): stratify needs at least two samples per user - the displayed labels
# show eight per user; confirm this holds for the whole file.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    class_label_encoded,
    test_size=0.2,
    random_state=42,
    stratify=class_label_encoded,
)

# train an XGBoost classifier
xgb_clf = xgb.XGBClassifier(random_state=42)
xgb_clf.fit(X_train, y_train)

In [13]:
# store the trained models on the local disk (paths are relative to the working directory)
# NOTE: joblib files are pickles - only load them back from a trusted location.
joblib.dump(svm_clf, 'svm_model.pkl')
joblib.dump(rf_clf, 'rf_model.pkl')
# The XGBoost model was trained on label-encoded targets, so it predicts encoder
# indices rather than user IDs. Persist the fitted encoder next to it so predictions
# can be mapped back with le.inverse_transform. It is dumped before the model so the
# cell's displayed result is still the xgb_model.pkl path.
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(xgb_clf, 'xgb_model.pkl')

['xgb_model.pkl']