In [1]:
import os

import joblib
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

from features_flattener import split_sessions, flatten_features, get_top_digraphs,clean_feature_names

In [2]:
# Participant label -> path of that participant's raw keystroke CSV.
# The dict key is what ends up as the class label downstream.
# NOTE(review): the label "gayaa" does not match the file stem "gayatri" — confirm this is intended.
user_files = dict(
    advait="data/advait_keystrokes.csv",
    chetan="data/chetan_keystrokes.csv",
    gayaa="data/gayatri_keystrokes.csv",
    purva="data/purva_keystrokes.csv",
    siddhant="data/siddhant_keystrokes.csv",
)

In [3]:
# Select 20 digraphs (pairs of consecutive keys) across every user's file;
# presumably the most frequent ones — verify against get_top_digraphs.
# The same list is reused for every session so all rows share one feature set.
keystroke_paths = user_files.values()
top_digraphs = get_top_digraphs(keystroke_paths, N=20)
top_digraphs

[('e', 'Key.space'),
 ('t', 'Key.space'),
 ('n', 'Key.space'),
 ('Key.space', 'w'),
 ('Key.space', 's'),
 ('Key.space', 't'),
 ('e', 'r'),
 ('t', 'h'),
 ('d', 'Key.space'),
 ('Key.space', 'a'),
 ('a', 'n'),
 ('r', 'Key.space'),
 ('Key.space', 'o'),
 ('h', 'e'),
 ('i', 'n'),
 ('Key.backspace', 'Key.backspace'),
 ('Key.space', 'h'),
 ('o', 'u'),
 ('y', 'Key.space'),
 ('Key.space', 'm')]

In [4]:
# One flattened feature row per session, labelled with the user it came from.
# Each file is cut into sessions with session_size=100 (unit defined by
# split_sessions — presumably keystroke events; TODO confirm).
rows = [
    flatten_features(session, user_label, top_digraphs)
    for user_label, csv_path in user_files.items()
    for session in split_sessions(csv_path, session_size=100)
]

In [5]:
# Assemble the per-session rows into one table; any missing value becomes 0.
df = pd.DataFrame(rows)
df = df.fillna(0)

# Keep a copy of the processed table on disk. to_csv is called with its
# defaults, so the row index is written as the first (unnamed) column.
df.to_csv("data/processed_data.csv")

# Sanity check: (number of sessions, number of columns including the label).
print(df.shape)
df["label"]

(1059, 84)


0         advait
1         advait
2         advait
3         advait
4         advait
          ...   
1054    siddhant
1055    siddhant
1056    siddhant
1057    siddhant
1058    siddhant
Name: label, Length: 1059, dtype: object

In [6]:
# Separate the feature columns from the target column.
X = clean_feature_names(df.drop(columns=["label"]))
y = df["label"]

# Encode user names as integers: 'advait' -> 0, 'chetan' -> 1, etc.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified hold-out split on the encoded labels (18% test, fixed seed).
# NOTE(review): the following cell rebuilds X, y_encoded and this split; only
# `le` created here is reused there.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.18,
    random_state=42,
    stratify=y_encoded,
)


In [7]:
# Feature matrix as floats (the model is fitted on the raw .values array below)
# and the target column of user names.
X = clean_feature_names(df.drop("label", axis=1)).astype(float)
y = df["label"]

# Create the encoder here so this cell runs on its own instead of relying on
# an `le` left behind by an earlier cell. fit_transform refits from scratch,
# so the resulting encoding is the same either way.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified hold-out split: 18% of sessions for testing, fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.18, random_state=42, stratify=y_encoded
)

# Multi-class gradient-boosted classifier evaluated with multi-class log loss.
model = XGBClassifier(eval_metric='mlogloss')
model.fit(X_train.values, y_train)

# Accuracy on the held-out sessions.
y_pred = model.predict(X_test.values)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9895287958115183


In [8]:
# Persist everything needed to rebuild features and decode predictions later:
# the fitted model, the label encoder, the digraph list used for feature
# extraction, and the feature column order.

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs("models", exist_ok=True)

joblib.dump(model, "models/xgb_keystroke_model.pkl")
joblib.dump(le, "models/label_encoder.pkl")
joblib.dump(top_digraphs, "models/top_digraphs.pkl")

# The model was fitted on a bare array (X_train.values), so the column names
# are saved separately alongside it.
feature_names = list(X.columns)
joblib.dump(feature_names, "models/feature_names.pkl")



['models/feature_names.pkl']