In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack


In [84]:
# Load mental health dataset
df_text = pd.read_csv("dataset/mental_health.csv.zip")

# Add subject IDs (0, 1, 2, â€¦)
df_text['subject'] = range(len(df_text))

# Load keystroke dataset
df_keys = pd.read_csv("dataset/DSL-StrongPasswordData.csv")

# Overwrite subject column with sequential numbers
df_keys['subject'] = range(len(df_keys))
# Example: add fake labels just for testing
import numpy as np
df_keys['label'] = np.random.randint(0, 2, size=len(df_keys))





In [85]:
# Merge safely: keep text + label from mental health dataset
df = pd.merge(
    df_text[['subject','text','label']],  # only keep subject, text, label
    df_keys, 
    on="subject"
)

print(df.columns)  # check columns



Index(['subject', 'text', 'label_x', 'sessionIndex', 'rep', 'H.period',
       'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i',
       'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five',
       'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o',
       'UD.Shift.r.o', 'H.o', 'DD.o.a', 'UD.o.a', 'H.a', 'DD.a.n', 'UD.a.n',
       'H.n', 'DD.n.l', 'UD.n.l', 'H.l', 'DD.l.Return', 'UD.l.Return',
       'H.Return', 'label_y'],
      dtype='object')


In [86]:
# Keep the real labels from the text dataset
df.rename(columns={'label_x':'label'}, inplace=True)

# Drop the fake labels from keystroke dataset
df.drop(columns=['label_y'], inplace=True)

print(df.columns)  # should now show 'subject', 'text', 'label', plus keystroke features


Index(['subject', 'text', 'label', 'sessionIndex', 'rep', 'H.period',
       'DD.period.t', 'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i',
       'DD.i.e', 'UD.i.e', 'H.e', 'DD.e.five', 'UD.e.five', 'H.five',
       'DD.five.Shift.r', 'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o',
       'UD.Shift.r.o', 'H.o', 'DD.o.a', 'UD.o.a', 'H.a', 'DD.a.n', 'UD.a.n',
       'H.n', 'DD.n.l', 'UD.n.l', 'H.l', 'DD.l.Return', 'UD.l.Return',
       'H.Return'],
      dtype='object')


In [87]:
df["text"] = df["text"].str.lower()

X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer()
X_train_text_vec = vectorizer.fit_transform(X_train_text)
X_test_text_vec = vectorizer.transform(X_test_text)



In [88]:
df['avg_hold'] = df[[c for c in df.columns if c.startswith("H.")]].mean(axis=1)
df['avg_dd'] = df[[c for c in df.columns if c.startswith("DD.")]].mean(axis=1)
df['avg_ud'] = df[[c for c in df.columns if c.startswith("UD.")]].mean(axis=1)
X_keys = df[['avg_hold','avg_dd','avg_ud']].values 
y_keys = df['label'].values
scaler = StandardScaler() 
X_keys_scaled = scaler.fit_transform(X_keys)

In [89]:
X_train_keys, X_test_keys, y_train_keys, y_test_keys = train_test_split( X_keys_scaled, y_keys, test_size=0.2, random_state=42 )

In [90]:
model_keys = LogisticRegression(max_iter=1000) 
model_keys.fit(X_train_keys, y_train_keys) 
pred_keys = model_keys.predict(X_test_keys) 
print("Keystroke Model Accuracy:", accuracy_score(y_test_keys, pred_keys))

Keystroke Model Accuracy: 0.5102941176470588


In [91]:
X_train_combined = hstack([X_train_text_vec, X_train_keys])
X_test_combined = hstack([X_test_text_vec, X_test_keys])


In [92]:

model = LogisticRegression(max_iter=1000)
model.fit(X_train_combined, y_train)

pred = model.predict(X_test_combined)
print("Combined Model Accuracy:", accuracy_score(y_test, pred))



Combined Model Accuracy: 0.9139705882352941


In [93]:

sample_text = ["I feel hopeless and lost"]
sample_text_vec = vectorizer.transform(sample_text)

# Example typing features (replace with real values)
sample_typing = np.array([[2.0, 4.0, 5.0]])  # avg_hold, avg_dd, avg_ud
sample_typing_scaled = scaler.transform(sample_typing)

sample_combined = hstack([sample_text_vec, sample_typing_scaled])
print("Probabilities:", model.predict_proba(sample_combined))




Probabilities: [[0.87465912 0.12534088]]
