In [18]:
# 1. Load Analytics Data
import pandas as pd

df = pd.read_csv("analytics.csv")
df.head()


Unnamed: 0,user_id,challenge_id,event_type,timestamp,metadata
0,user_1,sql-injection,start,2025-05-06 18:42:11.331816,{}
1,user_1,sql-injection,hint,2025-05-06 18:42:11.331816,{'level': 1}
2,user_1,sql-injection,complete,2025-05-06 18:42:11.331816,{}
3,user_1,xss,start,2025-05-06 18:42:11.331841,{}
4,user_1,xss,complete,2025-05-06 18:42:11.331841,{}


In [19]:
# 2. Feature Engineering
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Example: Total hints per challenge
hint_data = df[df['event_type'] == 'hint']
hint_counts = hint_data.groupby(['user_id', 'challenge_id']).size().reset_index(name='hint_count')

# Completion flag
df['completed'] = df['event_type'].apply(lambda x: 1 if x == 'complete' else 0)

# Merge hint count
features = df[df['event_type'] == 'start'][['user_id', 'challenge_id', 'timestamp']].merge(hint_counts, on=['user_id', 'challenge_id'], how='left')
features = features.fillna({'hint_count': 0})


In [20]:
# 3. Create Labels (Difficulty level - mock or mapped externally)
import numpy as np
features['difficulty'] = np.where(features['hint_count'] > 2, 'hard',
                          np.where(features['hint_count'] > 0, 'medium', 'easy'))


In [21]:
# 4. Encode + Train Model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

X = features[['hint_count']]
y = features['difficulty']

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

print("Accuracy:", model.score(X_test, y_test))


Accuracy: 1.0


In [22]:
# 5. Predict on New Behavior
model.predict([[0]])  # Low hint user → likely "easy"
model.predict([[3]])  # High hint → "hard"




array([1])

In [23]:
import joblib

# Save both model and label encoder
joblib.dump(model, "model/challenge_difficulty_model.pkl")
joblib.dump(le, "model/label_encoder.pkl")


['model/label_encoder.pkl']