In [1]:
from pymongo import MongoClient
import pandas as pd

# Open a connection to the MongoDB server on localhost and select the
# database that holds the raw behavior events.
client = MongoClient("mongodb://localhost:27017/")
db = client["behavior_db"]

# Materialise every document of the `events` collection in memory at once.
events = list(db["events"].find())

# One row per event: parse `timestamp` into datetimes, then order the rows
# chronologically so that later cells can walk the log in time order.
df = (
    pd.DataFrame(events)
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .sort_values("timestamp")
)

df.head()


Unnamed: 0,_id,userId,eventType,timestamp,__v
0,6944fecf2caa4f5fe36904bc,u1,START,2025-12-19 07:29:19.344,0
1,694509491e33bc3aecc5cb81,u1,START,2025-12-19 08:14:01.809,0
2,6945094c1e33bc3aecc5cb83,u1,SWITCH,2025-12-19 08:14:04.757,0
3,6945094d1e33bc3aecc5cb85,u1,STOP,2025-12-19 08:14:05.914,0
4,6945094e1e33bc3aecc5cb87,u1,STOP,2025-12-19 08:14:06.830,0


In [2]:
# Split the event log into sessions, one user at a time.
#
# A session is the list of event rows from a START up to and including the
# next STOP *of the same user*. Walking the log per `userId` keeps events of
# different users from being stitched into one session (or from discarding
# each other's open session) when their activity overlaps in time.
#
# Rules kept from the original walk:
#   * a START always opens a fresh session, dropping any unfinished one;
#   * a STOP with no open session is ignored;
#   * any other event is recorded only while a session is open.
sessions = []
current_session = []

# groupby preserves the row order inside each group, so every user's events
# are still in the chronological order established when `df` was sorted.
# dropna=False keeps events with a missing userId in a group of their own
# instead of silently discarding them.
for _, user_events in df.groupby("userId", sort=False, dropna=False):
    # Each user starts with no open session, so a START left dangling by the
    # previous user can never absorb this user's events.
    current_session = []
    for _, row in user_events.iterrows():
        event_type = row["eventType"]
        if event_type == "START":
            current_session = [row]
        elif not current_session:
            # Event outside of any session (e.g. a stray STOP): skip it.
            continue
        else:
            current_session.append(row)
            if event_type == "STOP":
                sessions.append(current_session)
                current_session = []

# Sessions used to be collected in the order their STOP events occurred;
# restore that chronological order across users (the sort is stable).
sessions.sort(key=lambda session: session[-1]["timestamp"])

len(sessions)


16

In [3]:
import numpy as np

def _session_features(session):
    """Return the feature dict for one session.

    `session` is a sequence of event rows; its first and last rows supply the
    start and end timestamps.
    """
    elapsed = session[-1]["timestamp"] - session[0]["timestamp"]
    duration = elapsed.total_seconds() / 60  # seconds -> minutes

    switch_count = len([event for event in session if event["eventType"] == "SWITCH"])

    # Switches per minute; a zero-length session gets a rate of 0 rather than
    # dividing by zero.
    if duration > 0:
        switch_rate = switch_count / duration
    else:
        switch_rate = 0

    # Share of the session's events that are not SWITCH events.
    active_ratio = 1 - (switch_count / len(session))

    return {
        "duration": duration,
        "switch_count": switch_count,
        "switch_rate": switch_rate,
        "active_ratio": active_ratio,
    }


# One feature row per session, in the same order as `sessions`.
feature_rows = [_session_features(session) for session in sessions]

features_df = pd.DataFrame(feature_rows)
features_df.head()


Unnamed: 0,duration,switch_count,switch_rate,active_ratio
0,0.068417,1,14.616322,0.666667
1,0.0301,1,33.222591,0.666667
2,0.129367,1,7.729967,0.666667
3,0.034417,1,29.05569,0.666667
4,0.1749,0,0.0,1.0


In [4]:
# Sessions whose switch rate (switches per minute of session duration) is
# strictly above this value are labelled 1, all others 0.
SWITCH_THRESHOLD = 0.25

exceeds_threshold = features_df["switch_rate"] > SWITCH_THRESHOLD
features_df["label"] = exceeds_threshold.astype(int)

features_df.head()


Unnamed: 0,duration,switch_count,switch_rate,active_ratio,label
0,0.068417,1,14.616322,0.666667,1
1,0.0301,1,33.222591,0.666667,1
2,0.129367,1,7.729967,0.666667,1
3,0.034417,1,29.05569,0.666667,1
4,0.1749,0,0.0,1.0,0


In [5]:
# Only the last expression of a cell is rendered automatically, so the summary
# statistics must be displayed explicitly; otherwise they are computed and
# thrown away. `display` is provided by the IPython/Jupyter kernel.
display(features_df.describe())

# Class balance of the derived label.
features_df["label"].value_counts()


label
1    9
0    7
Name: count, dtype: int64

In [6]:
# Persist the per-session feature table as CSV; the row index is left out.
# NOTE(review): the file name suggests 1000 sessions, but the recorded run of
# this notebook produced 16 — confirm the name is still what consumers expect.
output_csv = "session_data_1000.csv"
features_df.to_csv(output_csv, index=False)
