Skip to content

Commit

Permalink
Convert ids/timestamps to int + add utils/tests
Browse files Browse the repository at this point in the history
  • Loading branch information
BenoitChoffin committed Aug 7, 2019
1 parent aebc929 commit a89fa46
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 2 deletions.
8 changes: 6 additions & 2 deletions prepare_data.py
Expand Up @@ -27,7 +27,8 @@ def prepare_assistments12(min_interactions_per_user, remove_nan_skills):
df["timestamp"] = df["start_time"]
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["timestamp"] = df["timestamp"] - df["timestamp"].min()
df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24))
#df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24))
df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64)
df.sort_values(by="timestamp", inplace=True)
df.reset_index(inplace=True, drop=True)
df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user)
Expand All @@ -52,6 +53,7 @@ def prepare_assistments12(min_interactions_per_user, remove_nan_skills):

df = df[['user_id', 'item_id', 'timestamp', 'correct', "inter_id"]]
df = df[df.correct.isin([0,1])] # Remove potential continuous outcomes
df['correct'] = df['correct'].astype(np.int32) # Cast outcome as int32

# Save data
sparse.save_npz("data/assistments12/q_mat.npz", sparse.csr_matrix(Q_mat))
Expand Down Expand Up @@ -85,7 +87,8 @@ def prepare_kddcup10(data_name, min_interactions_per_user, kc_col_name,
})[['user_id', 'pb_id', 'step_id' ,'correct', 'timestamp', 'kc_id']]
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["timestamp"] = df["timestamp"] - df["timestamp"].min()
df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24))
#df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24))
df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64)
df.sort_values(by="timestamp",inplace=True)
df.reset_index(inplace=True,drop=True)
df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user)
Expand Down Expand Up @@ -132,6 +135,7 @@ def prepare_kddcup10(data_name, min_interactions_per_user, kc_col_name,

df = df[['user_id', 'item_id', 'timestamp', 'correct', 'inter_id']]
df = df[df.correct.isin([0,1])] # Remove potential continuous outcomes
df['correct'] = df['correct'].astype(np.int32) # Cast outcome as int32

# Save data
sparse.save_npz(folder_path + "/q_mat.npz", sparse.csr_matrix(Q_mat))
Expand Down
Empty file added tests/__init__.py
Empty file.
29 changes: 29 additions & 0 deletions tests/test_this_queue.py
@@ -0,0 +1,29 @@
import unittest
from utils.this_queue import OurQueue

# From JJ's ktm repo: https://github.com/jilljenn/ktm

class TestOurQueue(unittest.TestCase):
    """Check OurQueue's per-window event counters on hand-built timelines."""

    def test_simple(self):
        """Four events up to day 40; only the last one is in any window."""
        events = (0, 0.8 * 3600 * 24, 5 * 3600 * 24, 40 * 3600 * 24)
        q = OurQueue()
        for stamp in events:
            q.push(stamp)
        # Expected layout: [total count, then one count per window].
        expected = [4, 1, 1, 1, 1]
        self.assertEqual(q.get_counters(40 * 3600 * 24), expected)

    def test_complex(self):
        """Eleven events; only the last two are within any window of the query."""
        # NOTE(review): 3600 * 24 * 7 * 30 is 210 days, not 30 days --
        # confirm this is the intended last timestamp.
        last = 3600 * 24 * 7 * 30
        events = (
            0,
            10,
            3599,
            3600,
            3601,
            3600 * 24,
            3600 * 24 + 1,
            3600 * 24 * 7,
            3600 * 24 * 7 + 1,
            last,
            last + 1,
        )
        q = OurQueue()
        for stamp in events:
            q.push(stamp)
        expected = [11, 2, 2, 2, 2]
        self.assertEqual(q.get_counters(last + 1), expected)
30 changes: 30 additions & 0 deletions utils/this_queue.py
@@ -0,0 +1,30 @@
class OurQueue:
    """
    A queue for counting efficiently the number of events within time windows.

    Event timestamps are appended with push() and counted with
    get_counters(t). An event at time s is counted in a window of length w
    when t - s < w.

    One cursor per window holds the index of the oldest event still inside
    that window. Cursors only ever move forward, so events are expected to
    be pushed in non-decreasing time order and get_counters() to be called
    with non-decreasing t; otherwise the counts may be wrong.

    Complexity:
        All operators in amortized O(W) time where W is the number of windows.
    From JJ's KTM repository: https://github.com/jilljenn/ktm.
    """

    # Default window lengths, in the same unit as the pushed timestamps.
    # With timestamps in seconds these are 30 days, 7 days, 1 day and 1 hour.
    DEFAULT_WINDOW_LENGTHS = (3600 * 24 * 30, 3600 * 24 * 7, 3600 * 24, 3600)

    def __init__(self, window_lengths=None):
        """
        Create an empty queue.

        Args:
            window_lengths: optional iterable of window lengths, expressed in
                the same unit as the timestamps. Defaults to
                DEFAULT_WINDOW_LENGTHS when omitted.
        """
        if window_lengths is None:
            window_lengths = self.DEFAULT_WINDOW_LENGTHS
        self.queue = []
        # Copy into a list so later changes to the caller's object cannot
        # desynchronize the windows from their cursors.
        self.window_lengths = list(window_lengths)
        self.cursors = [0] * len(self.window_lengths)

    def __len__(self):
        """Return the total number of events pushed so far."""
        return len(self.queue)

    def get_counters(self, t):
        """
        Return the event counts as seen at time t.

        The result is a list whose first element is the total number of
        events, followed by one count per window, in the order of
        self.window_lengths.
        """
        self.update_cursors(t)
        total = len(self.queue)
        return [total] + [total - cursor for cursor in self.cursors]

    def push(self, time):
        """Record an event that happened at the given time."""
        self.queue.append(time)

    def update_cursors(self, t):
        """Advance each window's cursor past the events that left it at time t."""
        for pos, length in enumerate(self.window_lengths):
            # An event is outside the window once it is at least `length` old.
            while (self.cursors[pos] < len(self.queue) and
                   t - self.queue[self.cursors[pos]] >= length):
                self.cursors[pos] += 1

0 comments on commit a89fa46

Please sign in to comment.