From a89fa46719784cc3d28e8e9365a0f0b274e22322 Mon Sep 17 00:00:00 2001 From: BenoitChoffin Date: Wed, 7 Aug 2019 16:20:07 +0200 Subject: [PATCH] Convert ids/timestamps to int + add utils/tests --- prepare_data.py | 8 ++++++-- tests/__init__.py | 0 tests/test_this_queue.py | 29 +++++++++++++++++++++++++++++ utils/this_queue.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_this_queue.py create mode 100644 utils/this_queue.py diff --git a/prepare_data.py b/prepare_data.py index 1534d81..29a81c0 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -27,7 +27,8 @@ def prepare_assistments12(min_interactions_per_user, remove_nan_skills): df["timestamp"] = df["start_time"] df["timestamp"] = pd.to_datetime(df["timestamp"]) df["timestamp"] = df["timestamp"] - df["timestamp"].min() - df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24)) + #df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24)) + df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64) df.sort_values(by="timestamp", inplace=True) df.reset_index(inplace=True, drop=True) df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user) @@ -52,6 +53,7 @@ def prepare_assistments12(min_interactions_per_user, remove_nan_skills): df = df[['user_id', 'item_id', 'timestamp', 'correct', "inter_id"]] df = df[df.correct.isin([0,1])] # Remove potential continuous outcomes + df['correct'] = df['correct'].astype(np.int32) # Cast outcome as int32 # Save data sparse.save_npz("data/assistments12/q_mat.npz", sparse.csr_matrix(Q_mat)) @@ -85,7 +87,8 @@ def prepare_kddcup10(data_name, min_interactions_per_user, kc_col_name, })[['user_id', 'pb_id', 'step_id' ,'correct', 'timestamp', 'kc_id']] df["timestamp"] = pd.to_datetime(df["timestamp"]) df["timestamp"] = df["timestamp"] - df["timestamp"].min() - df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24)) + #df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds() / (3600*24)) + df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64) df.sort_values(by="timestamp",inplace=True) df.reset_index(inplace=True,drop=True) df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user) @@ -132,6 +135,7 @@ def prepare_kddcup10(data_name, min_interactions_per_user, kc_col_name, df = df[['user_id', 'item_id', 'timestamp', 'correct', 'inter_id']] df = df[df.correct.isin([0,1])] # Remove potential continuous outcomes + df['correct'] = df['correct'].astype(np.int32) # Cast outcome as int32 # Save data sparse.save_npz(folder_path + "/q_mat.npz", sparse.csr_matrix(Q_mat)) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_this_queue.py b/tests/test_this_queue.py new file mode 100644 index 0000000..baec96c --- /dev/null +++ b/tests/test_this_queue.py @@ -0,0 +1,29 @@ +import unittest +from utils.this_queue import OurQueue + +# From JJ's ktm repo: https://github.com/jilljenn/ktm + +class TestOurQueue(unittest.TestCase): + + def test_simple(self): + q = OurQueue() + q.push(0) + q.push(0.8 * 3600 * 24) + q.push(5 * 3600 * 24) + q.push(40 * 3600 * 24) + self.assertEqual(q.get_counters(40 * 3600 * 24), [4, 1, 1, 1, 1]) + + def test_complex(self): + q = OurQueue() + q.push(0) + q.push(10) + q.push(3599) + q.push(3600) + q.push(3601) + q.push(3600 * 24) + q.push(3600 * 24 + 1) + q.push(3600 * 24 * 7) + q.push(3600 * 24 * 7 + 1) + q.push(3600 * 24 * 7 * 30) + q.push(3600 * 24 * 7 * 30 + 1) + self.assertEqual(q.get_counters(3600 * 24 * 7 * 30 + 1), [11, 2, 2, 2, 2]) diff --git a/utils/this_queue.py b/utils/this_queue.py new file mode 100644 index 0000000..125f4b7 --- /dev/null +++ b/utils/this_queue.py @@ -0,0 +1,30 @@ +class OurQueue: + """ + A queue for counting efficiently the number of events within time windows. + Complexity: + All operators in amortized O(W) time where W is the number of windows. + + From JJ's KTM repository: https://github.com/jilljenn/ktm. + """ + def __init__(self): + #self.now = None + self.queue = [] + self.window_lengths = [3600 * 24 * 30, 3600 * 24 * 7, 3600 * 24, 3600] + self.cursors = [0] * len(self.window_lengths) + + def __len__(self): + return len(self.queue) + + def get_counters(self, t): + self.update_cursors(t) + return [len(self.queue)] + [len(self.queue) - cursor + for cursor in self.cursors] + + def push(self, time): + self.queue.append(time) + + def update_cursors(self, t): + for pos, length in enumerate(self.window_lengths): + while (self.cursors[pos] < len(self.queue) and + t - self.queue[self.cursors[pos]] >= length): + self.cursors[pos] += 1