In [29]:
import pandas as pd     
# Read the behaviours log. Column names are supplied explicitly via `names`,
# and `timestamp` is parsed into datetimes so it can serve as a time index.
behaviors_data = pd.read_csv(
    "MINDsmall_train/behaviors.tsv",
    sep="\t",
    names=["impressionId", "userId", "timestamp", "click_history", "impressions"],
    parse_dates=["timestamp"],
).set_index("timestamp")

# `df` is the chronologically ordered view used by the rest of the notebook;
# `behaviors_data` keeps the timestamp index but stays in file order.
df = behaviors_data.sort_values("timestamp")

# Slicing cheatsheet for the timestamp index:
#   df["2019-11-09":"2019-11-10"]  -> every row in that date range
#   df.loc["2019-11-14"]           -> a single day (df.loc["2019"] -> the whole year)

df

Unnamed: 0_level_0,impressionId,userId,click_history,impressions
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-11-09 00:00:19,20112,U65916,N51706 N40767 N12096 N9798 N38802 N54827 N5780...,N54300-0 N46057-1 N57005-0 N52154-0 N57099-0 N...
2019-11-09 00:01:13,13807,U49985,N5056 N29975 N53234 N39603 N50032 N8422 N53580...,N20602-0 N50059-0 N57768-1 N50135-1 N15134-0 N...
2019-11-09 00:02:44,27660,U25550,N17260 N38298 N33976 N47719 N14888 N18870 N4607,N50135-0 N15134-0 N52433-1 N20602-0 N64536-0
2019-11-09 00:02:50,152217,U19710,N3530 N48284 N43019 N62546 N138 N13138 N10676 ...,N57099-0 N30295-0 N21086-0 N5379-0 N57005-0 N4...
2019-11-09 00:03:09,42166,U38106,N16874 N264 N48697 N51366,N3491-0 N20602-0 N25785-0 N23575-0 N38783-0 N1...
...,...,...,...,...
2019-11-14 23:58:46,66234,U717,N54822 N46392 N27863 N13138 N40448 N14006,N7494-0 N46917-0 N62197-0 N2960-0 N22978-0 N57...
2019-11-14 23:58:47,36004,U44395,N38488 N11231 N14761 N21164 N42128 N7328 N3750...,N48487-0 N41934-0 N64037-0 N63913-0 N55322-0 N...
2019-11-14 23:58:51,105363,U41595,,N14478-0 N7342-0 N48487-0 N29490-0 N27737-0 N4...
2019-11-14 23:59:06,108433,U75895,N1300 N9803 N14114 N31996,N29490-0 N22975-0 N27737-0 N6837-0 N47652-0 N1...


In [30]:
# Example of slicing between two exact timestamps (kept for reference, not executed):
#df["2019-11-09 00:01:13" : "2019-11-10 05:12:00"]
# First index label of `df`; since `df` was sorted by timestamp, this is the earliest one.
df.index[0]

Timestamp('2019-11-09 00:00:19')

In [33]:
def window_split(df: pd.DataFrame, start_date_time: str, num_of_days: int, num_test_days: int = 1):
    """Split a timestamp-indexed frame into chronological train and test windows.

    The window covers ``num_of_days`` days starting at ``start_date_time``.
    The last ``num_test_days`` days form the test set and the days before
    them form the training set. For example, with ``num_of_days=3`` and
    ``num_test_days=1`` the training set is days 1-2 and the test set is day 3.

    Both windows are half-open, ``[start, split)`` and ``[split, end)``, so a
    row stamped exactly on the split point belongs to the test set only and a
    row stamped exactly on the window end is excluded. Label slicing
    (``df[a:b]``) includes both endpoints on a DatetimeIndex and would put
    such boundary rows in both sets.

    Args:
        df: Frame whose index holds datetimes (e.g. a ``DatetimeIndex``).
        start_date_time: Start of the window; anything ``pd.to_datetime`` accepts.
        num_of_days: Total length of the window in days.
        num_test_days: Number of trailing days assigned to the test set.

    Returns:
        A ``(train_df, test_df)`` tuple of rows from ``df`` in their original order.

    Raises:
        ValueError: If ``num_of_days`` is negative or ``num_test_days`` is not
            within ``0..num_of_days``.
    """
    if num_of_days < 0 or not 0 <= num_test_days <= num_of_days:
        raise ValueError(
            "num_test_days must be between 0 and num_of_days "
            f"(got num_of_days={num_of_days}, num_test_days={num_test_days})"
        )

    window_start = pd.to_datetime(start_date_time)
    split_point = window_start + pd.DateOffset(days=num_of_days - num_test_days)
    window_end = window_start + pd.DateOffset(days=num_of_days)

    # Boolean masks give explicit half-open bounds and do not depend on the
    # index being sorted.
    stamps = df.index
    train_df = df[(stamps >= window_start) & (stamps < split_point)]
    test_df = df[(stamps >= split_point) & (stamps < window_end)]
    return (train_df, test_df)


# 4-day window starting 2019-11-09: the first 2 days become the training set,
# the last 2 days become the test set.
train_df, test_df = window_split(df, "2019-11-09", 4, 2)

In [34]:
# Display the training split returned by window_split.
train_df

Unnamed: 0_level_0,impressionId,userId,click_history,impressions
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-11-09 00:00:19,20112,U65916,N51706 N40767 N12096 N9798 N38802 N54827 N5780...,N54300-0 N46057-1 N57005-0 N52154-0 N57099-0 N...
2019-11-09 00:01:13,13807,U49985,N5056 N29975 N53234 N39603 N50032 N8422 N53580...,N20602-0 N50059-0 N57768-1 N50135-1 N15134-0 N...
2019-11-09 00:02:44,27660,U25550,N17260 N38298 N33976 N47719 N14888 N18870 N4607,N50135-0 N15134-0 N52433-1 N20602-0 N64536-0
2019-11-09 00:02:50,152217,U19710,N3530 N48284 N43019 N62546 N138 N13138 N10676 ...,N57099-0 N30295-0 N21086-0 N5379-0 N57005-0 N4...
2019-11-09 00:03:09,42166,U38106,N16874 N264 N48697 N51366,N3491-0 N20602-0 N25785-0 N23575-0 N38783-0 N1...
...,...,...,...,...
2019-11-10 23:58:22,50300,U67472,N25113,N35729-0 N56193-1
2019-11-10 23:59:20,42104,U73545,N19644 N3395 N46963 N14356 N24492,N27581-0 N36621-0 N48606-0 N15931-0 N30725-0 N...
2019-11-10 23:59:24,18856,U6775,N48304 N13754 N28572 N306 N5905,N29128-0 N17307-0 N28047-0 N7104-0 N45076-0 N5...
2019-11-10 23:59:29,35190,U49532,N8548 N5227 N46911 N41089 N52049 N6233 N3508 N...,N64513-0 N23184-0 N53111-0 N31273-1 N21882-0 N...


In [35]:
# Display the test split returned by window_split.
test_df

Unnamed: 0_level_0,impressionId,userId,click_history,impressions
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-11-11 00:00:10,68864,U62984,N23105 N57399 N13081,N56193-0 N48759-1 N35729-0 N31273-0 N49685-0 N...
2019-11-11 00:00:29,17067,U78814,N30344 N6385 N14538 N7201 N30665 N51957 N16695...,N27581-0 N50060-0 N11830-0 N52446-0 N8855-0 N2...
2019-11-11 00:00:32,52457,U19572,N17981 N10779 N15861 N1954 N26136 N36133 N5015...,N49685-0 N57906-0 N64542-0 N43073-0 N27486-1 N...
2019-11-11 00:01:02,70602,U17960,N10059 N46039 N57582 N64554 N31801 N25761 N242...,N35729-0 N56193-1
2019-11-11 00:01:15,62606,U23479,N56586 N35120 N39374 N21576 N11508 N8569 N5902...,N56193-0 N48759-0 N49685-0 N35729-1
...,...,...,...,...
2019-11-12 23:56:02,131262,U57649,N44833 N5578 N56307 N48596,N63372-0 N62621-0 N62208-0 N12714-0 N55132-0 N...
2019-11-12 23:56:59,87197,U2906,N51591 N19016 N25740 N459 N4834 N21242 N51892 ...,N12731-0 N52535-0 N18887-0 N26025-0 N21925-1 N...
2019-11-12 23:58:09,8713,U30632,N65039 N51076 N52067 N50578 N57888 N60412 N236...,N36436-0 N64152-0 N39317-0 N29822-0 N24157-0 N...
2019-11-12 23:58:37,145679,U11622,N49944 N38963 N46911 N26136 N63248 N37720 N194...,N48019-0 N17031-0 N1940-0 N23942-0 N36184-0 N6...
