In [1]:
import pandas as pd
import numpy as np

import os
# Current working directory at the time this cell runs; SAVE_PATH is built from it.
cur_path = os.getcwd()

In [2]:
# Raw training CSV; a relative path, so it is resolved against the working directory.
TRAIN_PATH = "train.csv"

# Output directory for the prepared pickles. The trailing slash matters:
# file names are appended to this string by plain concatenation.
SAVE_PATH = f"{cur_path}/prepared_data/"

In [3]:
def createPreparedFeatures(df: pd.DataFrame) -> pd.DataFrame:
    """Sort the event log and add derived per-event features.

    The input frame is not modified; ``sort_values`` returns a new frame and
    all columns are added to that copy.

    Columns read: ``session_id``, ``elapsed_time``, ``name``, ``event_name``,
    ``room_coor_x``, ``room_coor_y``, ``screen_coor_x``, ``screen_coor_y``,
    ``page``, ``hover_duration``.

    Columns added or overwritten:
      * ``full_event_name``: ``name`` and ``event_name`` joined with ``_``.
      * ``absolute_coor_x`` / ``absolute_coor_y``: room + screen coordinate.
      * ``delta_absolute_coor_x`` / ``delta_absolute_coor_y``: change of the
        absolute coordinate versus the previous event of the SAME session
        (0 for the first event of a session and wherever the difference is
        undefined).
      * ``radius_absolute_coor`` / ``tangent_absolute_coor``: Euclidean norm
        and y/x ratio of the absolute coordinates.
      * ``delta_radius_absolute_coor`` / ``delta_tangent_absolute_coor``: the
        same two quantities computed on the deltas.
      * ``page``: missing values become 0, existing values are shifted by +1.
      * ``hover_duration``: missing values become 0.

    Args:
        df: Event-level frame containing the columns listed above.

    Returns:
        A new frame sorted by ``session_id`` then ``elapsed_time`` with the
        derived columns attached.
    """
    df = df.sort_values(by=['session_id', 'elapsed_time'])
    df["full_event_name"] = df["name"].astype("str") + "_" + df["event_name"].astype("str")

    df["absolute_coor_x"] = df["room_coor_x"] + df["screen_coor_x"]
    df["absolute_coor_y"] = df["room_coor_y"] + df["screen_coor_y"]

    # Differences are taken within each session. A plain ``diff`` over the
    # whole sorted frame would subtract the last event of the previous session
    # from the first event of the next one, leaking a meaningless jump.
    deltas = (
        df.groupby("session_id", sort=False)[["absolute_coor_x", "absolute_coor_y"]]
        .diff(1)
        .fillna(0)
    )
    df["delta_absolute_coor_x"] = deltas["absolute_coor_x"]
    df["delta_absolute_coor_y"] = deltas["absolute_coor_y"]

    df["radius_absolute_coor"] = np.sqrt(df["absolute_coor_x"]**2 + df["absolute_coor_y"]**2)
    # NOTE: a zero denominator yields +/-inf (or NaN for 0/0); consumers of the
    # tangent columns have to cope with non-finite values.
    df["tangent_absolute_coor"] = df["absolute_coor_y"]/df["absolute_coor_x"]

    df["delta_radius_absolute_coor"] = np.sqrt(df["delta_absolute_coor_x"]**2 + df["delta_absolute_coor_y"]**2)
    df["delta_tangent_absolute_coor"] = df["delta_absolute_coor_y"]/df["delta_absolute_coor_x"]

    # Shift pages by one so that 0 is free to mean "no page recorded".
    df["page"] = df["page"].fillna(-1) + 1
    df["hover_duration"] = df["hover_duration"].fillna(0)
    return df

In [4]:
# Explicit column dtypes handed to pd.read_csv: narrow numeric types and
# categoricals instead of pandas' default int64/float64/object inference.
dtypes = {
    # identifiers and timing
    "session_id": "int64",
    "index": np.int16,
    "elapsed_time": np.int32,
    # event descriptors
    "event_name": "category",
    "name": "category",
    "level": np.int8,
    # float columns (page and hover_duration are NaN-filled later on)
    "page": np.float32,
    "room_coor_x": np.float32,
    "room_coor_y": np.float32,
    "screen_coor_x": np.float32,
    "screen_coor_y": np.float32,
    "hover_duration": np.float32,
    # text / identifier strings
    "text": "category",
    "fqid": "category",
    "room_fqid": "category",
    "text_fqid": "category",
    # small integer columns
    "fullscreen": np.int8,
    "hq": np.int8,
    "music": np.int8,
    # used to split the prepared data into separate files
    "level_group": "category",
}


# Columns actually loaded from the CSV. The remaining columns declared in
# `dtypes` (index, fqid, room_fqid, text_fqid, fullscreen, hq, music) are
# currently skipped; add them here to load them.
use_col = [
    "session_id",
    "elapsed_time",
    "event_name",
    "name",
    "level",
    "page",
    "room_coor_x",
    "room_coor_y",
    "screen_coor_x",
    "screen_coor_y",
    "hover_duration",
    "text",
    "level_group",
]

# Load only the columns listed in use_col, using the dtypes mapping for parsing.
train_raw = pd.read_csv(TRAIN_PATH, dtype=dtypes, usecols=use_col)
train_raw = createPreparedFeatures(train_raw)

# to_pickle does not create missing directories, so make sure the target exists.
os.makedirs(SAVE_PATH, exist_ok=True)

# Save one pickle per level group after applying createPreparedFeatures.
# The level_group column is dropped because it is constant within each file.
for level_group in ("0-4", "5-12", "13-22"):
    # "0-4" -> "train_0_4.pkl", "5-12" -> "train_5_12.pkl", ...
    group_file = SAVE_PATH + "train_" + level_group.replace("-", "_") + ".pkl"
    (
        train_raw[train_raw["level_group"] == level_group]
        .drop("level_group", axis=1)
        .to_pickle(group_file)
    )
    print(f"Saving train file with level group {level_group} done!")

Saving train file with level group 0-4 done!
Saving train file with level group 5-12 done!
Saving train file with level group 13-22 done!
