In [164]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [165]:
# Load both raw splits (the first CSV column becomes the index) and parse
# the "ts" column into datetimes, which the transformers below access via `.dt`.
train = pd.read_csv('data/train.csv', index_col=0).assign(
    ts=lambda frame: pd.to_datetime(frame["ts"])
)
test = pd.read_csv('data/test.csv', index_col=0).assign(
    ts=lambda frame: pd.to_datetime(frame["ts"])
)

In [166]:
#1st transformation
#Adding dummies for gates
class AddingGatesDummies(BaseEstimator, TransformerMixin):
    """Stateless transformer that one-hot encodes the ``gate_id`` column.

    The output always contains the columns ``gate_-1`` ... ``gate_16``.
    Gates from that range that do not occur in the input keep the
    placeholder value 0; any other gate present in the input still gets
    its own ``gate_<id>`` column.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``gate_<id>`` indicator columns added."""
        frame = X.copy()

        indicators = pd.get_dummies(frame['gate_id'])
        indicators.columns = ["gate_" + label for label in map(str, indicators.columns)]

        # Create every expected column up front so the schema does not
        # depend on which gates happen to be present in this frame...
        for gate in range(-1, 17):
            frame["gate_" + str(gate)] = 0

        # ...then overwrite the placeholders of the gates actually observed.
        frame[indicators.columns] = indicators[indicators.columns]

        return frame

In [167]:
#2nd transformation
#Adding dummies for days of week

class DaysOfWeekDummies(BaseEstimator, TransformerMixin):
    """Stateless transformer that one-hot encodes the weekday of ``ts``.

    Adds the seven columns ``Monday`` ... ``Sunday`` (in that order).
    A weekday that never occurs in the input is still emitted, filled
    with 0, so the output schema is the same for every input frame.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one indicator column per weekday.

        ``X`` must have a datetime-like ``ts`` column.
        """
        X = X.copy()
        days_of_week = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]

        dummies = pd.get_dummies(X['ts'].dt.day_name())
        # Plain ``dummies[days_of_week]`` raises KeyError as soon as one
        # weekday is absent from the data. ``reindex`` selects the same
        # columns in the same order and creates missing ones filled with 0,
        # the same placeholder the gate and hour transformers use.
        X[days_of_week] = dummies.reindex(columns=days_of_week, fill_value=0)

        return X



In [168]:
#3rd transformation
#Time

class TimeFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer deriving calendar features from ``ts``.

    Adds the columns ``hour``, ``minute``, ``day`` and ``month`` plus the
    hour indicators ``h_0`` ... ``h_23``. Hours that do not occur in the
    input keep the placeholder value 0.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the time features added."""
        frame = X.copy()

        # Plain datetime components, inserted in this fixed order.
        for component in ('hour', 'minute', 'day', 'month'):
            frame[component] = getattr(frame['ts'].dt, component)

        # Placeholders for all 24 hours first, so every column exists.
        for hour in range(0, 24):
            frame["h_" + str(hour)] = 0

        # Overwrite the placeholders of the hours actually observed.
        observed = pd.get_dummies(frame['ts'].dt.hour)
        observed.columns = ["h_" + label for label in map(str, observed.columns)]
        frame[observed.columns] = observed[observed.columns]

        return frame



In [169]:
# 4th transformation
# Working and not working days


class WeekendFeatures(BaseEstimator, TransformerMixin):
    """Flag rows that fall on a low-activity calendar date.

    A date is treated as a non-working day ("weekend") when the frame
    being transformed contains fewer than ``min_workday_events`` rows
    for that date.

    Parameters
    ----------
    min_workday_events : int, default=100
        Dates with strictly fewer rows than this get ``is_weekend=True``.
    """

    def __init__(self, min_workday_events=100):
        self.min_workday_events = min_workday_events

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame equal to ``X`` plus a boolean ``is_weekend`` column.

        ``X`` must have a datetime-like ``ts`` column. Because the result
        comes out of ``pd.merge``, it carries a fresh default index.
        """
        # Work on a copy: the temporary "date" column must not leak into
        # the caller's frame (every other transformer here copies as well).
        X = X.copy()

        X["date"] = X["ts"].dt.date
        count = X.groupby("date").size().reset_index(name='count')
        result = pd.merge(X, count, on='date', how='left')

        result["is_weekend"] = result["count"] < self.min_workday_events

        # Drop the helper columns used only to compute the flag.
        return result.drop(["count", "date"], axis=1)

In [170]:
# 5th transformation
# Gate n-grams

class GateNGrams(BaseEstimator, TransformerMixin):
    """Stateless transformer adding range-membership flags for ``gate_id``.

    Adds ``gate_half`` (``gate_id <= 8``) and thirteen ``triplet_<k>``
    columns, ``k = 0 ... 12``, where ``triplet_<k>`` is true when
    ``gate_id`` lies in the inclusive range ``[k - 1, k + 1]``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the boolean gate-range columns added."""
        frame = X.copy()
        gate = frame["gate_id"]

        frame["gate_half"] = gate.le(8)

        # Sliding window of three consecutive gate ids, starting at -1.
        for position, lowest in enumerate(range(-1, 12)):
            frame["triplet_" + str(position)] = gate.between(lowest, lowest + 2)

        return frame



In [171]:
# 6th transformation
# Estimate working, departure and arrival time

class UserTimeMetrics(BaseEstimator, TransformerMixin):
    """Attach per-user, per-day first/last timestamp and their difference.

    ``fit`` groups by (identifier, ``month``, ``day``) and records the
    earliest ``ts`` as ``arrival``, the latest as ``departure`` and their
    difference as ``diff``. ``transform`` left-joins these three columns
    onto the frame.

    Parameters
    ----------
    train : bool
        Truthy -> rows are keyed by ``user_id``; falsy -> by ``user_word``.
    """

    def __init__(self, train):
        # Keep the constructor argument under its own name: sklearn's
        # get_params() / clone() / repr read it back via getattr and fail
        # with AttributeError when it is missing.
        self.train = train
        self.id = "user_id" if train else "user_word"

    def fit(self, X, y=None):
        """Compute the arrival/departure/diff lookup table from ``X``.

        ``X`` needs the identifier column plus ``month``, ``day`` and ``ts``.
        """
        # Select "ts" before aggregating so min/max are computed for that
        # column only instead of for every column of the frame.
        stats = X.groupby([self.id, "month", "day"])["ts"].agg(["min", "max"])
        stats["diff"] = stats["max"] - stats["min"]

        self.stats = stats.rename(columns={"max": "departure", "min": "arrival"})
        return self

    def transform(self, X):
        """Return a new frame: ``X`` left-joined with the fitted statistics."""
        # pd.merge builds a new frame, so the input is left untouched.
        return pd.merge(X, self.stats, how="left", on=[self.id, "month", "day"])

In [172]:
# 7th transformation
# Break the time columns produced by the 6th transformation (diff, arrival, departure) into hour/minute features

class TransformTimeMetrics(BaseEstimator, TransformerMixin):
    """Stateless transformer that splits ``diff``, ``arrival`` and ``departure``.

    Replaces the three columns by ``diff_hours``, ``diff_min`` and, for both
    ``arrival`` and ``departure``, a ``*_time``, ``*_hour`` and ``*_min`` column.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns in place of the originals."""
        frame = X.copy()

        span = frame["diff"].dt
        arrival = frame["arrival"].dt
        departure = frame["departure"].dt

        # Insertion order of this mapping is the column order of the output.
        derived = {
            "diff_hours": span.total_seconds() // 3600,
            "diff_min": span.seconds % 3600 // 60,
            "arrival_time": arrival.time,
            "arrival_hour": arrival.hour,
            "arrival_min": arrival.minute,
            "departure_time": departure.time,
            "departure_hour": departure.hour,
            "departure_min": departure.minute,
        }
        for column, values in derived.items():
            frame[column] = values

        return frame.drop(["diff", "arrival", "departure"], axis=1)

In [173]:
# 8th transformation
# Stats on user metrics

def time_to_seconds(t):
    """Return the number of whole seconds since midnight for ``t``.

    ``t`` is any object exposing ``hour``, ``minute`` and ``second``.
    """
    minutes_since_midnight = t.hour * 60 + t.minute
    return minutes_since_midnight * 60 + t.second

def seconds_to_time(s):
    """Interpret ``s`` as a duration in seconds and convert it with ``pd.to_timedelta``."""
    duration = pd.to_timedelta(s, unit="s")
    return duration

class UserTimeMetricsStats(BaseEstimator, TransformerMixin):
    """Per-user mean and standard deviation of arrival and departure time.

    ``fit`` aggregates the ``arrival_time`` and ``departure_time`` columns
    per identifier. ``transform`` left-joins the four statistics, converts
    them to whole minutes and drops the two source columns.

    Parameters
    ----------
    train : bool
        Truthy -> rows are keyed by ``user_id``; falsy -> by ``user_word``.
    """

    def __init__(self, train):
        # Keep the constructor argument under its own name: sklearn's
        # get_params() / clone() / repr read it back via getattr and fail
        # with AttributeError when it is missing.
        self.train = train
        self.id = "user_id" if train else "user_word"

    def fit(self, X, y=None):
        """Compute the per-identifier statistics table from ``X``."""

        def mean_time(times):
            # Mean of the times of day, returned as a timedelta.
            return seconds_to_time(np.mean([time_to_seconds(t) for t in times]))

        def std_time(times):
            # Standard deviation (np.std) of the times of day, as a timedelta.
            return seconds_to_time(np.std([time_to_seconds(t) for t in times]))

        self.stats = X.groupby([self.id]).agg(
            average_arrival=('arrival_time', mean_time),
            std_arrival=('arrival_time', std_time),
            average_departure=('departure_time', mean_time),
            std_departure=('departure_time', std_time),
        )
        return self

    def transform(self, X):
        """Return a new frame with the four statistics expressed in minutes."""
        # pd.merge builds a new frame, so the input is left untouched.
        X = pd.merge(X, self.stats, how="left", on=[self.id])

        # Timedelta -> whole minutes (seconds component floor-divided by 60).
        for column in ("average_arrival", "std_arrival",
                       "average_departure", "std_departure"):
            X[column] = X[column].dt.seconds // 60

        return X.drop(["arrival_time", "departure_time"], axis=1)

In [174]:
# 9th transformation
# Gates stats

class GatesStats(BaseEstimator, TransformerMixin):
    """Statistics on how many rows (gate events) a user has per day.

    ``fit`` counts rows per (identifier, ``month``, ``day``) and stores the
    per-identifier mean and standard deviation of that count.
    ``transform`` joins ``average_gates`` / ``std_gates`` onto the frame and
    adds ``gates_stats``, the per-day row count of each row's own group.

    Parameters
    ----------
    train : bool
        Truthy -> rows are keyed by ``user_id``; falsy -> by ``user_word``.
    """

    def __init__(self, train):
        # Keep the constructor argument under its own name: sklearn's
        # get_params() / clone() / repr read it back via getattr and fail
        # with AttributeError when it is missing.
        self.train = train
        self.id = "user_id" if train else "user_word"

    def fit(self, X, y=None):
        """Compute the per-identifier mean/std of the daily row count."""
        frame = X.copy()
        frame["gates_today"] = frame.groupby([self.id, "month", "day"])["ts"].transform('count')

        # Select the column before aggregating: mean/std over every column
        # is wasted work and fails on columns without a defined mean/std.
        per_user = frame.groupby([self.id])["gates_today"].agg(["mean", "std"])
        self.stats = per_user.rename(columns={"mean": "average_gates", "std": "std_gates"})

        # Still exposed for code that reads it; transform() no longer uses it.
        self.gates_stats = frame["gates_today"]
        return self

    def transform(self, X):
        """Return a new frame with ``average_gates``, ``std_gates`` and ``gates_stats``."""
        X = pd.merge(X, self.stats, how="left", on=[self.id])

        # Count on the frame being transformed instead of re-attaching the
        # counts memorised in fit(): those only line up, via index
        # alignment, when transform() receives exactly the fitted frame.
        X["gates_stats"] = X.groupby([self.id, "month", "day"])["ts"].transform('count')

        return X

### Make a pipeline

In [175]:
def create_pipeline(train):
    """Assemble the nine preprocessing transformers into one ``Pipeline``.

    ``train`` is passed unchanged to the three user-keyed steps
    (``UserTimeMetrics``, ``UserTimeMetricsStats`` and ``GatesStats``).
    """
    steps = [
        ("Gate_dummies", AddingGatesDummies()),
        ("Handling_day_of_week", DaysOfWeekDummies()),
        ("Handling_time_features", TimeFeatures()),
        ("Handling_weekends", WeekendFeatures()),
        ("Gate_ngrams", GateNGrams()),
        ("User_times_metrics", UserTimeMetrics(train=train)),
        ("Transform_time_metrics", TransformTimeMetrics()),
        ("Stats_on_user_metrics", UserTimeMetricsStats(train=train)),
        ("Gates_stats", GatesStats(train=train)),
    ]
    return Pipeline(steps)

# One pipeline per split: the user-keyed steps of the first use "user_id",
# those of the second use "user_word".
train_pipeline, test_pipeline = create_pipeline(train=True), create_pipeline(train=False)

In [176]:
# Each pipeline is fitted on the same frame it transforms; the user-level
# statistics are therefore computed separately for each split.
# NOTE(review): `train` and `test` are rebound to the processed frames, so
# re-running this cell without re-running the loading cell feeds the
# already-transformed data back into the pipelines.
train = train_pipeline.fit_transform(train)
test = test_pipeline.fit_transform(test)

### Save preprocessed data

In [177]:
# Write the processed frames to CSV; index=False leaves the row index out of the files.
train.to_csv("./data/train_preprocessed.csv", index=False)
test.to_csv("./data/test_preprocessed.csv", index=False)