# Creating a Holdout Data Set for the Nairobi Ambulance Challenge

## Importing packages

In [1]:
import pandas as pd
import holidays

## Reading data

In [2]:
df = pd.read_csv('../Inputs/Train.csv', parse_dates=['datetime'])
print(df.shape)
df.head()

(6318, 4)


Unnamed: 0,uid,datetime,latitude,longitude
0,1,2018-01-01 00:25:46,-1.18885,36.931382
1,2,2018-01-01 02:02:39,-0.662939,37.20873
2,3,2018-01-01 02:31:49,-0.662939,37.20873
3,4,2018-01-01 03:04:01,-1.288087,36.826583
4,5,2018-01-01 03:58:49,-1.18885,36.931382


## Feature engineering

In [3]:
def create_accident_features(data):
    dict_months = {1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
               7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"}
    
    data["date"] = data["datetime"].apply(lambda x: x.date())
    data["holiday"] = data["date"].apply(lambda x: 1 if (x in holidays.Kenya()) else 0)
    data["time"] = data["datetime"].apply(lambda x: x.time())
    data["day"] = data["datetime"].apply(lambda x: x.day)
    data["weekday"] = data["datetime"].apply(lambda x: x.weekday())
    data["weekend"] = data["weekday"].apply(lambda x: 1 if x>4 else 0)
    data["month"] = data["datetime"].apply(lambda x: dict_months.get(x.month))
    data["half_year"] = data["datetime"].apply(lambda x: 1 if x.month<7 else 2)
    data["year"] = data["datetime"].apply(lambda x: x.year)
    
    return data

In [4]:
df = create_accident_features(df)
df.head()

Unnamed: 0,uid,datetime,latitude,longitude,date,holiday,time,day,weekday,weekend,month,half_year,year
0,1,2018-01-01 00:25:46,-1.18885,36.931382,2018-01-01,1,00:25:46,1,0,0,Jan,1,2018
1,2,2018-01-01 02:02:39,-0.662939,37.20873,2018-01-01,1,02:02:39,1,0,0,Jan,1,2018
2,3,2018-01-01 02:31:49,-0.662939,37.20873,2018-01-01,1,02:31:49,1,0,0,Jan,1,2018
3,4,2018-01-01 03:04:01,-1.288087,36.826583,2018-01-01,1,03:04:01,1,0,0,Jan,1,2018
4,5,2018-01-01 03:58:49,-1.18885,36.931382,2018-01-01,1,03:58:49,1,0,0,Jan,1,2018


## Splitting off a holdout set

In [5]:
def split_accident_data(data, strategy, test_size=0.3, random_state=42):
    '''
    Splits the data set into a train and a test set.
    strategy:
        random = splits off random indices, using test_size and random_state parameters
        year_2019 = splits the days of 2019 off into a test set
        percentage_month = splits off the last days of every month to the test set according to the test_size
        2nd_half_2018 = oversamples the months from July to December 2018 by about 33%
    '''

    if strategy == "random":
        data = data.sample(frac=1, random_state=random_state).reset_index().drop("index", axis=1)
        split_at = round(data.shape[0] * test_size)
        data_train = data.iloc[split_at:, :]
        data_test = data.iloc[:split_at, :]
    elif strategy == "year_2019":
        data_train = data[data["datetime"] < "2019-01-01"]
        data_test = data[data["datetime"] >= "2019-01-01"]
    elif strategy == "percentage_month":
        split_at = round(30 * (1-test_size))
        data_train = data[data["day"] <= split_at]
        data_test = data[data["day"] > split_at]
    elif strategy == "2nd_half_2018":
        train_samples = round(data.shape[0] * (1-test_size))
        test_samples = round(data.shape[0] * test_size)
        data_train = data.sample(n=train_samples, weights="half_year", random_state=random_state)
        data_test = data.sample(n=test_samples, weights="half_year", random_state=random_state)
        
    return data_train, data_test

In [6]:
# Split accident data
df_train, df_test = split_accident_data(data=df, strategy="2nd_half_2018", test_size=0.3)

print("Train:")
print(df_train.head(3)["datetime"])
print(df_train.tail(3)["datetime"])
print(df_train.shape)

print("Test:")
print(df_test.head(3)["datetime"])
print(df_test.tail(3)["datetime"])
print(df_test.shape)

Train:
2760   2018-07-28 04:19:24
5908   2019-05-14 17:16:37
4244   2018-12-14 10:28:09
Name: datetime, dtype: datetime64[ns]
1388   2018-04-20 20:22:28
4659   2019-01-31 08:04:39
5573   2019-04-14 08:35:37
Name: datetime, dtype: datetime64[ns]
(4423, 13)
Test:
2760   2018-07-28 04:19:24
5908   2019-05-14 17:16:37
4244   2018-12-14 10:28:09
Name: datetime, dtype: datetime64[ns]
4142   2018-12-07 17:36:09
513    2018-02-07 10:38:09
1318   2018-04-13 06:43:58
Name: datetime, dtype: datetime64[ns]
(1895, 13)


In [7]:
print(df_train["half_year"].value_counts())
print(df_test["half_year"].value_counts())

1    2719
2    1704
Name: half_year, dtype: int64
1    1069
2     826
Name: half_year, dtype: int64
