In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Input files of the RedHat Kaggle data set.
PEOPLE_FILE_PATH = '../people.csv'          # people.csv
ACTIVITIES_FILE_PATH = '../act_train.csv'   # act_train.csv (training activities)
TEST_DATA_FILE_PATH = '../act_test.csv'     # act_test.csv (test activities)

# Switch: when True the features are one-hot encoded.
ONE_HOT_KEY = True

# Columns that are not used as features.
# TODO: 'date' and 'people_date' are really features; they are listed here only
# because the preprocessing for dates has not been written yet.
NON_FEATURE = [
    'people_people_id',
    'people_id',
    'activity_id',
    'outcome',
    'people_date',
    'date',
]

In [3]:
# Function to change labels of feature columns to an encoding value e.g. Type 1, Type 2, Type 3 (String) -> 1, 2 ,3 (int)
# This is required as scikit only takes integer values
# By default all 'id' columns are not regarded as features and not encoded. (identity_column)
def category_to_label_encoding(dataset,identity_columns=['people_people_id','people_id','activity_id']):
    """Replace "<word> <number>" labels with their integer number, in place.

    Every object-dtype column that is not listed in ``identity_columns`` is
    rewritten so that each value becomes the second space-separated token of
    its string form, cast to ``np.int32`` (e.g. "type 7" -> 7). Columns of any
    other dtype (bool, numeric, datetime, ...) are left untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to encode. It is modified in place.
    identity_columns : list of str
        Column names to skip (the id columns by default).

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object that was passed in.

    Raises
    ------
    IndexError
        If a value in an encoded column has no second space-separated token.
    ValueError
        If that second token is not an integer literal.
    """
    def _second_token(value):
        # "type 7" -> "7"
        return str(value).split(' ')[1]

    for name in dataset.columns:
        if name in identity_columns:
            continue
        # Only object (string) columns carry "type N" labels.
        if dataset[name].dtype == 'O':
            dataset[name] = dataset[name].apply(_second_token).astype(np.int32)
    return dataset

In [4]:
# Function to change category labels to a one-hot encoding using scikit's OneHotEncoder
# pd.get_dummies(df) does the same and also provides nice column headers, but it is not fast enough and uses too much memory
def category_to_one_hot(dataset,non_feature=NON_FEATURE):
    """One-hot encode the categorical feature columns of ``dataset``.

    The columns named in ``non_feature`` are dropped first. Of the remaining
    columns, boolean and ``datetime64[ns]`` ones are not one-hot encoded;
    every other column is handed to the encoder as a categorical feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding feature and non-feature columns. It is not modified.
    non_feature : list of str
        Column names to drop before encoding. Defaults to ``NON_FEATURE``.
        Every name must exist in ``dataset`` (``DataFrame.drop`` raises
        ``KeyError`` otherwise).

    Returns
    -------
    encoded_arr
        Dense array obtained from ``fit_transform(...).toarray()``.
    grd_enc : OneHotEncoder
        The fitted encoder.
    """
    # Honour the caller's ``non_feature`` list; the global NON_FEATURE used to be
    # applied unconditionally, which made the parameter a no-op.
    features = dataset.drop(non_feature, axis=1)

    # Positions (within ``features``) of the columns to one-hot encode:
    # everything except bool and datetime columns.
    categorical_positions = [
        position
        for position, dtype in enumerate(features.dtypes)
        if not (dtype == 'bool' or dtype == '<M8[ns]')
    ]

    # NOTE(review): ``categorical_features`` is a legacy OneHotEncoder argument
    # that newer scikit-learn releases no longer accept - confirm the pinned version.
    grd_enc = OneHotEncoder(categorical_features=categorical_positions)
    # .toarray() materialises the encoder output as a dense array.
    encoded_arr = grd_enc.fit_transform(features).toarray()
    return encoded_arr,grd_enc

In [5]:
# Load the raw tables; the "date" column of each file is parsed into datetimes.
# For people.csv the literal strings "True"/"False" are also read as booleans.
people_df = pd.read_csv(
    PEOPLE_FILE_PATH,
    parse_dates=["date"],
    true_values=["True"],
    false_values=["False"],
)
activity_df = pd.read_csv(ACTIVITIES_FILE_PATH, parse_dates=["date"])
test_df = pd.read_csv(TEST_DATA_FILE_PATH, parse_dates=["date"])

In [6]:
# Missing values get their own placeholder category, "type 0", because scikit
# needs numeric data and a "type N" label can later be turned into the number N.
for _frame in (people_df, activity_df, test_df):
    _frame.fillna("type 0", inplace=True)

In [7]:
# Prefix every people_df column with "people_" so that its column names do not
# collide with the same-named columns of the other data frames.
people_df = people_df.add_prefix("people_")

In [8]:
# Attach the people attributes to each activity row. how='right' keeps every row
# of the right-hand (activity / test) frame.
train_dataset = pd.merge(people_df, activity_df, how='right', left_on='people_people_id', right_on="people_id")
# The test set has to be built from test_df. It used to be merged from activity_df,
# which silently produced a second copy of the training data.
test_dataset = pd.merge(people_df, test_df, how='right', left_on='people_people_id', right_on="people_id")

In [9]:
# Free memory: drop the names of the three raw frames in one statement.
del people_df, activity_df, test_df

# A custom encoding is used instead of scikit's LabelEncoder because it is
# beneficial to keep "type 7" -> 7. LabelEncoder might assign 6 to "type 7",
# depending on the order in which labels appear; sorting might help, but some
# type values are missing altogether.
# DOUBT: not sure if continuous encoding values are a requirement?
train_dataset = category_to_label_encoding(train_dataset)
# Encode the test frame itself. Passing train_dataset here (as before) replaced
# test_dataset with a reference to the already-encoded training frame.
test_dataset = category_to_label_encoding(test_dataset)

In [11]:
# Build the feature array for train_dataset; the same can be done for
# test_dataset if needed.
dataset = train_dataset
if ONE_HOT_KEY:
    encoded_array, grd_enc = category_to_one_hot(dataset)
    print(encoded_array)
    # TODO: NOT SURE HOW WE FIND OUT THE LABEL NAMES OR IF THEY EVEN MATTER
    print(grd_enc.active_features_)
    print(grd_enc.n_values_)
    print(grd_enc.feature_indices_)
else:
    # Drop the non-feature columns once and reuse the result.
    feature_frame = dataset.drop(NON_FEATURE, axis=1)
    numeric_headers = list(feature_frame.columns.values)
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer provides.
    numpy_array = feature_frame.values
    print(numpy_array)
    

[[ 0.  1.  0. ...,  1.  1.  0.]
 [ 0.  1.  0. ...,  1.  1.  0.]
 [ 0.  1.  0. ...,  1.  1.  0.]
 ..., 
 [ 0.  1.  0. ...,  1.  1.  1.]
 [ 0.  1.  0. ...,  1.  1.  1.]
 [ 0.  1.  0. ...,  1.  1.  1.]]
[    1     2     4 ..., 61124 61128 61132]
[    3 51463     4    45    26    10     8    26     9    10   101     8
    53    33    12     8     8     6     9    19    20  9252]
[    0     3 51466 51470 51515 51541 51551 51559 51585 51594 51604 51705
 51713 51766 51799 51811 51819 51827 51833 51842 51861 51881 61133]


In [10]:
# Show the training frame as the cell's rich output.
train_dataset

Unnamed: 0,people_people_id,people_char_1,people_group_1,people_char_2,people_date,people_char_3,people_char_4,people_char_5,people_char_6,people_char_7,...,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,0,0,0,0,0,0,0,0,76,0
1,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,0,0,0,0,0,0,0,0,1,0
2,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,0,0,0,0,0,0,0,0,1,0
3,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,0,0,0,0,0,0,0,0,1,0
4,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,0,0,0,0,0,0,0,0,1,0
5,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,0,0,0,0,0,0,0,0,1727,0
6,ppl_100002,2,8688,3,2021-01-06,28,9,5,3,11,...,0,0,0,0,0,0,0,0,1,1
7,ppl_100002,2,8688,3,2021-01-06,28,9,5,3,11,...,0,0,0,0,0,0,0,0,1,1
8,ppl_100003,2,33592,3,2022-06-10,4,8,5,2,5,...,0,0,0,0,0,0,0,0,1,1
9,ppl_100003,2,33592,3,2022-06-10,4,8,5,2,5,...,0,0,0,0,0,0,0,0,1,1


In [15]:
# TODO: Add code to convert each date into separate Day, Month and Year columns
# There are two date columns: 'date' (from act_train.csv) and 'people_date' (from people.csv)
# Some Kaggle participants also analysed whether the date falls on a weekend, so an extra Weekend column with values 1/0 (True/False) could be added