In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [32]:
# Path to people.csv from ReadHatKaggle data set
PEOPLE_FILE_PATH='Data/people.csv'
# Path to act_train.csv from RedHatKaggle data set
ACTIVITIES_FILE_PATH='Data/act_train.csv'
# Path to test.csv from RedHatKaggle data set
TEST_DATA_FILE_PATH='Data/act_test.csv'

# For enabling one hot key encoding
ONE_HOT_KEY = True

# Columns that do not represent features
# TODO: date is not a NON-FEATURE, the preprocessing for date has not been done
NON_FEATURE=['people_people_id','people_id','activity_id','outcome','people_date','date']

In [33]:
# Function to change labels of feature columns to an encoding value e.g. Type 1, Type 2, Type 3 (String) -> 1, 2 ,3 (int)
# This is required as scikit only takes integer values
# By default all 'id' columns are not regarded as features and not encoded. (identity_column)
def category_to_label_encoding(dataset,identity_columns=['people_people_id','people_id','activity_id']):
    for column in dataset.columns:
        if column not in identity_columns:
            if (dataset[column].dtype == 'O'):
                dataset[column]=dataset[column].apply(lambda x: str(x).split(' ')[1]).astype(np.int32)
              # This converts bool to int, but scikit treats boolean as int
#           elif dataset[column].dtype == 'bool':
#                 le=LabelEncoder()
#                 le.fit(['True','False'])
#                 dataset[column]=le.transform(dataset[column])
    return dataset

In [34]:
# Function to change labels of categories to one-hot encoding using scikit's OneHot Encoding
# pd.get_dummies(df) does the same, provides sweet header's as well but it it not fast enough, kill's memory
def category_to_one_hot(dataset,non_feature=NON_FEATURE):
    boolean_column = []
#     column_names = []
    counter=0
    for column in dataset.columns:
        if column not in NON_FEATURE:
            if dataset[column].dtype == 'bool':
                counter += 1
                continue
            if dataset[column].dtype == '<M8[ns]':
                counter += 1
                continue
            boolean_column.append(counter)
#             column_names.append(column)
            counter += 1
    ds = dataset.drop(NON_FEATURE, axis=1)
    grd_enc = OneHotEncoder(categorical_features=boolean_column)
    encoded_arr=grd_enc.fit_transform(ds).toarray()
    return encoded_arr,grd_enc

In [35]:
# Read the data set's
people_df=pd.read_csv(PEOPLE_FILE_PATH,parse_dates=["date"],true_values=["True"],false_values=["False"])
activity_df=pd.read_csv(ACTIVITIES_FILE_PATH, parse_dates=["date"])
test_df=pd.read_csv(TEST_DATA_FILE_PATH,parse_dates=["date"])

In [20]:
people_df

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,...,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,...,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,...,False,False,True,False,False,False,True,True,False,84
5,ppl_10001,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,...,False,True,True,True,True,True,True,True,True,90
6,ppl_100010,type 2,group 17304,type 2,2022-09-01,type 8,type 7,type 8,type 1,type 7,...,False,False,False,False,False,False,False,False,False,2
7,ppl_100013,type 2,group 4204,type 3,2023-01-24,type 4,type 8,type 4,type 1,type 7,...,False,False,True,True,True,True,False,True,True,91
8,ppl_100019,type 2,group 45749,type 3,2023-03-26,type 40,type 25,type 9,type 3,type 9,...,False,False,False,False,False,False,False,False,False,84
9,ppl_100025,type 2,group 36096,type 3,2022-08-26,type 14,type 6,type 8,type 3,type 9,...,False,False,False,False,False,False,False,False,False,76


In [21]:
activity_df

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,,,,type 1,0
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,,,,type 1,0
5,ppl_100,act2_898576,2023-08-04,type 4,,,,,,,,,,type 1727,0
6,ppl_100002,act2_1233489,2022-11-23,type 2,,,,,,,,,,type 1,1
7,ppl_100002,act2_1623405,2022-11-23,type 2,,,,,,,,,,type 1,1
8,ppl_100003,act2_1111598,2023-02-07,type 2,,,,,,,,,,type 1,1
9,ppl_100003,act2_1177453,2023-06-28,type 2,,,,,,,,,,type 1,1


In [22]:
test_df

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
0,ppl_100004,act1_249281,2022-07-20,type 1,type 5,type 10,type 5,type 1,type 6,type 1,type 1,type 7,type 4,
1,ppl_100004,act2_230855,2022-07-20,type 5,,,,,,,,,,type 682
2,ppl_10001,act1_240724,2022-10-14,type 1,type 12,type 1,type 5,type 4,type 6,type 1,type 1,type 13,type 10,
3,ppl_10001,act1_83552,2022-11-27,type 1,type 20,type 10,type 5,type 4,type 6,type 1,type 1,type 5,type 5,
4,ppl_10001,act2_1043301,2022-10-15,type 5,,,,,,,,,,type 3015
5,ppl_10001,act2_112890,2022-11-27,type 5,,,,,,,,,,type 4987
6,ppl_10001,act2_1169930,2022-10-15,type 5,,,,,,,,,,type 3015
7,ppl_10001,act2_1924448,2022-10-15,type 5,,,,,,,,,,type 3015
8,ppl_10001,act2_1953554,2022-10-15,type 5,,,,,,,,,,type 3015
9,ppl_10001,act2_1971739,2022-11-28,type 5,,,,,,,,,,type 3015


In [36]:
# Introduce a category for null values called category 0 since scikit needs numeric data
people_df.fillna("type 0", inplace=True)
activity_df.fillna("type 0", inplace=True)
test_df.fillna("type 0", inplace=True)

In [37]:
days = {0:'Mon',1:'Tues',2:'Weds',3:'Thurs',4:'Fri',5:'Sat',6:'Sun'}
months = {0: 'Jan', 1: 'Feb', 2: 'Mar', 3: "Apr", 4: "May", 5: "Jun", 6: "Jul", 7: "Aug", 8: "Sep", 9: "Oct", 10: "Nov", 11: "Dec"}

In [38]:
people_df['days'] = people_df['date'].dt.dayofweek.apply(lambda x: days[x])
people_df['month'] = people_df['date'].dt.month.apply(lambda x: months[x-1])
people_df['quarter'] = people_df['date'].dt.quarter
people_df['year'] = people_df['date'].dt.year
people_df['week'] = people_df['date'].dt.week
people_df['dayOfYear'] = people_df['date'].dt.day
#del people_df['date']

test_df['days'] = test_df['date'].dt.dayofweek.apply(lambda x: days[x])
test_df['month'] = test_df['date'].dt.month.apply(lambda x: months[x-1])
test_df['quarter'] = test_df['date'].dt.quarter
test_df['year'] = test_df['date'].dt.year
test_df['week'] = test_df['date'].dt.week
test_df['dayOfYear'] = test_df['date'].dt.day
#del test_df['date']

activity_df['days'] = activity_df['date'].dt.dayofweek.apply(lambda x: days[x])
activity_df['month'] = activity_df['date'].dt.month.apply(lambda x: months[x-1])
activity_df['quarter'] = activity_df['date'].dt.quarter
activity_df['year'] = activity_df['date'].dt.year
activity_df['week'] = activity_df['date'].dt.week
activity_df['dayOfYear'] = activity_df['date'].dt.day
#del activity_df['date']


In [39]:
# Rename columns under people_df to avoid same-names in the three dataframe
people_df = people_df.rename(columns=lambda x : "people_" + x)
test_df = test_df.rename(columns=lambda x : "test_" + x)

In [41]:
# Merge activity and test data frame with people_df
train_dataset = pd.merge(people_df, activity_df, how='right', left_on='people_people_id', right_on="people_id")
test_dataset = pd.merge(test_df, activity_df, how='right', left_on='test_people_id', right_on="people_id")

In [42]:
# clearing memory
del people_df
del activity_df
del test_df

# Not using the official label encoder because it is beneficial to have type 7 -> 7
# The label encoder might assign a value '6' to type 7 based on the order in which it appears in the list
# Sorting might improve it but sometimes values are missing from the type
# DOUBT: not sure if continuous encoding values are a requirement ?
train_dataset = category_to_label_encoding(train_dataset)
test_dataset = category_to_label_encoding(train_dataset)

IndexError: list index out of range

In [11]:
# for train_dataset, similar can be done for test_dataset if needed
dataset = train_dataset
if ONE_HOT_KEY:
    encoded_array, grd_enc=category_to_one_hot(dataset)
    print (encoded_array)
    # TODO: NOT SURE HOW WE FIND OUT THE LABEL NAMES OR IF THEY EVEN MATTER
    print (grd_enc.active_features_)
    print (grd_enc.n_values_)
    print (grd_enc.feature_indices_)
else:
    numeric_headers = list(dataset.drop(NON_FEATURE, axis=1).columns.values)
    numpy_array = dataset.drop(NON_FEATURE, axis=1).as_matrix()
    print (numpy_array)
    

[[ 0.  1.  0. ...,  1.  1.  0.]
 [ 0.  1.  0. ...,  1.  1.  0.]
 [ 0.  1.  0. ...,  1.  1.  0.]
 ..., 
 [ 0.  1.  0. ...,  1.  1.  1.]
 [ 0.  1.  0. ...,  1.  1.  1.]
 [ 0.  1.  0. ...,  1.  1.  1.]]
[    1     2     4 ..., 61124 61128 61132]
[    3 51463     4    45    26    10     8    26     9    10   101     8
    53    33    12     8     8     6     9    19    20  9252]
[    0     3 51466 51470 51515 51541 51551 51559 51585 51594 51604 51705
 51713 51766 51799 51811 51819 51827 51833 51842 51861 51881 61133]


In [43]:
train_dataset

Unnamed: 0,people_people_id,people_char_1,people_group_1,people_char_2,people_date,people_char_3,people_char_4,people_char_5,people_char_6,people_char_7,...,char_8,char_9,char_10,outcome,days,month,quarter,year,week,dayOfYear
0,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,type 0,type 0,type 76,0,Sat,Aug,3,2023,34,238
1,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,type 0,type 0,type 1,0,Tues,Sep,3,2022,39,270
2,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,type 0,type 0,type 1,0,Tues,Sep,3,2022,39,270
3,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,type 0,type 0,type 1,0,Fri,Aug,3,2023,31,216
4,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,type 0,type 0,type 1,0,Sat,Aug,3,2023,34,238
5,ppl_100,2,17304,2,2021-06-29,5,5,5,3,11,...,type 0,type 0,type 1727,0,Fri,Aug,3,2023,31,216
6,ppl_100002,2,8688,3,2021-01-06,28,9,5,3,11,...,type 0,type 0,type 1,1,Weds,Nov,4,2022,47,327
7,ppl_100002,2,8688,3,2021-01-06,28,9,5,3,11,...,type 0,type 0,type 1,1,Weds,Nov,4,2022,47,327
8,ppl_100003,2,33592,3,2022-06-10,4,8,5,2,5,...,type 0,type 0,type 1,1,Tues,Feb,1,2023,6,38
9,ppl_100003,2,33592,3,2022-06-10,4,8,5,2,5,...,type 0,type 0,type 1,1,Weds,Jun,2,2023,26,179


In [15]:
# TODO Add code to convert date to Day, Month, Year columns
# There are two date columns: 'date' and 'people_date' from activities.csv and people.csv
# Kaggle had some people also analysing the date if weekend, so another column for Weekend 1,0 (True, False) can be done