In [25]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [26]:
# Path to people.csv from ReadHatKaggle data set
FEATURE_FILE ='kaggle-predicting-Red-Hat-Business-Value/Data/act_train_features.csv'
# Path to act_train.csv from RedHatKaggle data set
OUTPUT ='kaggle-predicting-Red-Hat-Business-Value/Data/act_train_output.csv'
# Path to the test file
TEST_FILE = 'kaggle-predicting-Red-Hat-Business-Value/Data/act_test_features.csv'

# Path to the files with reduced dimesions
SAVE_AS_DIR = 'kaggle-predicting-Red-Hat-Business-Value/Data'

In [27]:
# Non feature
NON_FEATURE=['activity_id','people_id','date','people_date']

# Categorical data that is only label encoded
CATEGORICAL_DATA = ['people_char_1', 'people_char_2',
                    'people_char_3', 'people_char_4', 'people_char_5',
                    'people_char_6', 'people_char_7', 'people_char_8',
                    'people_char_9', 'activity_category',
                    'char_1', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6',
                    'char_7', 'char_8', 'char_9','people_group_1', 'char_10']

# Already in a one-hot encoded form
CATEGORICAL_BINARY = ['people_char_10', 'people_char_11', 'people_char_12',
                      'people_char_13', 'people_char_14', 'people_char_15',
                      'people_char_16', 'people_char_17', 'people_char_18',
                      'people_char_19', 'people_char_20', 'people_char_21',
                      'people_char_22', 'people_char_23', 'people_char_24',
                      'people_char_25', 'people_char_26', 'people_char_27',
                      'people_char_28', 'people_char_29', 'people_char_30',
                      'people_char_31', 'people_char_32', 'people_char_33',
                      'people_char_34', 'people_char_35', 'people_char_36',
                      'people_char_37' ]

# Continuous categories
CONT = ['people_days', 'days',
      'people_month',  'month', 
      'people_quarter', 'quarter',
      'people_week', 'week',
      'people_dayOfMonth', 'dayOfMonth',
      'people_year', 'year', 
      'people_char_38', ]

In [28]:
# Remove redundant values
def remove_redundant(df_train,df_test,column,replacement):
    Intersection_test_train=list(\
                                 set(df_test[column].astype('int64').unique())\
                                 .intersection\
                                 (set(df_train[column].astype('int64').unique())))
    df_train[column]=df_train[column].apply(lambda x: replacement if x not in Intersection_test_train else x)
    df_test[column]=df_test[column].apply(lambda x: replacement if x not in Intersection_test_train else x)
    return df_train,df_test

In [29]:
# Function to change labels of categories to one-hot encoding using scikit's OneHot Encoding
# pd.get_dummies(df) does the same, provides sweet header's as well but it it not fast enough, kill's memory
def category_to_one_hot(dataset,non_feature,continuous_feature):
    ds = dataset.drop(non_feature,axis=1)
    boolean_column = []
    counter=0
    for column in ds.columns:
        if column not in continuous_feature:
            boolean_column.append(counter)
        counter += 1
    # boolean_colum is not the column name but index
    print ("Done filtering columns...")
    grd_enc = OneHotEncoder(categorical_features=boolean_column)
    encoded_arr=grd_enc.fit_transform(ds).toarray()
    return encoded_arr

In [30]:
# Read the train data set
train_data_df=pd.read_csv(FEATURE_FILE,parse_dates=["date"])
train_data_df.sort_values(by=['activity_id'],ascending=True, inplace=True)

In [31]:
# Read the train data output
train_output = pd.read_csv(OUTPUT)
train_output.sort_values(by='activity_id',ascending=True, inplace=True)

In [32]:
# Read the test data set
test_data_df=pd.read_csv(TEST_FILE,parse_dates=["date"])


In [33]:
test_data_df.sort_values(by=['activity_id'],ascending=True, inplace=True)

In [34]:
train_data_df.set_index('people_group_1',inplace=True)
train_data_df.drop(17304,axis=0,inplace=True)
train_data_df.reset_index(inplace=True)

In [35]:
test_data_df.set_index('people_group_1',inplace=True)
the_group=test_data_df.loc[17304]
test_data_df.drop(17304,axis=0,inplace=True)
test_data_df.reset_index(inplace=True)

In [36]:
# Function to help reduce exploding dimensions 1336.28923082 secs
start=time.time()
for column in CATEGORICAL_DATA:
    train_data_df,test_data_df=remove_redundant(train_data_df,test_data_df,column,99999)
end=time.time()
print(end-start)

919.300649166


In [37]:
# Function to one hot encode all values
start=time.time()
train_arr=category_to_one_hot(train_data_df,NON_FEATURE,CONT)
print(end-start)

Done filtering columns...
-0.0682888031006


In [38]:
def get_file_path(directory, filename):
    """ Combines file path directory/filename
    """
    return os.path.join(directory, filename)

In [39]:
def write_out(df, output):
    df.set_index(['activity_id']).to_csv(output + "_features_reduced_17304_non_cont.csv")

In [40]:
import os
file_save=get_file_path(SAVE_AS_DIR,'train')
write_out(train_data_df,file_save)

In [41]:
file_save=get_file_path(SAVE_AS_DIR,'test')
write_out(test_data_df,file_save)

In [24]:
file_save=get_file_path(SAVE_AS_DIR,'act_the_group')
the_group['outcome']=0
write_out(the_group,file_save)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [17]:
# Delete redundant values
v_out=train_output['outcome'].as_matrix()
del train_output
del train_data_df

In [None]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_arr,v_out, test_size=0.4, random_state=4)