In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# Path to people.csv from ReadHatKaggle data set
FEATURE_FILE ='kaggle-predicting-Red-Hat-Business-Value/Data/train_features_reduced.csv'
# Path to act_train.csv from RedHatKaggle data set
OUTPUT ='kaggle-predicting-Red-Hat-Business-Value/Data/act_train_output.csv'
# Path to the test file
TEST_FILE = 'kaggle-predicting-Red-Hat-Business-Value/Data/test_features_reduced.csv'

# Path to the files with reduced dimesions
SAVE_AS_DIR = 'kaggle-predicting-Red-Hat-Business-Value/Data'

In [3]:
# Non feature
NON_FEATURE=['activity_id','people_id','date','people_date']

# Categorical data that is only label encoded
CATEGORICAL_DATA = ['people_char_1', 'people_char_2','people_group_1',
                    'people_char_3', 'people_char_4', 'people_char_5',
                    'people_char_6', 'people_char_7', 'people_char_8',
                    'people_char_9', 'activity_category',
                    'char_1', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6',
                    'char_7', 'char_8', 'char_9', 'char_10']

# Already in a one-hot encoded form
CATEGORICAL_BINARY = ['people_char_10', 'people_char_11', 'people_char_12',
                      'people_char_13', 'people_char_14', 'people_char_15',
                      'people_char_16', 'people_char_17', 'people_char_18',
                      'people_char_19', 'people_char_20', 'people_char_21',
                      'people_char_22', 'people_char_23', 'people_char_24',
                      'people_char_25', 'people_char_26', 'people_char_27',
                      'people_char_28', 'people_char_29', 'people_char_30',
                      'people_char_31', 'people_char_32', 'people_char_33',
                      'people_char_34', 'people_char_35', 'people_char_36',
                      'people_char_37' ]

# Continuous categories
CONT = ['people_days', 'days',
      'people_month',  'month', 
      'people_quarter', 'quarter',
      'people_week', 'week',
      'people_dayOfMonth', 'dayOfMonth',
      'people_year', 'year',
      'people_char_38', ]

In [4]:
# Function to change labels of categories to one-hot encoding using scikit's OneHot Encoding
# pd.get_dummies(df) does the same, provides sweet header's as well but it it not fast enough, kill's memory
def category_to_one_hot(dataset,non_feature,continuous_feature):
    ds = dataset.drop(non_feature,axis=1)
    boolean_column = []
    counter=0
    for column in ds.columns:
        if column not in continuous_feature:
            boolean_column.append(counter)
        counter += 1
    # boolean_colum is not the column name but index
    print ("Done filtering columns...")
    grd_enc = OneHotEncoder(categorical_features=boolean_column)
    encoded_arr=grd_enc.fit_transform(ds)
    return encoded_arr

In [5]:
# Read the train data set
train_data_df=pd.read_csv(FEATURE_FILE,parse_dates=["date"])
train_data_df.sort_values(by=['activity_id'],ascending=True, inplace=True)

In [6]:
# Read the train data output
train_output = pd.read_csv(OUTPUT)
train_output.sort_values(by='activity_id',ascending=True, inplace=True)

In [7]:
# Delete redundant values
v_out=train_output['outcome']

In [8]:
test_data_df = pd.read_csv(TEST_FILE, parse_dates=["date"])
test_data_df.sort_values(by=['activity_id'], ascending=True, inplace=True)

In [None]:
# test_data_df.set_index('activity_id').drop('act_0')

In [None]:
# people_char_3, char_1, char_2, char_5

In [10]:
# Function to one hot encode all values
start=time.time()
arr=category_to_one_hot(train_data_df,NON_FEATURE,CONT)
end=time.time()
print(end-start)

start=time.time()
arr_b=category_to_one_hot(test_data_df,NON_FEATURE,CONT)
end=time.time()
print(end-start)

Done filtering columns...
99.5136339664
Done filtering columns...
13.9581620693


In [11]:
print (arr.shape)
print (arr_b.shape)

(2197291, 11196)
(498688, 11196)


In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
start=time.time()
norm = StandardScaler(with_mean=False, with_std=True)
norm.fit(arr)
train_arr_n=norm.transform(arr)
end=time.time()
print(end-start)

80.417935133


In [17]:
start=time.time()
norm = StandardScaler(with_mean=False, with_std=True)
norm.fit(arr_b)
test_arr_n=norm.transform(arr_b)
end=time.time()
print(end-start)

17.1580791473


In [18]:
train_arr_n.shape

(2197291, 11196)

In [19]:
test_arr_n.shape

(498688, 11196)

In [21]:
# instantiate a logistic regression model, and fit with X and y
start=time.time()
model = LogisticRegression()
model = model.fit(train_arr_n, v_out)
end=time.time()
print(end-start)

134.291324139


In [25]:
start=time.time()
y_pred = model.predict_proba(test_arr_n)
end=time.time()
print(end-start)

2.50886893272


In [26]:
test_data_df['outcome']=y_pred[:,1]

In [27]:
test_data_df[['outcome','activity_id']].set_index('activity_id').drop('act_0').to_csv("LRresults.csv")

In [None]:
print (time.time())