In [39]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [40]:
# Path to people.csv from ReadHatKaggle data set
PEOPLE_FILE_PATH='/Users/sanjanajayakumar/Desktop/Predicting Red Hat Business Value/people.csv'
# Path to act_train.csv from RedHatKaggle data set
ACTIVITIES_FILE_PATH='/Users/sanjanajayakumar/Desktop/Predicting Red Hat Business Value/act_train.csv'
# Path to test.csv from RedHatKaggle data set
TEST_DATA_FILE_PATH='/Users/sanjanajayakumar/Desktop/Predicting Red Hat Business Value/act_test.csv'

# For enabling one hot key encoding
ONE_HOT_KEY = False

# Columns that do not represent features
# TODO: date is not a NON-FEATURE, the preprocessing for date has not been done
NON_FEATURE=['people_people_id','people_id','activity_id','outcome','people_date','date']

In [41]:
def category_to_label_encoding(dataset,identity_columns=['people_people_id','people_id','activity_id']):
    for column in dataset.columns:
        if column not in identity_columns:
            if (dataset[column].dtype == 'O'):
                dataset[column]=dataset[column].apply(lambda x: str(x).split(' ')[1]).astype(np.int32)
              # This converts bool to int, but scikit treats boolean as int
#           elif dataset[column].dtype == 'bool':
#                 le=LabelEncoder()
#                 le.fit(['True','False'])
#                 dataset[column]=le.transform(dataset[column])
    return dataset

In [42]:
# Function to change labels of categories to one-hot encoding using scikit's OneHot Encoding
# pd.get_dummies(df) does the same, provides sweet header's as well but it it not fast enough, kill's memory
def category_to_one_hot(dataset,non_feature=NON_FEATURE):
    boolean_column = []
#     column_names = []
    counter=0
    for column in dataset.columns:
        if column not in NON_FEATURE:
            if dataset[column].dtype == 'bool':
                counter += 1
                continue
            if dataset[column].dtype == '<M8[ns]':
                counter += 1
                continue
            boolean_column.append(counter)
#             column_names.append(column)# Read the data set's
            counter += 1
    ds = dataset.drop(NON_FEATURE, axis=1)
    grd_enc = OneHotEncoder(categorical_features=boolean_column)
    encoded_arr=grd_enc.fit_transform(ds).toarray()
    return encoded_arr,grd_enc

In [43]:
# Read the data set's
people_df=pd.read_csv(PEOPLE_FILE_PATH,parse_dates=["date"],true_values=["True"],false_values=["False"])
activity_df=pd.read_csv(ACTIVITIES_FILE_PATH, parse_dates=["date"])
test_df=pd.read_csv(TEST_DATA_FILE_PATH,parse_dates=["date"])

In [44]:
people_df

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,...,False,False,True,True,True,True,False,True,True,99
3,ppl_100004,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,...,True,True,True,True,True,True,True,True,True,76
4,ppl_100006,type 2,group 6534,type 3,2022-07-27,type 40,type 25,type 9,type 3,type 8,...,False,False,True,False,False,False,True,True,False,84
5,ppl_10001,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,...,False,True,True,True,True,True,True,True,True,90
6,ppl_100010,type 2,group 17304,type 2,2022-09-01,type 8,type 7,type 8,type 1,type 7,...,False,False,False,False,False,False,False,False,False,2
7,ppl_100013,type 2,group 4204,type 3,2023-01-24,type 4,type 8,type 4,type 1,type 7,...,False,False,True,True,True,True,False,True,True,91
8,ppl_100019,type 2,group 45749,type 3,2023-03-26,type 40,type 25,type 9,type 3,type 9,...,False,False,False,False,False,False,False,False,False,84
9,ppl_100025,type 2,group 36096,type 3,2022-08-26,type 14,type 6,type 8,type 3,type 9,...,False,False,False,False,False,False,False,False,False,76


In [45]:
# Introduce a category for null values called category 0 since scikit needs numeric data
people_df.fillna("type 0", inplace=True)
activity_df.fillna("type 0", inplace=True)
test_df.fillna("type 0", inplace=True)

In [46]:
days = {0:'Mon',1:'Tues',2:'Weds',3:'Thurs',4:'Fri',5:'Sat',6:'Sun'}
months = {0: 'Jan', 1: 'Feb', 2: 'Mar', 3: "Apr", 4: "May", 5: "Jun", 6: "Jul", 7: "Aug", 8: "Sep", 9: "Oct", 10: "Nov", 11: "Dec"}

In [47]:
people_df['days'] = people_df['date'].dt.dayofweek#.apply(lambda x: days[x])
people_df['month'] = people_df['date'].dt.month#.apply(lambda x: months[x-1])
people_df['quarter'] = people_df['date'].dt.quarter
people_df['year'] = people_df['date'].dt.year
people_df['week'] = people_df['date'].dt.week
people_df['dayOfYear'] = people_df['date'].dt.day
#del people_df['date']

test_df['days'] = test_df['date'].dt.dayofweek#.apply(lambda x: days[x])
test_df['month'] = test_df['date'].dt.month#.apply(lambda x: months[x-1])
test_df['quarter'] = test_df['date'].dt.quarter
test_df['year'] = test_df['date'].dt.year
test_df['week'] = test_df['date'].dt.week
test_df['dayOfYear'] = test_df['date'].dt.day
#del test_df['date']

activity_df['days'] = activity_df['date'].dt.dayofweek#.apply(lambda x: days[x])
activity_df['month'] = activity_df['date'].dt.month#.apply(lambda x: months[x-1])
activity_df['quarter'] = activity_df['date'].dt.quarter
activity_df['year'] = activity_df['date'].dt.year
activity_df['week'] = activity_df['date'].dt.week
activity_df['dayOfYear'] = activity_df['date'].dt.day
#del activity_df['date']

In [48]:
people_df.columns

Index([u'people_id', u'char_1', u'group_1', u'char_2', u'date', u'char_3',
       u'char_4', u'char_5', u'char_6', u'char_7', u'char_8', u'char_9',
       u'char_10', u'char_11', u'char_12', u'char_13', u'char_14', u'char_15',
       u'char_16', u'char_17', u'char_18', u'char_19', u'char_20', u'char_21',
       u'char_22', u'char_23', u'char_24', u'char_25', u'char_26', u'char_27',
       u'char_28', u'char_29', u'char_30', u'char_31', u'char_32', u'char_33',
       u'char_34', u'char_35', u'char_36', u'char_37', u'char_38', u'days',
       u'month', u'quarter', u'year', u'week', u'dayOfYear'],
      dtype='object')

In [49]:
# Rename columns under people_df to avoid same-names in the three dataframe
people_df = people_df.rename(columns=lambda x : "people_" + x)
#test_df = test_df.rename(columns=lambda x : "test_" + x)

In [50]:
# Merge activity and test data frame with people_df
train_dataset = pd.merge(people_df, activity_df, how='right', left_on='people_people_id', right_on="people_id")
test_dataset = pd.merge(people_df, test_df, how='right', left_on='people_people_id', right_on="people_id")

In [51]:
# clearing memory
del people_df
del activity_df
del test_df

# Not using the official label encoder because it is beneficial to have type 7 -> 7
# The label encoder might assign a value '6' to type 7 based on the order in which it appears in the list
# Sorting might improve it but sometimes values are missing from the type
# DOUBT: not sure if continuous encoding values are a requirement ?
train_dataset = category_to_label_encoding(train_dataset)
test_dataset = category_to_label_encoding(test_dataset)

In [52]:
train_dataset['people_people_id']=train_dataset['people_people_id'].apply(lambda x: str(x).split('_')[1]).astype(np.float32)
test_dataset['people_people_id']=test_dataset['people_people_id'].apply(lambda x: str(x).split('_')[1]).astype(np.float32)
test_dataset['people_id']=test_dataset['people_id'].apply(lambda x: str(x).split('_')[1]).astype(np.float32)
train_dataset['people_id']=train_dataset['people_id'].apply(lambda x: str(x).split('_')[1]).astype(np.float32)
test_dataset['activity_id_num']=test_dataset['activity_id'].apply(lambda x: str(x).split('_')[1]).astype(np.float32)
train_dataset['activity_id_num']=train_dataset['activity_id'].apply(lambda x: str(x).split('_')[1]).astype(np.float32)

In [59]:
activity_id

0          act2_1734928
1          act2_2434093
2          act2_3404049
3          act2_3651215
4          act2_4109017
5           act2_898576
6          act2_1233489
7          act2_1623405
8          act2_1111598
9          act2_1177453
10          act2_133509
11         act2_1408475
12         act2_1610829
13         act2_1688731
14         act2_1961720
15         act2_1988351
16         act2_2198176
17         act2_2437661
18         act2_2476911
19         act2_2557270
20         act2_2581989
21         act2_3116385
22         act2_3159712
23         act2_3292005
24         act2_3461867
25         act2_3468306
26         act2_3867292
27         act2_3990676
28         act2_4102292
29         act2_4160587
               ...     
2197261    act2_2053669
2197262    act2_2053762
2197263    act2_2134893
2197264    act2_2139237
2197265    act2_2233766
2197266    act2_2499779
2197267    act2_2514377
2197268    act2_2537795
2197269    act2_2582736
2197270    act2_2674371
2197271    act2_

In [63]:
train_dataset.columns

Index([u'people_people_id', u'people_char_1', u'people_group_1',
       u'people_char_2', u'people_date', u'people_char_3', u'people_char_4',
       u'people_char_5', u'people_char_6', u'people_char_7', u'people_char_8',
       u'people_char_9', u'people_char_10', u'people_char_11',
       u'people_char_12', u'people_char_13', u'people_char_14',
       u'people_char_15', u'people_char_16', u'people_char_17',
       u'people_char_18', u'people_char_19', u'people_char_20',
       u'people_char_21', u'people_char_22', u'people_char_23',
       u'people_char_24', u'people_char_25', u'people_char_26',
       u'people_char_27', u'people_char_28', u'people_char_29',
       u'people_char_30', u'people_char_31', u'people_char_32',
       u'people_char_33', u'people_char_34', u'people_char_35',
       u'people_char_36', u'people_char_37', u'people_char_38', u'people_days',
       u'people_month', u'people_quarter', u'people_year', u'people_week',
       u'people_dayOfYear', u'people_id', u'date'

In [66]:
activity_id = train_dataset["activity_id"]
#del train_dataset["activity_id"]
#train_dataset['activity_id'] = activity_id
#train_dataset


In [67]:
del train_dataset['date']
del train_dataset['people_date']
del test_dataset['date']
del test_dataset['people_date']

In [68]:
y = train_dataset['outcome']

In [69]:
del train_dataset['outcome']

In [70]:
# flatten y into a 1-D array
y = np.ravel(y)

In [71]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(train_dataset, y)

In [72]:
model.score(train_dataset, y)

0.81963517804423724

In [23]:
# examine the coefficients
pd.DataFrame(zip(train_dataset.columns, np.transpose(model.coef_))).sort

Unnamed: 0,0,1
0,people_people_id,[-1.04073383909e-07]
1,people_char_1,[-3.23636700399e-05]
2,people_group_1,[-3.22015743899e-05]
3,people_char_2,[0.000461790618571]
4,people_char_3,[-0.00290640138852]
5,people_char_4,[-0.00138551956726]
6,people_char_5,[-0.000273403273971]
7,people_char_6,[-0.000574145860077]
8,people_char_7,[-0.00185595886133]
9,people_char_8,[-6.27696303066e-05]


In [73]:
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(train_dataset, y, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# predict class labels for the test set
predicted = model2.predict(X_test)
print predicted

[0 0 0 ..., 1 1 1]


In [26]:
# generate class probabilities
probs = model2.predict_proba(X_test)
print probs

[[ 0.57245135  0.42754865]
 [ 0.90012057  0.09987943]
 [ 0.94795682  0.05204318]
 ..., 
 [ 0.17943351  0.82056649]
 [ 0.43823449  0.56176551]
 [ 0.12453494  0.87546506]]


In [27]:
# generate evaluation metrics
print metrics.accuracy_score(y_test, predicted)
print metrics.roc_auc_score(y_test, probs[:, 1])

0.819790408806
0.888895486227


In [28]:
print metrics.confusion_matrix(y_test, predicted)
print metrics.classification_report(y_test, predicted)

[[293471  73072]
 [ 45720 246925]]
             precision    recall  f1-score   support

          0       0.87      0.80      0.83    366543
          1       0.77      0.84      0.81    292645

avg / total       0.82      0.82      0.82    659188



In [30]:
predicted = model2.predict(test_dataset)
print predicted

[1 1 1 ..., 0 0 0]


In [31]:
predicted = model.predict(X_test)
print predicted

[0 0 0 ..., 1 1 1]


In [32]:
predicted = model2.predict(test_dataset)
print predicted

[1 1 1 ..., 0 0 0]


In [76]:
activity_id_test = test_dataset["activity_id"]
del test_dataset["activity_id"]
#test_dataset['activity_id'] = activity_id_test
#test_dataset

In [77]:
# generate class probabilities
probs = model.predict_proba(test_dataset)
print probs

[[ 0.40780617  0.59219383]
 [ 0.40932655  0.59067345]
 [ 0.25329046  0.74670954]
 ..., 
 [ 0.76089749  0.23910251]
 [ 0.76089505  0.23910495]
 [ 0.76492262  0.23507738]]


In [78]:
test_dataset['activity_id'] = activity_id_test
test_dataset

Unnamed: 0,people_people_id,people_char_1,people_group_1,people_char_2,people_char_3,people_char_4,people_char_5,people_char_6,people_char_7,people_char_8,...,char_9,char_10,days,month,quarter,year,week,dayOfYear,activity_id_num,activity_id
0,100004.0,2,22593,3,40,25,9,4,16,2,...,4,0,2,7,3,2022,29,20,249281.0,act1_249281
1,100004.0,2,22593,3,40,25,9,4,16,2,...,0,682,2,7,3,2022,29,20,230855.0,act2_230855
2,10001.0,2,25417,3,6,6,4,1,1,2,...,10,0,4,10,4,2022,41,14,240724.0,act1_240724
3,10001.0,2,25417,3,6,6,4,1,1,2,...,5,0,6,11,4,2022,47,27,83552.0,act1_83552
4,10001.0,2,25417,3,6,6,4,1,1,2,...,0,3015,5,10,4,2022,41,15,1043301.0,act2_1043301
5,10001.0,2,25417,3,6,6,4,1,1,2,...,0,4987,6,11,4,2022,47,27,112890.0,act2_112890
6,10001.0,2,25417,3,6,6,4,1,1,2,...,0,3015,5,10,4,2022,41,15,1169930.0,act2_1169930
7,10001.0,2,25417,3,6,6,4,1,1,2,...,0,3015,5,10,4,2022,41,15,1924448.0,act2_1924448
8,10001.0,2,25417,3,6,6,4,1,1,2,...,0,3015,5,10,4,2022,41,15,1953554.0,act2_1953554
9,10001.0,2,25417,3,6,6,4,1,1,2,...,0,3015,0,11,4,2022,48,28,1971739.0,act2_1971739


In [None]:
train_dataset['activity_id']=train_dataset['activity_id'].apply(lambda x: "activity_id").astype(np.float32)
people_df = people_df.rename(columns=lambda x : "people_" + x)

In [79]:
test_dataset['outcome'] = probs[:,1]

In [80]:
test_dataset[['activity_id','outcome']].set_index(['activity_id']).to_csv('../results.csv')