In [None]:
# XGBoost Model for RedHat


In [None]:
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import random
from operator import itemgetter
import time
import copy

random.seed(2016)

#creating feature map
def create_feature_map(features):
    """Write the feature-map file ``xgb.fmap`` into the current working directory.

    One tab-separated line is written per feature: its position in
    ``features``, its name, and the literal type flag ``q``. Any existing
    ``xgb.fmap`` is overwritten.

    Args:
        features: iterable of feature names, in model column order.
    """
    # The context manager closes the handle even if a write fails part-way.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


In [None]:
def get_importance(gbm, features):
    """Return ``(feature, score)`` pairs from ``gbm.get_fscore``, highest score first.

    The feature-map file ``xgb.fmap`` is rewritten from ``features`` before the
    scores are requested, so the returned names follow that list.
    """
    create_feature_map(features)
    fscores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(fscores.items())
    # Stable sort on the score (second element of each pair), descending.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


In [None]:
def intersect(a, b):
    """Return a list of the distinct items found in both ``a`` and ``b``.

    The order of the returned list is that of set iteration, i.e. unspecified.
    """
    common = set(a).intersection(set(b))
    return list(common)


In [None]:
def run_single(train, test, features, target, random_state=0):
    """Fit one XGBoost classifier on a random split of ``train`` and score ``test``.

    10% of ``train`` is held out (seeded by ``random_state``) for early
    stopping and for the reported ROC AUC.

    Args:
        train: frame holding the ``features`` columns and the ``target`` column.
        test: frame holding the ``features`` columns.
        features: list of column names fed to the model.
        target: name of the label column in ``train``.
        random_state: seed used for both the split and the booster.

    Returns:
        Tuple ``(probabilities, score)``: the predictions for ``test`` as a
        plain list, and the ROC AUC measured on the held-out part.
    """
    eta = 0.2
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = dict(
        objective="binary:logistic",
        booster="gbtree",
        eval_metric="auc",
        eta=eta,
        tree_method='exact',
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        silent=1,
        seed=random_state,
    )
    num_boost_round = 115
    early_stopping_rounds = 10
    test_size = 0.1

    # Row-wise split; both parts still carry the target column.
    fit_part, holdout_part = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(fit_part.index))
    print('Length valid:', len(holdout_part.index))
    dtrain = xgb.DMatrix(fit_part[features], fit_part[target])
    dvalid = xgb.DMatrix(holdout_part[features], holdout_part[target])

    gbm = xgb.train(
        params,
        dtrain,
        num_boost_round,
        evals=[(dtrain, 'train'), (dvalid, 'eval')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=True,
    )

    print("Validating...")
    holdout_probs = gbm.predict(xgb.DMatrix(holdout_part[features]))
    score = roc_auc_score(holdout_part[target].values, holdout_probs)
    print('ROC auc score: {:.6f}'.format(score))

    print('Importance array: ', get_importance(gbm, features))

    print("Predict test set...")
    test_probs = gbm.predict(xgb.DMatrix(test[features]))

    elapsed_minutes = round((time.time() - start_time)/60, 2)
    print('Training time: {} minutes'.format(elapsed_minutes))
    return test_probs.tolist(), score




In [None]:
#Running Kfold
def run_kfold(nfolds, train, test, features, target, random_state=0):
    """Train one XGBoost model per fold and average their test predictions.

    Args:
        nfolds: number of shuffled cross-validation folds.
        train: frame holding the ``features`` columns and the ``target`` column.
        test: frame holding the ``features`` columns and ``activity_id``.
        features: list of column names fed to the model.
        target: name of the label column in ``train``.
        random_state: seed used for the fold shuffle and the booster.

    Returns:
        Tuple ``(mean_prediction, score)``: the per-row mean of the fold
        predictions on ``test`` as a NumPy array, and the ROC AUC of the
        out-of-fold predictions over all of ``train``.
    """
    eta = 0.1
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }
    num_boost_round = 50
    early_stopping_rounds = 10

    # Convert to arrays once instead of once per fold (``.values`` replaces
    # the removed ``DataFrame.as_matrix``).
    X_all = train[features].values
    y_all = train[target].values
    X_test = test[features].values

    # Out-of-fold predictions, indexed by row position in ``train``; every
    # position is written exactly once because the folds partition the rows.
    oof_pred = np.empty(len(train.index), dtype=float)
    yfull_test = test[['activity_id']].astype(object).copy()

    # model_selection.KFold takes the fold count in the constructor and the
    # data in ``split``; the old ``KFold(n, n_folds=...)`` form belongs to the
    # removed ``sklearn.cross_validation`` module.
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=random_state)
    fold_columns = []
    for num_fold, (train_index, valid_index) in enumerate(kf.split(X_all), start=1):
        print('Start fold {} from {}'.format(num_fold, nfolds))
        X_train, X_valid = X_all[train_index], X_all[valid_index]
        y_train, y_valid = y_all[train_index], y_all[valid_index]

        print('Length train:', len(X_train))
        print('Length valid:', len(X_valid))

        dtrain = xgb.DMatrix(X_train, y_train)
        dvalid = xgb.DMatrix(X_valid, y_valid)

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

        print("Validating...")
        # X_valid is already a plain array restricted to the feature columns,
        # so it must not be indexed by column name again.
        yhat = gbm.predict(xgb.DMatrix(X_valid))
        score = roc_auc_score(y_valid.tolist(), yhat)
        print('Check error value: {:.6f}'.format(score))

        # Store this fold's predictions at their original row positions.
        oof_pred[valid_index] = yhat

        imp = get_importance(gbm, features)
        print('Importance array: ', imp)

        print("Predict test set...")
        # NOTE(review): ``ntree_limit`` is kept from the original code; newer
        # XGBoost releases may expect ``iteration_range`` instead - confirm
        # against the installed version.
        test_prediction = gbm.predict(xgb.DMatrix(X_test), ntree_limit=gbm.best_iteration+1)
        fold_column = 'kfold_' + str(num_fold)
        yfull_test[fold_column] = test_prediction
        fold_columns.append(fold_column)

    score = roc_auc_score(train[target], oof_pred)
    print('Check error value: {:.6f}'.format(score))

    # Mean of the per-fold predictions on the test set.
    yfull_test['mean'] = yfull_test[fold_columns].mean(axis=1)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return yfull_test['mean'].values, score


In [None]:
#Creating output table
def output_table(score, test, prediction):
    """Write a two-column CSV (``activity_id,outcome``) of test predictions.

    The file is created in the current working directory and named
    ``output_table_<score>_<YYYY-mm-dd-HH-MM>.csv``.

    Args:
        score: value embedded in the file name via ``str(score)``.
        test: mapping or frame whose ``'activity_id'`` entry is iterable.
        prediction: sequence indexed by position, aligned with the ids.

    Raises:
        IndexError: if ``prediction`` is a list shorter than the id column.
    """
    now = datetime.datetime.now()
    sub_file = 'output_table_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing Output Table: ', sub_file)
    # The context manager closes the file even if a row fails to format.
    with open(sub_file, 'w') as f:
        f.write('activity_id,outcome\n')
        # Index explicitly (rather than zip) so a too-short prediction
        # sequence still fails loudly instead of truncating the output.
        for row, activity_id in enumerate(test['activity_id']):
            f.write(str(activity_id) + ',' + str(prediction[row]) + '\n')


In [None]:
def get_features(train, test):
    """Return the sorted column names shared by ``train`` and ``test``, without the id columns.

    Raises:
        ValueError: if ``people_id`` or ``activity_id`` is not a shared column.
    """
    shared = intersect(list(train.columns.values), list(test.columns.values))
    # Identifier columns are not model inputs.
    for id_column in ('people_id', 'activity_id'):
        shared.remove(id_column)
    shared.sort()
    return shared


In [None]:
#Reading datasets
def read_train_test(data_dir="D:/MSBA/Machine Learning/Report 1"):
    """Load the people, train and test CSVs, encode them numerically and merge.

    Date columns are expanded into ``year``/``month``/``day``; the
    ``'type N'`` / ``'group N'`` string columns are reduced to their integer
    part; the people table is left-joined onto both activity tables on
    ``people_id``; remaining missing values are filled with -999.

    Args:
        data_dir: directory containing ``people.csv/people.csv``,
            ``act_train.csv/act_train.csv`` and ``act_test.csv/act_test.csv``.
            Defaults to the location previously hard-coded here.

    Returns:
        Tuple ``(train, test, features)`` where ``features`` is the list
        produced by ``get_features(train, test)``.
    """
    # Normalise the directory so the file names can simply be appended.
    if data_dir and not data_dir.endswith(('/', '\\')):
        data_dir = data_dir + '/'

    print("Reading people.csv...")
    people = pd.read_csv(data_dir + "people.csv/people.csv",
                       dtype={'people_id': str,
                              'activity_id': str,
                              'char_38': np.int32},
                       parse_dates=['date'])

    print("Loading train.csv...")
    train = pd.read_csv(data_dir + "act_train.csv/act_train.csv",
                        dtype={'people_id': str,
                               'activity_id': str,
                               'outcome': np.int8},
                        parse_dates=['date'])

    print("Loading test.csv...")
    test = pd.read_csv(data_dir + "act_test.csv/act_test.csv",
                       dtype={'people_id': str,
                              'activity_id': str},
                       parse_dates=['date'])

    print("Processing tables...")
    for table in [train, test]:
        table['year'] = table['date'].dt.year
        table['month'] = table['date'].dt.month
        table['day'] = table['date'].dt.day
        table.drop('date', axis=1, inplace=True)
        # str.lstrip strips a *set of characters*, not a prefix; that is
        # adequate here because the remainder starts with a digit or '-'.
        table['activity_category'] = table['activity_category'].str.lstrip('type ').astype(np.int32)
        for i in range(1, 11):
            col = 'char_' + str(i)
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on a column selection: under pandas
            # Copy-on-Write the chained inplace form leaves the frame
            # unchanged, and the integer cast below then fails on NaN.
            table[col] = (table[col]
                          .fillna('type -999')
                          .str.lstrip('type ')
                          .astype(np.int32))

    people['year'] = people['date'].dt.year
    people['month'] = people['date'].dt.month
    people['day'] = people['date'].dt.day
    people.drop('date', axis=1, inplace=True)
    people['group_1'] = people['group_1'].str.lstrip('group ').astype(np.int32)
    for i in range(1, 10):
        people['char_' + str(i)] = people['char_' + str(i)].str.lstrip('type ').astype(np.int32)
    for i in range(10, 38):
        people['char_' + str(i)] = people['char_' + str(i)].astype(np.int32)

    print("Merging...")
    train = pd.merge(train, people, how='left', on='people_id').fillna(-999)
    test = pd.merge(test, people, how='left', on='people_id').fillna(-999)

    features = get_features(train, test)
    return train, test, features

In [None]:
# Run the XGBoost pipeline: load the data, fit a single model, write the predictions.
train, test, features = read_train_test()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))

# Single train/validation split; `score` is the validation ROC AUC.
test_prediction, score = run_single(train, test, features, target='outcome')

output_table(score=score, test=test, prediction=test_prediction)


In [None]:
## Logistic Regression for RedHat

In [None]:
from sklearn.linear_model import LogisticRegression
#Leave One Out as a function
def LeaveOneOut(data1, data2, columnName, useLOO=False):
    """Encode ``columnName`` as the mean ``outcome`` of its group in ``data1``.

    Args:
        data1: frame providing the group statistics; needs ``columnName``
            and ``outcome`` columns.
        data2: frame to encode; needs ``columnName`` and ``outcome`` columns.
        columnName: name of the grouping column.
        useLOO: when True, each row's own ``outcome`` is removed from its
            group's sum and count before averaging. This is only meaningful
            when ``data2`` contains the same rows as ``data1``.

    Returns:
        A Series named ``outcome`` with one value per row of ``data2``, in the
        same row order. Rows whose group is absent from ``data1``, or (with
        ``useLOO``) whose group has a single member, receive the mean of the
        other encoded values.
    """
    # Aggregate only the outcome column: averaging the whole frame fails on
    # string columns in recent pandas, and sum/count are needed separately to
    # leave a row out of its own group.
    group_stats = (
        data1.groupby(columnName)['outcome']
        .agg(['sum', 'count'])
        .reset_index()
    )
    merged = pd.merge(
        data2[[columnName, 'outcome']],
        group_stats,
        how='left',
        on=columnName)

    if useLOO:
        # Per-group leave-one-out mean: (group sum - own outcome) / (group size - 1).
        # Singleton groups have no other members, so they are left as NaN.
        others = merged['count'] - 1
        x = (merged['sum'] - merged['outcome']) / others.where(others > 0)
    else:
        x = merged['sum'] / merged['count']
    x = x.rename('outcome')
    return x.fillna(x.mean())


In [None]:
#Main Function
def main(directory='D:/MSBA/Machine Learning/Report 1/'):
    """Fit a logistic regression on group-mean encoded features and write test predictions.

    ``group_1``, ``char_2`` and ``char_38`` are each encoded with
    ``LeaveOneOut`` (leave-one-out on the training rows, plain group means on
    the test rows). The training ROC AUC is printed and the test
    probabilities are saved to ``Logistic_Output_Table.csv`` in the current
    working directory.

    Args:
        directory: folder (with trailing slash) containing the
            ``act_train.csv``, ``act_test.csv`` and ``people.csv``
            sub-folders. Defaults to the location previously hard-coded here.
    """
    feature_cols = ['group_1', 'char_2', 'char_38']

    train_L = pd.read_csv(directory+'act_train.csv/act_train.csv',
                        usecols=['people_id', 'outcome'])

    # The test file is read once; the second, identical read was redundant.
    test_L = pd.read_csv(directory+'act_test.csv/act_test.csv',
                       usecols=['activity_id', 'people_id'])

    # `directory` already ends with a slash, so none is prepended here.
    people_L = pd.read_csv(directory+'people.csv/people.csv',
                         usecols=['people_id'] + feature_cols)

    train_L = pd.merge(train_L, people_L, how='left', on='people_id')
    train_L = train_L.fillna('-999')

    lootrain = pd.DataFrame()
    for col in feature_cols:
        print(col)
        lootrain[col] = LeaveOneOut(train_L, train_L, col, True).values

    lr = LogisticRegression(C=100000.0)
    lr.fit(lootrain[feature_cols], train_L['outcome'])
    preds = lr.predict_proba(lootrain[feature_cols])[:, 1]
    print('roc', roc_auc_score(train_L.outcome, preds))

    test_L = pd.merge(test_L, people_L, how='left', on='people_id')
    test_L = test_L.fillna('-999')
    activity_id = test_L.activity_id.values
    test_L = test_L.drop('activity_id', axis=1)
    # LeaveOneOut selects an 'outcome' column from the frame it encodes;
    # the value is not used when useLOO is False.
    test_L['outcome'] = 0

    lootest = pd.DataFrame()
    for col in feature_cols:
        print(col)
        lootest[col] = LeaveOneOut(train_L, test_L, col, False).values

    preds = lr.predict_proba(lootest[feature_cols])[:, 1]
    Logistic_Table = pd.DataFrame()
    Logistic_Table['activity_id'] = activity_id
    Logistic_Table['outcome'] = preds
    Logistic_Table.to_csv('Logistic_Output_Table.csv', index=False, float_format='%.3f')


if __name__ == "__main__":
    print('Starting')
    main()
    print('Finished')

In [None]:
## Random Forest for RedHat

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model
from sklearn import model_selection

# Location of the competition CSVs used by this section.
DATA_DIR = 'D:/MSBA/Machine Learning/Report 1/'

print("Loading Train data...")
train = pd.read_csv(DATA_DIR + 'act_train.csv/act_train.csv', dtype={'people_id': str, 'activity_id': str, 'outcome': np.int8}, parse_dates=['date'])

# Keep the labels before the column is dropped from the feature frame.
Y = train['outcome']

train = train.drop(['activity_id', 'date', 'outcome'], axis=1)

print("Loading Test data...")
test = pd.read_csv(DATA_DIR + 'act_test.csv/act_test.csv', dtype={'people_id': str, 'activity_id': str}, parse_dates=['date'])

# Keep the ids for the output file before the column is dropped.
act_id = test['activity_id']

test = test.drop(['activity_id', 'date'], axis=1)

print("Loading people data...")
people = pd.read_csv(DATA_DIR + 'people.csv/people.csv', dtype={'people_id': str, 'activity_id': str, 'char_38': np.int32}, parse_dates=['date'])

train['activity_category'] = train['activity_category'].str.lstrip('type ').astype(np.int32)
test['activity_category'] = test['activity_category'].str.lstrip('type ').astype(np.int32)

for i in range(1,11):
    col = 'char_'+str(i)
    # Most frequent training value; used to fill gaps in both train and test.
    charMax = train[col].value_counts().idxmax()
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection: under pandas Copy-on-Write the chained inplace
    # form leaves the frame unchanged and the integer cast then fails on NaN.
    train[col] = train[col].fillna(charMax).str.lstrip('type ').astype(np.int32)
    test[col] = test[col].fillna(charMax).str.lstrip('type ').astype(np.int32)

people['group_1'] = people['group_1'].str.lstrip('group ').astype(np.int32)
people = people.drop('date', axis=1)

for i in range(1, 10):
    people['char_' + str(i)] = people['char_' + str(i)].str.lstrip('type ').astype(np.int32)
for i in range(10, 38):
    people['char_' + str(i)] = people['char_' + str(i)].astype(np.int32)

print("Merging data...")
train = pd.merge(train, people, how='left', on='people_id')
test = pd.merge(test, people, how='left', on='people_id')

del people

train = train.drop('people_id', axis=1)
test = test.drop('people_id', axis=1)

X = train.values
testX = test.values

print("Running Random Forest")
# Only rf_regressor200 is fitted below; the other estimators are created but
# not used in this cell.
dt_regressor = DecisionTreeRegressor()
rf_regressor200 = RandomForestClassifier(n_estimators = 200)
log_regressor = linear_model.LogisticRegression()
mlp_regressor = MLPClassifier(activation='logistic')


rf_regressor200.fit(X,Y)
predict = rf_regressor200.predict(testX)

wrt = True
if wrt:
    # The context manager closes the file; the original only flushed it.
    with open('Output_rfc.csv','w') as output_rfc:
        output_rfc.write('activity_id,outcome\n')
        for i in range(0,len(predict)):
            output_rfc.write(str(act_id[i])+","+str(predict[i])+"\n")
    print("Finished...")

In [None]:
#ROC Curve Graph
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.model_selection import train_test_split


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# rf_regressor200 was fitted on all of X, so scoring it on X_test would
# evaluate it on rows it has already seen. Fit a separate forest on the
# training part only so the curve reflects held-out performance.
rf_holdout = RandomForestClassifier(n_estimators=200, random_state=42)
rf_holdout.fit(X_train, Y_train)

# Predicted probability of the positive class for the held-out rows
predict_probs = rf_holdout.predict_proba(X_test)[:, 1]

# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(Y_test, predict_probs)

# Compute ROC AUC score
roc_auc = roc_auc_score(Y_test, predict_probs)
print(f"ROC AUC Score: {roc_auc}")

# Plot ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guess')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()
