In [1]:
import numpy as np
import pandas as pd

# preprocessing
from sklearn.cross_validation import train_test_split # to divide train and test set
from sklearn import preprocessing # for feature scaling

# feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold

# import linear model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# model evaluation
from sklearn import cross_validation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [2]:
cd Dropbox/Portfolio/DataScience-Portfolio/KDD-1998

/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/KDD-1998


### Load data and separate in Classifier and Labels

In [3]:
# load data
kdd = pd.read_csv('data_class2.csv')

# generate X and Y for preditions
Y = np.ravel(kdd.TARGET_B)  # to flatten array
X = kdd.drop('TARGET_B', axis = 1)

In [4]:
X.shape

(95149, 1913)

In [5]:
Y.mean() # very unbalanced class!!!

0.050751978475864171

### Functions for feature scaling

In [6]:
# Feature scaling - normalisation
def standarisation(train, test):
    scaler = preprocessing.StandardScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    return train, test

# Feature scaling - MinMax Scaler (scales between 0 and 1)
def minMax_standarisation(train, test):
    scaler = preprocessing.MinMaxScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    return train, test

### Function for feature selection

In [7]:
# feature selection function
def feat_select(model, xtrain, test, ytrain):
    selector = model
    selector.fit(xtrain, ytrain)
    X_train_new = selector.transform(xtrain)
    X_test_new = selector.transform(test)
    return X_train_new, X_test_new

### Function for Logistic Regression

In [8]:
def logReg_mod(Xtrain, Ytrain, Xtest, Ytest):
    logit = LogisticRegression()

    # Train the model using the training sets
    logit.fit(Xtrain, Ytrain)
    predicted = logit.predict(Xtest)
    
    print("Train set")
    print("Accuracy: %.3f" % metrics.accuracy_score(Ytrain, logit.predict(Xtrain)))
    print('CrossVal: %.3f' % cross_validation.cross_val_score(logit, Xtrain, Ytrain, cv=5).mean())
    print('=================')
    print("Test set")
    print("Accuracy: %.3f" % metrics.accuracy_score(Ytest, predicted))
    print("Auc: %.3f" % roc_auc_score(Ytest, predicted))
    return logit

### Function to create Random Forest classifier

In [9]:
def rf_mod(Xtrain, Ytrain, Xtest, Ytest):
    logit = RandomForestClassifier(n_estimators = 500, random_state = 1)

    # Train the model using the training sets
    logit.fit(Xtrain, Ytrain)
    predicted = logit.predict(Xtest)
    
    print("Train set")
    print("Accuracy: %.3f" % metrics.accuracy_score(Ytrain, logit.predict(Xtrain)))
    print('CrossVal: %.3f' % cross_validation.cross_val_score(logit, Xtrain, Ytrain, cv=5).mean())
    print('=================')
    print("Test set")
    print("Accuracy: %.3f" % metrics.accuracy_score(Ytest, predicted))
    print("Auc: %.3f" % roc_auc_score(Ytest, predicted))
    return logit

### Function for confusion matrix

In [10]:
def confusion_mtx(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm.columns.name = 'Predicted label'
    cm.index.name = 'True label'
    error_rate = (y_pred != y_test).mean()
    print('error rate: %.3f' % error_rate)
    return cm

### Function to split train and test set and normalise predictors

In [11]:
# function to split test and train and normalise
def split_standarise(X,Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.9, random_state=42)
    X_train, X_test = standarisation(X_train, X_test)
    return X_train, X_test, Y_train, Y_test

### First logistic regression model

In [12]:
# separate testing and training set + normalise
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train.shape, X_test.shape

((9514, 1913), (85635, 1913))

In [13]:
# Remove Zero Variance features
X_train, X_test = feat_select(VarianceThreshold(), X_train, X_test, Y_train)
X_train.shape

(9514, 1763)

In [14]:
# select top 10% features
X_train, X_test = feat_select(SelectPercentile(f_classif, percentile = 10), X_train, X_test, Y_train)
X_train.shape, X_test.shape

((9514, 177), (85635, 177))

In [15]:
# run first log reg model with 234 features
logit = logReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Accuracy: 0.953
CrossVal: 0.950
Test set
Accuracy: 0.947
Auc: 0.500


### Second Log Reg: recursive feature elimination

In [16]:
# Recursive Feature Selection (RFS)
logit_RFS = SelectFromModel(logit, prefit=True)
X_train = logit_RFS.transform(X_train)
X_test = logit_RFS.transform(X_test)

X_train.shape

(9514, 59)

In [17]:
logit = logReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Accuracy: 0.953
CrossVal: 0.952
Test set
Accuracy: 0.948
Auc: 0.500



### Third logitstic regression model: select K best features

In [18]:
# separate train and test and normalise
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train, X_test = feat_select(VarianceThreshold(), X_train, X_test, Y_train)

# select 10 best features
X_train, X_test = feat_select(SelectKBest(f_classif, k=10), X_train, X_test, Y_train)
X_train.shape

(9514, 10)

In [19]:
# run second log reg model with 10 features
logit = logReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Accuracy: 0.952
CrossVal: 0.952
Test set
Accuracy: 0.949
Auc: 0.500


### Fourth Log Reg: select features using non normal distribution

In [20]:
# separate train and test and MinMax Scaler
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.9, random_state=42)

# use minmax scaler not to generate negative numbers so I can yse chi2 for feature selection
X_train, X_test = minMax_standarisation(X_train, X_test) 
X_train, X_test = feat_select(VarianceThreshold(), X_train, X_test, Y_train)

# select 10 best features
X_train, X_test = feat_select(SelectKBest(chi2, k=10), X_train, X_test, Y_train)
X_train.shape

(9514, 10)

In [21]:
# run log reg model with 10 features
logit = logReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Accuracy: 0.952
CrossVal: 0.952
Test set
Accuracy: 0.949
Auc: 0.500


In [22]:
# separate train and test and MinMax Scaler
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.9, random_state=42)

# use minmax scaler not to generate negative numbers so I can yse chi2 for feature selection
X_train, X_test = minMax_standarisation(X_train, X_test) 
X_train, X_test = feat_select(VarianceThreshold(), X_train, X_test, Y_train)

# select 5 best features
X_train, X_test = feat_select(SelectKBest(chi2, k=5), X_train, X_test, Y_train)
X_train.shape

(9514, 5)

In [23]:
logit = logReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Accuracy: 0.952
CrossVal: 0.952
Test set
Accuracy: 0.949
Auc: 0.500


### First Random Forest Model: select 10% best features

In [24]:
# sep train and test, normalise, select top 10% features
# separate train and test and normalise
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train, X_test = feat_select(VarianceThreshold(), X_train, X_test, Y_train)
X_train, X_test = feat_select(SelectPercentile(f_classif, percentile = 10), X_train, X_test, Y_train)
RF = rf_mod(X_train, Y_train, X_test, Y_test)

Train set
Accuracy: 1.000
CrossVal: 0.952
Test set
Accuracy: 0.949
Auc: 0.500


### Second Random Forest: Recursive Feature Elimination

In [25]:
# Recursive Feature Selection (RFS)
rf_RFS = SelectFromModel(RF, prefit=True)
X_train = rf_RFS.transform(X_train)
X_test = rf_RFS.transform(X_test)

X_train.shape

(9514, 46)

In [26]:
RF = rf_mod(X_train, Y_train, X_test, Y_test)

Train set
Accuracy: 1.000
CrossVal: 0.952
Test set
Accuracy: 0.949
Auc: 0.500


### Third Random Forest: select 10 features following feature importance

In [27]:
feat_imp = pd.DataFrame(RF.feature_importances_)
feat_imp.columns = ['feature']
ind = feat_imp.sort_values('feature', axis=0, ascending=False).head(10).index
feat_imp['Feat_name'] = pd.Series(X.columns[ind])
feat_imp.head(10)

Unnamed: 0,feature,Feat_name
0,0.022332,CHIL2
1,0.014696,AGE904
2,0.025689,ETH15
3,0.017335,POP90C4
4,0.016139,POP903
5,0.023681,ETH5
6,0.008233,POP90C2
7,0.012833,POP90C3
8,0.023291,AGE901
9,0.020775,ETH13


In [28]:
## best features according to feature importance
col_names = feat_imp.Feat_name.head(10)

In [29]:
## Reduce original dataset to 10 features following Random Forest Importance
new_X = X[col_names]
new_X.head()

Unnamed: 0,CHIL2,AGE904,ETH15,POP90C4,POP903,ETH5,POP90C2,POP90C3,AGE901,ETH13
0,42,40,0,47,332,11,35,65,39,11
1,46,32,1,50,998,6,0,0,34,2
2,40,37,0,49,2669,2,2,98,35,2
3,35,34,0,54,219,32,8,92,32,31
4,43,36,0,46,761,1,0,0,33,0


In [30]:
X_train, X_test, Y_train, Y_test = split_standarise(new_X,Y)
RF = rf_mod(X_train, Y_train, X_test, Y_test)

Train set
Accuracy: 0.994
CrossVal: 0.948
Test set
Accuracy: 0.945
Auc: 0.500


### Function for Random Forest with unbalanced samples

In [31]:
def rf_mod2(Xtrain, Ytrain, Xtest, Ytest, sample_weights):
    logit = RandomForestClassifier(n_estimators = 500, random_state = 1)

    # Train the model using the training sets
    logit.fit(Xtrain, Ytrain, sample_weight = sample_weight)
    predicted = logit.predict(Xtest)
    
    print("Train set")
    print("Accuracy: %.3f" % metrics.accuracy_score(Ytrain, logit.predict(Xtrain)))
    print('CrossVal: %.3f' % cross_validation.cross_val_score(logit, Xtrain, Ytrain, cv=5).mean())
    print('=================')
    print("Test set")
    print("Accuracy: %.3f" % metrics.accuracy_score(Ytest, predicted))
    print("Auc: %.3f" % roc_auc_score(Ytest, predicted))
    return logit

In [32]:
# regenerate train and test set, and select 10% best features
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train, X_test = feat_select(VarianceThreshold(), X_train, X_test, Y_train)
X_train, X_test = feat_select(SelectPercentile(f_classif, percentile = 10), X_train, X_test, Y_train)

sample_weight = np.array([20 if i == 1 else 1 for i in Y_train])

In [33]:
RF = rf_mod2(X_train, Y_train, X_test, Y_test, sample_weight)

Train set
Accuracy: 1.000
CrossVal: 0.952
Test set
Accuracy: 0.949
Auc: 0.500


### XGBoost

In [34]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV   

In [35]:
X_train.shape

(9514, 177)

In [36]:
xgbparams = {'learning_rate' : 0.1, 
                     'n_estimators': 50,
                     'max_depth': 5,
                     'min_child_weight': 1,
                     'gamma': 0,
                     'subsample': 0.8,
                     'colsample_bytree': 0.8,
                     'objective' : 'binary:logistic',
                     'nthread': 4,
                     'scale_pos_weight': 1,
                     'seed' : 27}

### Optimise max_depth and min_child_weight

In [37]:
xgb_model = XGBClassifier(xgbparams, nthread=-1)

clf = GridSearchCV(
    xgb_model,
    {
        'max_depth': [1,3,5,7,9],
        'min_child_weight':[1,3,5,7]
    },
    cv=5,
    verbose=0,
    n_jobs=1,
    scoring = 'roc_auc'
)

clf.fit(X_train, Y_train)
clf.best_params_

{'max_depth': 3, 'min_child_weight': 1}

### Optimise subsample and colsample_bytree

In [38]:
#from sklearn.metrics import ndcg_scorer
xgbparams = {'learning_rate' : 0.1, 
                     'n_estimators': 50,
                     'max_depth': 3,
                     'min_child_weight': 1,
                     'gamma': 0,
                     'subsample': 0.8,
                     'colsample_bytree': 0.8,
                     'objective' : 'binary:logistic',
                     'nthread': 4,
                     'scale_pos_weight': 1,
                     'seed' : 27}

xgb_model = XGBClassifier(xgbparams, nthread=-1)

clf = GridSearchCV(
    xgb_model,
    {
    'subsample':[i/10 for i in range(6,10)],
    'colsample_bytree':[i/10 for i in range(6,10)],
    },
    cv=5,
    verbose=0,
    n_jobs=1,
    scoring = 'roc_auc'
)

clf.fit(X_train, Y_train)
clf.best_params_

{'colsample_bytree': 0.6, 'subsample': 0.6}

### Optimise regularisation rate

In [39]:
xgbparams = {'learning_rate' : 0.1, 
                     'n_estimators': 50,
                     'max_depth': 3,
                     'min_child_weight': 1,
                     'gamma': 0,
                     'subsample': 0.6,
                     'colsample_bytree': 0.6,
                     'objective' : 'binary:logistic',
                     'nthread': 4,
                     'scale_pos_weight': 1,
                     'seed' : 27}

xgb_model = XGBClassifier(xgbparams, nthread=-1)

clf = GridSearchCV(
    xgb_model,
    {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]},
    cv=5,
    verbose=0,
    n_jobs=1,
    scoring = 'roc_auc'
)

clf.fit(X_train, Y_train)
clf.best_params_

{'reg_alpha': 1e-05}

### Optimise learning rate and number of estimators

In [40]:
xgbparams = {'learning_rate' : 0.1, 
                     'n_estimators': 50,
                     'max_depth': 3,
                     'min_child_weight': 1,
                     'gamma': 0,
                     'subsample': 0.6,
                     'colsample_bytree': 0.6,
                     'objective' : 'binary:logistic',
                     'nthread': 4,
                     'scale_pos_weight': 1,
                     'reg_alpha': 1e-05,
                     'seed' : 27}

xgb_model = XGBClassifier(xgbparams, nthread=-1)

clf = GridSearchCV(
    xgb_model,
    {'learning_rate':[0.01, 0.03, 0.1, 0.3, 1],
    'n_estimators':[10, 100, 500, 1000]},
    cv=5,
    verbose=0,
    n_jobs=1,
    scoring = 'roc_auc'
)

clf.fit(X_train, Y_train)
clf.best_params_

{'learning_rate': 0.01, 'n_estimators': 10}

In [41]:
predictions_train = clf.predict(X_train)
predictions_test = clf.predict(X_test)

In [42]:
print("Train set")
print("Accuracy: %.3f" % metrics.accuracy_score(Y_train, predictions_train))
print('=================')
print("Test set")
print("Accuracy: %.3f" % metrics.accuracy_score(Y_test, predictions_test))
print("Auc: %.3f" % roc_auc_score(Y_test, predictions_test))

Train set
Accuracy: 0.952
Test set
Accuracy: 0.949
Auc: 0.500
