In [28]:
import numpy as np
import pandas as pd
from subprocess import check_output
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score as auc
import time
from sklearn.ensemble import ExtraTreesClassifier

## Load Data and Remove Constant and Duplicate Columns

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Remove Constant Columns 
columnsToRemove = []
for col in train.columns:
    if train[col].std() == 0:
        columnsToRemove.append(col)
    
train.drop(columnsToRemove, axis=1, inplace=True)

# Remove duplicate Columns 

columnsToRemove = []
columns = train.columns
for i in range(len(columns)-1):
    v = train[columns[i]].values
    for j in range(i+1, len(columns)):
        if np.array_equal(v, train[columns[j]].values):
            columnsToRemove.append(columns[j])
            

train.drop(columnsToRemove, axis=1, inplace=True)

target = train['TARGET']
features = train.drop(['ID','TARGET'], axis=1)

In [6]:
verySimpleLearner = ensemble.GradientBoostingClassifier(n_estimators=10, max_features=1, max_depth=3,
                    min_samples_leaf=100,learning_rate=0.3, subsample=0.65, loss='deviance', random_state=1)
X_train,X_valid,y_train,y_valid = cross_validation.train_test_split(features, target, test_size=0.5, random_state=1)
startTime = time.time()

singleFeatureTable = pd.DataFrame(index=range(len(X_train.columns)), columns = ['feature','AUC'])
for k,feature in enumerate(X_train.columns):
    trainInputFeature = X_train[feature].values.reshape(-1,1)
    validInputFeature = X_train[feature].values.reshape(-1,1)
    verySimpleLearner.fit(trainInputFeature,y_train)
    
    trainAUC = auc(y_train, verySimpleLearner.predict_proba(trainInputFeature)[:,1])
    validAUC = auc(y_train, verySimpleLearner.predict_proba(trainInputFeature)[:,1])
    singleFeatureTable.ix[k,'feature'] = feature
    singleFeatureTable.ix[k,'AUC'] = validAUC
    
   
    
print("finished evaluating single features. took %.2f minutes" %((time.time()-startTime)/60))


finished evaluating single features. took 0.88 minutes


In [5]:
 # show the scatter plot of the individual feature performance 
plt.figure(); plt.hist(validAUC, 50, normed=1, facecolor='blue', alpha=0.75)
plt.xlabel('AUC'); plt.ylabel('frequency'); plt.title('single feature AUC histogram');plt.show()

IndentationError: unexpected indent (<ipython-input-5-7ed7da15fddc>, line 4)

In [5]:
singleFeatureTable = singleFeatureTable.sort('AUC', axis=0, ascending=False).reset_index(drop=True)
singleFeatureTable

Unnamed: 0,feature,AUC
0,saldo_var30,0.7249929
1,var15,0.7189204
2,saldo_var42,0.7134969
3,saldo_medio_var5_hace2,0.7112311
4,saldo_medio_var5_ult3,0.7068748
5,saldo_medio_var5_ult1,0.7051733
6,saldo_var5,0.699792
7,num_meses_var5_ult3,0.6940056
8,num_var35,0.6932585
9,num_var4,0.6930124


## Generate 400 five wise combination features and select the top most according to the AUC score

In [7]:
# find interesting five wise combinations 
numFeatureInCombination = 5
numCombinations = 400
numBestSingleFeatureToSelectFrom = 20

X_train,X_valid,y_train,y_valid = cross_validation.train_test_split(features, target, test_size=0.5, random_state=1)

weakLearner = ensemble.GradientBoostingClassifier(n_estimators=30, max_features=2, max_depth=3, min_samples_leaf=100,
                                                 subsample=0.65, loss='deviance', random_state=1)
featureToUse = singleFeatureTable.ix[0:numBestSingleFeatureToSelectFrom-1,'feature']
featureColumnNames = ['feature' + str(x+1) for x in range(numFeatureInCombination)]
featureCombinationTable = pd.DataFrame(index=range(numCombinations), columns = featureColumnNames + ['combinedAUC'])

# for combination iteration

startTime = time.time()
for combination in range(numCombinations):
    # generate random feature combination
    randomSelectionFeatures = sorted(np.random.choice(len(featureToUse), numFeatureInCombination, replace=False))
    
    
    # Store the feature names
    combinationFeatureNames = [featureToUse[x] for x in randomSelectionFeatures]
    for i in range(len(randomSelectionFeatures)):
        featureCombinationTable.ix[combination,featureColumnNames[i]] = combinationFeatureNames[i]
    
    
    # build features matrix to get the combination AUC
    trainInputFeatures = X_train.ix[:,combinationFeatureNames]
    validInputFeatures = X_valid.ix[:,combinationFeatureNames]
    
    # train learner
    weakLearner.fit(trainInputFeatures, y_train)
    # store AUC results
    validAUC = auc(y_valid, weakLearner.predict_proba(validInputFeatures)[:,1])
    featureCombinationTable.ix[combination,'combinedAUC'] = validAUC

validAUC = np.array(featureCombinationTable.ix[:,'combinedAUC'])
print("(min,max) AUC = (%.4f,%.4f). took %.1f minutes" % (validAUC.min(),validAUC.max(), (time.time()-startTime)/60))


(min,max) AUC = (0.6959,0.8274). took 4.1 minutes


In [None]:

# show the histogram of the feature combinations performance 
plt.figure(); plt.hist(validAUC, 100, facecolor='blue', alpha=0.75)
plt.xlabel('AUC'); plt.ylabel('frequency'); plt.title('feature combination AUC histogram'); 
        plt.show()

In [6]:
featureCombinationTable = featureCombinationTable.sort('combinedAUC', axis=0, ascending=False).reset_index(drop=True)
featureCombinationTable.ix[:20,:]

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,combinedAUC
0,saldo_var30,var15,saldo_medio_var5_hace2,num_var30,var38,0.827516
1,saldo_var30,var15,saldo_medio_var5_hace2,num_var4,var38,0.8274565
2,saldo_var30,var15,num_meses_var5_ult3,var36,var38,0.8271732
3,saldo_var30,var15,saldo_medio_var5_hace2,saldo_medio_var5_ult3,var38,0.8271731
4,saldo_var30,var15,saldo_medio_var5_hace3,var38,num_meses_var39_vig_ult3,0.8262427
5,saldo_var30,var15,ind_var30,ind_var5,var38,0.8250688
6,saldo_var30,var15,saldo_var42,saldo_var5,var38,0.8249834
7,saldo_var30,var15,saldo_medio_var5_hace2,num_var4,num_var45_hace2,0.8196213
8,saldo_var30,var15,saldo_var42,num_var35,num_meses_var39_vig_ult3,0.8192213
9,saldo_var30,var15,num_var4,var36,num_var45_hace2,0.8184344


#### Its easy to see that lot of obverlapping features

## Visualise the same by building Pairwise Overlap Matrix

In [40]:
combinationOverlapMatrix = np.zeros((numCombinations,numCombinations))

for comb_i in range(numCombinations):
    for comb_j in range(comb_i+1,numCombinations):
        # get the features list for each combination
        featuresComb_i = [featureCombinationTable.ix[comb_i,featureColumnNames[x]] for x in range(numFeatureInCombination)]
        featuresComb_j = [featureCombinationTable.ix[comb_j,featureColumnNames[x]] for x in range(numFeatureInCombination)]
        # store the number of ovaerlapping features
        combinationOverlapMatrix[comb_i,comb_j] = 2*numFeatureInCombination-len(set(featuresComb_i+featuresComb_j))
        combinationOverlapMatrix[comb_j,comb_i] = combinationOverlapMatrix[comb_i,comb_j]

In [10]:
plt.figure(); plt.imshow(combinationOverlapMatrix,cmap='autumn'); plt.title('combination overlap'); plt.colorbar()

<matplotlib.colorbar.Colorbar instance at 0x000000002C451288>

In [23]:
featureNames = ['saldo_var30','var15','saldo_var5','ind_var30','var38']
finalFeatures = features[featureNames]
testData = test[featureNames]
X_train,X_valid,y_train,y_valid = cross_validation.train_test_split(finalFeatures, target, test_size=0.8, random_state=1)

model = ensemble.GradientBoostingClassifier(n_estimators=30, max_features=2, max_depth=3, min_samples_leaf=100,
                                                 subsample=0.65, loss='deviance', random_state=1)

model.fit(X_train, y_train)
validAUC = auc(y_valid, model.predict_proba(X_valid)[:,1])
print validAUC

0.821969710944


In [42]:
predicted_target = model.predict_proba(testData)

In [43]:
test['TARGET']= predicted_target
submit = ['ID','TARGET']

In [44]:
submit_file = test[submit]
submit_file.to_csv('submission.csv')

In [24]:
model = ExtraTreesClassifier(n_estimators=30,max_features=2, max_depth=3, min_samples_leaf=100)
model.fit(X_train, y_train)
validAUC = auc(y_valid, model.predict_proba(X_valid)[:,1])
print validAUC

0.756882224304


In [26]:
model.feature_importances_

array([  1.76324601e-03,   1.38824238e-01,   1.78754218e-04,
         5.53223385e-01,   6.01037698e-03])