In [1]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split # to divide train and test set
from sklearn import preprocessing # for feature scaling

# feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression

# import linear model
from sklearn.linear_model import LinearRegression

# model evaluation
from sklearn import cross_validation

In [2]:
cd Dropbox/Portfolio/DataScience-Portfolio/KDD-1998

/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/KDD-1998


### Load data and separate into predictors and labels

In [3]:
# load data (path is relative to the current working directory)
kdd = pd.read_csv('kdd_reg_fe_log_small.csv')


# generate X and Y for predictions: column '0' holds the labels,
# every other column is a predictor
Y = np.ravel(kdd['0'])  # to flatten array
X = kdd.drop('0', axis = 1)

In [4]:
# (rows, columns) of the predictor frame
X.shape

(4829, 1244)

### Functions for feature scaling

In [5]:
# Feature scaling - standardisation (StandardScaler)
def standarisation(train, test):
    """Scale both sets with a StandardScaler fitted on the training set only.

    Parameters
    ----------
    train : array-like
        Training predictors; the scaler is fitted on these.
    test : array-like
        Test predictors; transformed with the scaler fitted on ``train``.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled)`` as produced by ``scaler.transform``.
    """
    fitted_scaler = preprocessing.StandardScaler().fit(train)
    return fitted_scaler.transform(train), fitted_scaler.transform(test)

# Feature scaling - MinMax Scaler (scales between 0 and 1)
def minMax_standarisation(train, test):
    """Scale both sets with a MinMaxScaler fitted on the training set only.

    Parameters
    ----------
    train : array-like
        Training predictors; the scaler is fitted on these.
    test : array-like
        Test predictors; transformed with the scaler fitted on ``train``.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled)`` as produced by ``scaler.transform``.
    """
    fitted_scaler = preprocessing.MinMaxScaler().fit(train)
    return fitted_scaler.transform(train), fitted_scaler.transform(test)

### Function for feature selection

In [6]:
# feature selection function
def feat_select(model, xtrain, test, ytrain):
    """Fit a feature selector on the training data and apply it to both sets.

    Parameters
    ----------
    model : object
        Selector exposing ``fit(X, y)`` and ``transform(X)``; it is fitted
        in place.
    xtrain : array-like
        Training predictors used to fit the selector.
    test : array-like
        Test predictors, reduced with the selector fitted on ``xtrain``.
    ytrain : array-like
        Training labels passed to ``fit``.

    Returns
    -------
    tuple
        ``(reduced_train, reduced_test)``.
    """
    model.fit(xtrain, ytrain)
    reduced_train = model.transform(xtrain)
    reduced_test = model.transform(test)
    return reduced_train, reduced_test

### Function for Linear Regression

In [7]:
# Fit a linear regression and report its fit on the train and test sets
def linReg_mod(Xtrain, Ytrain, Xtest, Ytest):
    """Fit a LinearRegression on the training set and print its scores.

    For the training set and then the test set, two lines are printed:
    the mean of the squared residuals (labelled "Residual sum of squares"
    in the output, although the value is a mean) and ``regr.score`` for that
    set (labelled "Rsquared").

    Parameters
    ----------
    Xtrain, Ytrain : array-like
        Training predictors and labels used to fit the model.
    Xtest, Ytest : array-like
        Held-out predictors and labels used only for reporting.

    Returns
    -------
    LinearRegression
        The fitted model.
    """
    regr = LinearRegression()
    regr.fit(Xtrain, Ytrain)

    report_sets = (("Train set", Xtrain, Ytrain), ("Test set", Xtest, Ytest))
    for position, (title, features, target) in enumerate(report_sets):
        if position:
            # separator between the train and the test report
            print('=================')
        print(title)
        print("Residual sum of squares: %.2f" % np.mean((regr.predict(features) - target) ** 2))
        print('Rsquared: %.2f' % regr.score(features, target))
    return regr

### Function to print results of model

In [8]:
# Print results of model
def print_results(clf, X_train, Y_train, X_test, Y_test):
    """Print the scores of an already fitted model on the train and test sets.

    For each set, prints the mean of the squared residuals (labelled
    "Residual sum of squares" in the output, although the value is a mean)
    and ``clf.score`` for that set (labelled "Rsquared").

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    X_train, Y_train : array-like
        Training predictors and labels.
    X_test, Y_test : array-like
        Test predictors and labels.
    """
    report_sets = (("Train set", X_train, Y_train), ("Test set", X_test, Y_test))
    for position, (title, features, target) in enumerate(report_sets):
        if position:
            # separator between the train and the test report
            print('=================')
        print(title)
        print("Residual sum of squares: %.2f" % np.mean((clf.predict(features) - target) ** 2))
        print('Rsquared: %.2f' % clf.score(features, target))

### Function to split train and test set and normalise predictors

In [9]:
# function to split test and train and normalise
def split_standarise(X, Y, test_size=0.4, random_state=42):
    """Split into train/test sets and standardise the predictors.

    The scaler is fitted on the training predictors only (see
    ``standarisation``) and then applied to both sets; the labels are
    returned unscaled.

    Parameters
    ----------
    X : array-like
        Predictors.
    Y : array-like
        Labels.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.4, the value used
        throughout this notebook.
    random_state : int, optional
        Forwarded to ``train_test_split``. Defaults to 42 so that every cell
        works on the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` with scaled predictors.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    X_train, X_test = standarisation(X_train, X_test)
    return X_train, X_test, Y_train, Y_test

### First linear model

In [10]:
# separate testing and training set + standardise the predictors
# (the scaler is fitted on the training set only, see split_standarise)
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train.shape, X_test.shape

((2897, 1244), (1932, 1244))

In [11]:
# select top 10% features, ranked with f_regression; the selector is fitted on
# the training set and applied to both sets.
# Note: this rebinds X_train / X_test, so re-running the cell on its own
# selects from the already reduced sets.
X_train, X_test = feat_select(SelectPercentile(f_regression, percentile = 10), X_train, X_test, Y_train)
X_train.shape, X_test.shape

((2897, 125), (1932, 125))

In [12]:
# run first linear model with the 125 features selected above
# (`regr` is reused later by SelectFromModel(regr, prefit=True))
regr = linReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 0.14
Rsquared: 0.66
Test set
Residual sum of squares: 0.17
Rsquared: 0.58


We observe substantially less overfitting when we use 125 of these features, compared with the same model in the previous notebooks.

### Second linear model

In [13]:
# start again from the full feature set: separate train and test and standardise
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)

# select 10 best features (ranked with f_regression, fitted on the training set)
X_train, X_test = feat_select(SelectKBest(f_regression, k=10), X_train, X_test, Y_train)
X_train.shape

(2897, 10)

In [14]:
# fit and report the second linear model, using the 10 selected features
regr2 = linReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 0.16
Rsquared: 0.63
Test set
Residual sum of squares: 0.16
Rsquared: 0.59


Better model. Performance is similar to that of the model built in the previous notebook, which was used to estimate log(donation amount).

### Third linear Model

In [15]:
# separate train and test, standardise, select top 10% features
# (same calls as for the first linear model, so `regr` fitted there matches
# these 125 columns)
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train, X_test = feat_select(SelectPercentile(f_regression, percentile = 10), X_train, X_test, Y_train)

In [16]:
# Model-based feature selection with SelectFromModel (note: this is a single
# pass, not a recursive procedure).
# prefit=True reuses `regr`, the linear regression already fitted for the first
# linear model on the same top-10% feature set; this cell therefore depends on
# that earlier cell having been run.
# NOTE(review): presumably features are kept when their coefficient magnitude
# exceeds SelectFromModel's default threshold; confirm in the scikit-learn docs.
regr_RFS = SelectFromModel(regr, prefit=True)
X_train = regr_RFS.transform(X_train)
X_test = regr_RFS.transform(X_test)

X_train.shape

(2897, 8)

In [17]:
# fit and report the third linear model, using the features kept by SelectFromModel
regr3 = linReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 0.19
Rsquared: 0.56
Test set
Residual sum of squares: 0.19
Rsquared: 0.52


Performance is not as good as that of the previous model.

### Try different feature scaling (MinMax)

In [18]:
# split only (same test_size and random_state as split_standarise); scaling is
# done separately below
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

# scale features with MinMax (scaler fitted on the training set only)
X_train, X_test = minMax_standarisation(X_train, X_test)

# select 10 best features
X_train, X_test = feat_select(SelectKBest(f_regression, k=10), X_train, X_test, Y_train)
# NOTE(review): this rebinds `regr3`, which already holds the third linear model
regr3 = linReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 0.16
Rsquared: 0.63
Test set
Residual sum of squares: 0.16
Rsquared: 0.59


Similar performance.

### Feature selection with grid search and BayesianRidge

In [19]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge

# fresh split + standardisation of the full feature set
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)

ridge = BayesianRidge()      # model object
cv = KFold(len(Y_train), 2)  # 2-fold cross-validation generator for model selection
anova = SelectPercentile(f_regression) # feature selection (percentile is set by the grid search)

# chain selection and model so both are fitted together for every grid point
clf = Pipeline([('anova', anova), ('ridge', ridge)])

# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, {'anova__percentile': [2, 5, 10, 20, 30, 50]}, cv=cv)
clf.fit(X_train, Y_train)  # run the search on the training set; sets the best parameters

GridSearchCV(cv=sklearn.cross_validation.KFold(n=2897, n_folds=2, shuffle=False, random_state=None),
       error_score='raise',
       estimator=Pipeline(steps=[('anova', SelectPercentile(percentile=10,
         score_func=<function f_regression at 0x10fa7d048>)), ('ridge', BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'anova__percentile': [2, 5, 10, 20, 30, 50]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [20]:
# percentile of features chosen by the grid search
clf.best_params_

{'anova__percentile': 2}

In [21]:
# report the fitted grid search on the training and test sets
print_results(clf, X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 0.15
Rsquared: 0.64
Test set
Residual sum of squares: 0.16
Rsquared: 0.60


The Bayesian Ridge is slightly better than the linear regression.

### Support Vector Regression

In [22]:
from sklearn.svm import SVR

# fresh split + standardisation, then keep the top 5% of features
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train, X_test = feat_select(SelectPercentile(f_regression, percentile = 5), X_train, X_test, Y_train)

# try several values of the SVM regularization parameter C
# NOTE(review): the value of C is not printed, so the result blocks can only be
# told apart by their order (0.01, 0.1, 1.0, 10)
for C in [0.01, .1, 1.0, 10]:
    # SVR (support vector regression) with a linear kernel
    svr = SVR(kernel='linear', C=C).fit(X_train, Y_train)
    print_results(svr, X_train, Y_train, X_test, Y_test)
    print()
    print()

Train set
Residual sum of squares: 0.15
Rsquared: 0.64
Test set
Residual sum of squares: 0.16
Rsquared: 0.58


Train set
Residual sum of squares: 0.15
Rsquared: 0.64
Test set
Residual sum of squares: 0.16
Rsquared: 0.58


Train set
Residual sum of squares: 0.15
Rsquared: 0.64
Test set
Residual sum of squares: 0.16
Rsquared: 0.58


Train set
Residual sum of squares: 0.15
Rsquared: 0.64
Test set
Residual sum of squares: 0.16
Rsquared: 0.58




## Conclusion

The Bayesian Ridge seems to be the best model. It would need some model optimisation, so I will leave it for later and go forward with the linear regression with the 10 best features for the rest of the exercise.

Among the linear regressions, the best model is the one with the 10 best features: it has the lowest squared error and the highest R squared on the test set, with similar scores on the training set. Here I will redo the feature selection to identify the 10 best features and their p-values.

In [23]:
# redo the split and the selection outside the pipeline so that the fitted
# selector (and its p-values) can be inspected
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)

# keep the top 2% of features; `percentile` is passed by keyword, like every
# other SelectPercentile call in this notebook, instead of positionally
anova = SelectPercentile(f_regression, percentile=2) # feature selection
anova.fit(X_train, Y_train)
X_train_new = anova.transform(X_train)
X_test_new = anova.transform(X_test)

In [24]:
# (rows, features kept by the 2% selection)
X_train_new.shape

(2897, 25)

In [25]:
# p-value of every feature seen by the selector, indexed by column position
pval = pd.Series(anova.pvalues_)
# positions of the 10 features with the smallest p-values
# (the earlier stand-alone sort was discarded without being displayed, so the
# series is now sorted only once here)
ind = pval.sort_values().head(10).index

In [26]:
# table of the 10 features with the smallest p-values: names are looked up in
# X.columns by position, p-values are the 10 smallest in ascending order
feat = pd.DataFrame(
    {'feature': X.columns[ind], 'p_value': pval.sort_values().head(10).values},
    columns=['feature', 'p_value'],
)
feat

Unnamed: 0,feature,p_value
0,MAXRAMNT_log,0.0
1,LASTGIFT_log,0.0
2,RFA_2F_asi,1.392417e-197
3,MINRAMNT_log,1.0488320000000001e-189
4,RFA_2_L4D,1.0280610000000001e-153
5,RFA_2_L1G,2.348069e-90
6,RFA_3_S4D,1.651415e-89
7,RFA_6_S4D,7.223653e-65
8,RFA_2A_F,5.346937e-56
9,PEPSTRFL,1.575762e-55


Same values as in kdd_regression_2nd_Attempt