In [1]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split # to divide train and test set
from sklearn import preprocessing # for feature scaling

# feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression

# import linear model
from sklearn.linear_model import LinearRegression

# model evaluation
from sklearn import cross_validation

In [2]:
cd Dropbox/Portfolio/DataScience-Portfolio/KDD-1998

/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/KDD-1998


### Load data and separate into predictors and labels

In [3]:
# read the engineered KDD-1998 regression data set
kdd = pd.read_csv('kdd_reg_fe_small.csv')


# column '0' is the label; every remaining column is a predictor
X = kdd.drop('0', axis = 1)
Y = kdd['0'].values.ravel()  # flat 1-D array of labels for the predictions

In [4]:
# (rows, predictors) once the label column '0' has been dropped
X.shape

(4829, 1244)

### Functions for feature scaling

In [5]:
# Feature scaling - standardisation (StandardScaler)
def standarisation(train, test):
    """Scale ``train`` and ``test`` with a ``StandardScaler`` fitted on ``train``.

    The scaler only ever sees the training data when it is fitted; the same
    fitted scaler is then applied to both sets.

    Returns the pair ``(scaled_train, scaled_test)``.
    """
    fitted = preprocessing.StandardScaler().fit(train)
    return fitted.transform(train), fitted.transform(test)

# Feature scaling - MinMax Scaler (training features end up between 0 and 1)
def minMax_standarisation(train, test):
    """Rescale ``train`` and ``test`` with a ``MinMaxScaler`` fitted on ``train``.

    The minimum and maximum of each feature are learned from the training
    data only and then reused, unchanged, to transform the test data.

    Returns the pair ``(scaled_train, scaled_test)``.
    """
    fitted = preprocessing.MinMaxScaler().fit(train)
    return fitted.transform(train), fitted.transform(test)

### Function for feature selection

In [6]:
# feature selection helper
def feat_select(model, xtrain, test, ytrain):
    """Fit a feature selector on the training data and reduce both data sets.

    ``model`` is any object exposing ``fit(X, y)`` and ``transform(X)``.
    It is fitted in place on ``(xtrain, ytrain)`` and then used to transform
    ``xtrain`` first and ``test`` second.

    Returns the tuple ``(reduced_train, reduced_test)``.
    """
    model.fit(xtrain, ytrain)
    return tuple(model.transform(part) for part in (xtrain, test))

### Function for Linear Regression

In [7]:
# Fit a linear regression and report its train/test performance
def linReg_mod(Xtrain, Ytrain, Xtest, Ytest):
    """Fit a ``LinearRegression`` on the training data and print its metrics.

    Parameters
    ----------
    Xtrain, Ytrain : predictors and targets used to fit the model.
    Xtest, Ytest : held-out predictors and targets, used for reporting only.

    Returns
    -------
    The fitted ``LinearRegression`` instance.
    """
    regr = LinearRegression()

    # Train the model using the training sets
    regr.fit(Xtrain, Ytrain)

    # Reporting is delegated to print_results, which prints exactly the lines
    # this function used to duplicate. The name is looked up when linReg_mod
    # is called, so print_results may be defined in a later cell as long as
    # that cell has run before the first call.
    print_results(regr, Xtrain, Ytrain, Xtest, Ytest)
    return regr

### Function to print results of model

In [8]:
# Print train/test metrics of a fitted model
def print_results(clf, X_train, Y_train, X_test, Y_test):
    """Print the mean squared error and R squared of ``clf`` on both data sets.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    X_train, Y_train : training predictors and targets.
    X_test, Y_test : held-out predictors and targets.

    Notes
    -----
    The first figure is ``np.mean`` of the squared residuals, i.e. the mean
    squared error. It used to be printed under the label "Residual sum of
    squares", which names a different (un-averaged) quantity. The second
    figure is whatever ``clf.score`` returns, which is R squared for
    scikit-learn regressors.
    """
    def _report(title, features, target):
        # predict once per data set and reuse the residuals
        residuals = clf.predict(features) - target
        print(title)
        print("Mean squared error: %.2f" % np.mean(residuals ** 2))
        print('Rsquared: %.2f' % clf.score(features, target))

    _report("Train set", X_train, Y_train)
    print('=================')
    _report("Test set", X_test, Y_test)

### Function to split train and test set and normalise predictors

In [9]:
# function to split test and train and standardise the predictors
def split_standarise(X, Y, test_size=0.4, random_state=42):
    """Split ``X``/``Y`` into train and test sets and standardise the predictors.

    Parameters
    ----------
    X, Y : predictors and labels to split.
    test_size : passed to ``train_test_split``; the default of 0.4 is the
        value this helper previously hard-coded.
    random_state : seed passed to ``train_test_split``; the default of 42 is
        the value this helper previously hard-coded, so existing calls give
        the same split as before.

    Returns
    -------
    ``(X_train, X_test, Y_train, Y_test)`` where both predictor sets have been
    passed through ``standarisation``; the labels are left untouched.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    X_train, X_test = standarisation(X_train, X_test)
    return X_train, X_test, Y_train, Y_test

### First linear model

In [10]:
# separate testing and training set (40% held out for testing) and standardise
# the predictors with a scaler fitted on the training set
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train.shape, X_test.shape

((2897, 1244), (1932, 1244))

In [11]:
# select top 10% features, ranked by the univariate f_regression test;
# the selector is fitted on the training set and then applied to both sets
X_train, X_test = feat_select(SelectPercentile(f_regression, percentile = 10), X_train, X_test, Y_train)
X_train.shape, X_test.shape

((2897, 125), (1932, 125))

In [12]:
# run first linear model with the 125 features kept by the top-10% selection
regr = linReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 51.86
Rsquared: 0.67
Test set
Residual sum of squares: 92.95
Rsquared: 0.38


The model overfits the training set. However, with this new set of engineered variables, the overfitting is less pronounced than in the previous notebook, which used the features as is.

### Second linear model

In [13]:
# separate train and test and normalise again: the earlier cells overwrote
# X_train / X_test with the reduced matrices, so start from the full X
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)

# select 10 best features (univariate f_regression ranking, fitted on the training set)
X_train, X_test = feat_select(SelectKBest(f_regression, k=10), X_train, X_test, Y_train)
X_train.shape

(2897, 10)

In [14]:
# run second linear model with the 10 features selected by SelectKBest
regr2 = linReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 68.21
Rsquared: 0.57
Test set
Residual sum of squares: 83.65
Rsquared: 0.44


A better model, although the R squared is still quite low.

### Third linear Model

In [15]:
# separate train and test, normalise, select top 10% features
# (same seed and same selector settings as in the first linear model)
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train, X_test = feat_select(SelectPercentile(f_regression, percentile = 10), X_train, X_test, Y_train)

In [16]:
# Model-based feature selection with SelectFromModel (this is not recursive
# feature elimination): `regr`, the linear model already fitted in the first
# experiment, is passed in prefit, and only the features that pass
# SelectFromModel's default threshold are kept.
# NOTE(review): this relies on the columns here being the same ones `regr`
# was trained on (same split seed and same top-10% selector) - confirm if
# either of those changes.
regr_RFS = SelectFromModel(regr, prefit=True)
X_train = regr_RFS.transform(X_train)
X_test = regr_RFS.transform(X_test)

X_train.shape

(2897, 8)

In [17]:
# run third linear model on the features kept by SelectFromModel
regr3 = linReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 96.06
Rsquared: 0.39
Test set
Residual sum of squares: 101.07
Rsquared: 0.33


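Still overfits slightly, and the fit is worse on both sets than with the 10 best features (test R squared of 0.33 versus 0.44).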

### Try different feature scaling (MinMax)

In [18]:
# same split settings as split_standarise (test_size=0.4, random_state=42),
# but done by hand so a different scaler can be applied
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

# normalise features with MinMax
X_train, X_test = minMax_standarisation(X_train, X_test)

# select 10 best features
X_train, X_test = feat_select(SelectKBest(f_regression, k=10), X_train, X_test, Y_train)
# NOTE(review): `regr3` is reused here, so the third linear model fitted
# earlier is no longer reachable under that name
regr3 = linReg_mod(X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 68.21
Rsquared: 0.57
Test set
Residual sum of squares: 83.65
Rsquared: 0.44


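No improvement with respect to the second linear model (10 best features with standard scaling): the scores are identical. This is expected, because linearly rescaling each feature changes neither the F-test ranking nor the predictions of an ordinary least squares fit.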

## Try normalizing label with log 

In [19]:
# Refit on the natural log of the label, reusing the MinMax-scaled 10-feature
# matrices left in X_train / X_test by the previous cell.
# NOTE(review): the 10 features were selected against the untransformed label,
# np.log needs strictly positive amounts, and the printed error and R squared
# are now measured on the log scale - confirm this is what is intended.
regr3 = linReg_mod(X_train, np.log(Y_train), X_test, np.log(Y_test))

Train set
Residual sum of squares: 0.16
Rsquared: 0.63
Test set
Residual sum of squares: 0.16
Rsquared: 0.59


This model is much better than its equivalent in the previous notebook, with an R squared of 0.59 here versus 0.51 previously. 

### Feature selection with grid search and BayesianRidge

In [20]:
# NOTE(review): sklearn.grid_search and sklearn.cross_validation are the legacy
# module names; recent scikit-learn releases expose GridSearchCV and KFold from
# sklearn.model_selection instead (and KFold takes n_splits there) - confirm
# against the installed version before re-running.
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge

# start again from the full, standardised feature matrix
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)

ridge = BayesianRidge()      # model object
cv = KFold(len(Y_train), 2)  # 2-fold cross-validation over the training rows, used for model selection
anova = SelectPercentile(f_regression) # univariate feature selection step; its percentile is tuned below

# selection and regression are chained so that the selector is refitted
# together with the model inside the grid search
clf = Pipeline([('anova', anova), ('ridge', ridge)])

# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, {'anova__percentile': [2, 5, 10, 20, 30, 50]}, cv=cv)
clf.fit(X_train, Y_train)  # runs the search over the percentile grid

GridSearchCV(cv=sklearn.cross_validation.KFold(n=2897, n_folds=2, shuffle=False, random_state=None),
       error_score='raise',
       estimator=Pipeline(steps=[('anova', SelectPercentile(percentile=10,
         score_func=<function f_regression at 0x10fa78048>)), ('ridge', BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'anova__percentile': [2, 5, 10, 20, 30, 50]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [21]:
# percentile of features chosen by the grid search
clf.best_params_

{'anova__percentile': 2}

In [22]:
# train/test metrics of the grid-searched anova + BayesianRidge pipeline
print_results(clf, X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 64.73
Rsquared: 0.59
Test set
Residual sum of squares: 82.83
Rsquared: 0.45


This model is not better than the linear regression with 10 features.

### Same as above but with feature agglomeration

In [23]:
from sklearn.cluster import FeatureAgglomeration

# start again from the full, standardised feature matrix
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)

ridge = BayesianRidge() # model object

# Ward agglomeration followed by BayesianRidge
ward = FeatureAgglomeration(n_clusters=10)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of feature clusters with grid search;
# `cv` is the 2-fold generator created in the previous grid-search cell
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X_train, Y_train)  # runs the search over the n_clusters grid

GridSearchCV(cv=sklearn.cross_validation.KFold(n=2897, n_folds=2, shuffle=False, random_state=None),
       error_score='raise',
       estimator=Pipeline(steps=[('ward', FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto',
           connectivity=None, linkage='ward', memory=Memory(cachedir=None),
           n_clusters=10, n_components=None,
           pooling_func=<function mean at 0x10bb7f378>)), ('ridge', BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ward__n_clusters': [10, 20, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [24]:
# number of feature clusters chosen by the grid search
clf.best_params_

{'ward__n_clusters': 30}

In [25]:
# train/test metrics of the grid-searched ward + BayesianRidge pipeline
print_results(clf, X_train, Y_train, X_test, Y_test)

Train set
Residual sum of squares: 99.27
Rsquared: 0.37
Test set
Residual sum of squares: 109.83
Rsquared: 0.27


In [26]:
from sklearn.svm import SVR

# fresh split + standardisation, then keep the top 5% of features
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)
X_train, X_test = feat_select(SelectPercentile(f_regression, percentile = 5), X_train, X_test, Y_train)

# try several values of the SVM regularization parameter C
for C in [0.01, .1, 1.0, 10]:
    # support vector regression (SVR) with a linear kernel
    svr = SVR(kernel='linear', C=C).fit(X_train, Y_train)
    print_results(svr, X_train, Y_train, X_test, Y_test)
    # two blank lines to separate the reports of consecutive C values
    print()
    print()

Train set
Residual sum of squares: 69.79
Rsquared: 0.56
Test set
Residual sum of squares: 84.22
Rsquared: 0.44


Train set
Residual sum of squares: 60.56
Rsquared: 0.62
Test set
Residual sum of squares: 86.88
Rsquared: 0.42


Train set
Residual sum of squares: 60.53
Rsquared: 0.62
Test set
Residual sum of squares: 86.93
Rsquared: 0.42


Train set
Residual sum of squares: 60.53
Rsquared: 0.62
Test set
Residual sum of squares: 86.95
Rsquared: 0.42




## Conclusion

The linear regression model utilising the 10 best features selected by univariate feature selection, built on top of the previous feature engineering, beats the models developed in the previous notebook (kdd_regression). At the same time it is a simpler model, which makes it easier to understand the importance of each feature in predicting the amount donated.

In [27]:
# rebuild the inputs of the 10-feature linear model
X_train, X_test, Y_train, Y_test = split_standarise(X,Y)

# same steps as feat_select, written out so that the fitted selector stays
# available for inspection in the following cells
selector = SelectKBest(f_regression, k=10)
selector.fit(X_train, Y_train)
X_train_new = selector.transform(X_train)
X_test_new = selector.transform(X_test)

In [28]:
# (training rows, selected features)
X_train_new.shape

(2897, 10)

In [29]:
# p-value of every feature, indexed by its column position in X
pval = pd.Series(selector.pvalues_)

# Column positions of the features the selector actually kept, strongest first.
# They are ranked by F score rather than by p-value: the smallest p-values
# underflow to exactly 0.0, so sorting on them leaves the order of the top
# features undefined and, with enough ties, could even pick columns that the
# selector did not keep. Asking the selector also avoids repeating k here.
kept = selector.get_support(indices=True)
ind = pd.Series(selector.scores_[kept], index=kept).sort_values(ascending=False).index

In [30]:
# Dataframe containing the features utilised for the predictions in the linear model.
# Each p-value is looked up through the feature's own column position in `ind`,
# so a name and its p-value cannot drift apart (previously the p-values came
# from a second, independent sort that was cut to a hard-coded 10 rows).
feat = pd.DataFrame(
    {'feature': X.columns[ind].values, 'p_value': pval.loc[ind].values},
    columns=['feature', 'p_value'],
)
feat

Unnamed: 0,feature,p_value
0,MAXRAMNT_log,0.0
1,LASTGIFT_log,0.0
2,RFA_2_L1G,1.115719e-123
3,MINRAMNT_log,1.5660360000000002e-119
4,RFA_2F_log,1.416886e-89
5,MDMAUD_XXXX_asi,2.267747e-62
6,RFA_6_A1G,4.419824e-56
7,RFA_14_A1G,8.295002000000001e-52
8,RFA_6_L3C,2.3331649999999997e-50
9,RFA_2_L4D,3.687283e-49


The features that strongly contribute to the prediction of the donation amount are the same as in the previous notebook: the maximum amount donated (MAXRAMNT) and the amount donated last (LASTGIFT). In this case, instead of the average amount donated, we have the minimum amount donated (MINRAMNT). I believe that the average and the minimum amount donated are probably highly correlated, and we could possibly swap these variables without affecting the predictive power of the model.