In [21]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import *
from sklearn.linear_model import SGDRegressor
from sklearn import ensemble
from sklearn import metrics
from sklearn.datasets import load_boston
import warnings
warnings.filterwarnings("ignore")

In [22]:
# Load the Boston housing dataset and take a first look at it:
# feature-matrix shape, feature names, then max / min / mean of the target.
boston = load_boston()
print(boston.data.shape)
print(boston.feature_names)
print(*(stat(boston.target) for stat in (np.max, np.min, np.mean)))

(506, 13)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
50.0 5.0 22.532806324110677


In [23]:
# Hold out 25% of the samples for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.25, random_state=33
)

# Both scalers are fitted on the training split only and then applied to
# train and test alike. StandardScaler is given a 2-D input, hence the
# temporary reshape of the targets into a single column.
scalerX = StandardScaler().fit(X_train)
scalery = StandardScaler().fit(y_train.reshape(-1, 1))

X_train = scalerX.transform(X_train)
y_train = scalery.transform(y_train.reshape(-1, 1))
X_test = scalerX.transform(X_test)
y_test = scalery.transform(y_test.reshape(-1, 1))

# Sanity check: max, min and mean of the scaled training features,
# followed by the same three statistics for the scaled training targets.
print(*(stat(part) for part in (X_train, y_train) for stat in (np.max, np.min, np.mean)))

# Turn the single-column target matrices back into flat 1-D arrays.
y_train = y_train.flatten()
y_test = y_test.flatten()

10.202898004591216 -4.6670204084548 2.4703870638462586e-15 2.9177492036731256 -1.931470986413033 3.5855223803197665e-16


In [24]:
def train_and_evaluate(clf, X_train, y_train):
    """Fit ``clf`` on the given data and print two quality scores.

    The first printed value is ``clf.score`` on the very data the model was
    fitted on. The second is the mean of ``cross_val_score`` over a shuffled
    5-fold split of the same data (fixed seed 33).

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``score``; it is fitted in place.
    X_train : feature matrix; ``X_train.shape[0]`` is used as the sample count.
    y_train : target values aligned with ``X_train``.

    Returns
    -------
    None. Results are only printed.
    """
    clf.fit(X_train, y_train)

    training_score = clf.score(X_train, y_train)
    print ("Coefficient of determination on training set:",training_score)

    # Legacy KFold signature: sample count first, then the number of folds.
    n_samples = X_train.shape[0]
    folds = KFold(n_samples, 5, shuffle=True, random_state=33)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=folds)
    print ("Average coefficient of determination using 5-fold crossvalidation:",np.mean(fold_scores))

In [36]:
def feature_importance_chart(clf, classifier_name, feature_names):
    """Draw a horizontal bar chart of ``clf.feature_importances_``.

    Bars are ordered by ascending importance (ties fall back to comparing
    the feature names) and each tick label shows "<name>: <importance>"
    with the importance formatted to three significant digits.

    Parameters
    ----------
    clf : fitted model exposing ``feature_importances_``.
    classifier_name : text inserted into the chart title.
    feature_names : names aligned one-to-one with ``clf.feature_importances_``.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # Sort (importance, name) pairs, then split them back into two tuples.
    ranked_pairs = sorted(zip(clf.feature_importances_, feature_names))
    importances, names = zip(*ranked_pairs)
    bar_positions = range(len(importances))

    plt.figure(figsize=(16, 9))
    plt.barh(bar_positions, importances)

    tick_labels = [
        "{}: {:.3}".format(name, importance)
        for name, importance in zip(names, importances)
    ]
    plt.yticks(bar_positions, tick_labels)

    title_template = (
        "The Gini feature importance for the {} \n"
        "(total decrease in node impurity, weighted by the "
        "probability of reaching that node)"
    )
    plt.title(title_template.format(classifier_name))
    plt.show()

In [25]:
# Baseline linear model: squared loss fitted by SGD with no penalty term.
clf1 = SGDRegressor(
    loss='squared_loss',
    penalty=None,
    random_state=33,
)
train_and_evaluate(clf1, X_train, y_train)

# Learned weights, one per input feature.
print(clf1.coef_)

Coefficient of determination on training set: 0.7402817036889138
Average coefficient of determination using 5-fold crossvalidation: 0.7136305962554014
[-0.07634694  0.06117706 -0.03404977  0.1076101  -0.06620428  0.35855438
 -0.0098127  -0.21344242  0.0921319  -0.03985987 -0.18753121  0.05267773
 -0.37137355]


In [26]:
# Same SGD model with an L2 penalty.
# NOTE(review): the seed here is 42 while clf1 uses 33, so score differences
# between the two are not attributable to the penalty alone.
clf2 = SGDRegressor(
    loss='squared_loss',
    penalty='l2',
    random_state=42,
)
train_and_evaluate(clf2, X_train, y_train)

Coefficient of determination on training set: 0.7436167432081849
Average coefficient of determination using 5-fold crossvalidation: 0.7108120666700934


In [27]:
# Same SGD model with an L1 penalty.
clf3 = SGDRegressor(
    loss='squared_loss',
    penalty='l1',
    random_state=42,
)
train_and_evaluate(clf3, X_train, y_train)

Coefficient of determination on training set: 0.7435869229095474
Average coefficient of determination using 5-fold crossvalidation: 0.7107636098744696


In [28]:
# Support vector regression, starting with a linear kernel.
from sklearn import svm

svr1 = svm.SVR(kernel='linear')
train_and_evaluate(svr1, X_train, y_train)

Coefficient of determination on training set: 0.7188692334197399
Average coefficient of determination using 5-fold crossvalidation: 0.7078384191936957


In [29]:
# Support vector regression with a polynomial kernel.
svr2 = svm.SVR(kernel='poly')
train_and_evaluate(svr2, X_train, y_train)

Coefficient of determination on training set: 0.9041092733006806
Average coefficient of determination using 5-fold crossvalidation: 0.7792885454878019


In [30]:
# Support vector regression with an RBF kernel.
svr3 = svm.SVR(kernel='rbf')
train_and_evaluate(svr3, X_train, y_train)

Coefficient of determination on training set: 0.9001320659785192
Average coefficient of determination using 5-fold crossvalidation: 0.8336622215665088


In [31]:
# Use of an Extra-Trees regressor (an ensemble of extremely randomized trees)

In [32]:

# Extra-Trees ensemble with 10 estimators and a fixed seed.
et1 = ensemble.ExtraTreesRegressor(
    n_estimators=10,
    random_state=42,
)
train_and_evaluate(et1, X_train, y_train)

Coefficient of determination on training set: 1.0
Average coefficient of determination using 5-fold crossvalidation: 0.8617589783439273


In [33]:
# Pair every importance with its feature name and sort ascending by importance.
# The sorted result is stored as a list: a bare zip object is a one-shot
# iterator in Python 3, so it would be empty for any use after this print.
important = sorted(zip(et1.feature_importances_, boston.feature_names))
print(important)

[(0.005043853202755884, 'ZN'), (0.015142513715149682, 'B'), (0.017052578400506287, 'AGE'), (0.018941821085751577, 'RAD'), (0.023602561777571307, 'CHAS'), (0.025733049004581798, 'CRIM'), (0.03187416223510046, 'NOX'), (0.03440564493930893, 'INDUS'), (0.039713133345196064, 'DIS'), (0.046618521397262996, 'TAX'), (0.09951180149276224, 'PTRATIO'), (0.28421522796368465, 'LSTAT'), (0.3581451314403682, 'RM')]


In [34]:
def measure_performance(
    X,
    y,
    clf,
    show_accuracy=True,
    show_classification_report=True,
    show_confusion_matrix=True,
    show_r2_score=False,
):
    """Predict with ``clf`` on ``X`` and print the selected metrics against ``y``.

    Predictions are always computed; each metric is computed and printed
    only when its flag is truthy.

    Parameters
    ----------
    X : inputs handed to ``clf.predict``.
    y : reference values the predictions are compared with.
    clf : fitted model exposing ``predict``.
    show_accuracy : print ``metrics.accuracy_score``.
    show_classification_report : print ``metrics.classification_report``.
    show_confusion_matrix : print ``metrics.confusion_matrix``.
    show_r2_score : print ``metrics.r2_score``.

    Returns
    -------
    None. Results are only printed.
    """
    predictions = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predictions)
        print ("Accuracy:{0:.3f}".format(accuracy),"\n")

    if show_classification_report:
        print ("Classification report")
        print (metrics.classification_report(y, predictions),"\n")

    if show_confusion_matrix:
        print ("Confusion matrix")
        print (metrics.confusion_matrix(y, predictions),"\n")

    if show_r2_score:
        r_squared = metrics.r2_score(y, predictions)
        print ("Coefficient of determination:{0:.3f}".format(r_squared),"\n")

In [35]:
# Score the Extra-Trees model on the held-out test split. Only the
# coefficient of determination is requested; the classification-oriented
# metrics are switched off.
measure_performance(
    X_test,
    y_test,
    et1,
    show_accuracy=False,
    show_classification_report=False,
    show_confusion_matrix=False,
    show_r2_score=True,
)

Coefficient of determination:0.802 

