In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
import timeit

In [6]:
from sklearn.linear_model import LinearRegression

In [7]:
from sklearn import metrics

In [8]:
data = pd.read_csv('MinMaxScaledData.csv')
X = data.drop('Appliances', axis=1)
y = data['Appliances']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [44]:
def getScore(X_train, y_train, X_test, y_test) :
    startTime = timeit.default_timer()
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    scaler = StandardScaler()
    scaler.fit(X_test)
    X_test = scaler.transform(X_test)
    lg = LinearRegression()
    lg.fit(X_train, y_train)
    predictions = lg.predict(X_test)
    elapsedTime = timeit.default_timer() - startTime
    print("time taken : ",elapsedTime)
    MAE = metrics.mean_absolute_error(y_test, predictions)
    MSE = metrics.mean_squared_error(y_test, predictions)
    RMSE = np.sqrt(MSE)
    print('Mean Absolute Error : ',MAE)
    print('Mean Squared Error : ',MSE)
    print('Root Mean Squared Error : ',RMSE)

In [47]:
getScore(X_train, y_train, X_test, y_test)

time taken :  0.06671578067084738
Mean Absolute Error :  0.04936621336758118
Mean Squared Error :  0.0076519014212262666
Root Mean Squared Error :  0.08747514744901129


In [17]:
scoreTable = data.corr(method='pearson')

In [18]:
data.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

Unnamed: 0.1,Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
Unnamed: 0,1.0,-0.0096,-0.19,0.7,-0.1,0.52,-0.093,0.79,-0.43,0.78,-0.26,0.78,-0.24,0.6,-0.82,0.83,-0.17,0.78,-0.28,0.89,-0.24,0.59,-0.027,-0.34,-0.26,-0.096,0.47,0.00028,0.00028
Appliances,-0.0096,1.0,0.2,0.055,0.086,0.12,-0.06,0.085,0.036,0.04,0.017,0.02,0.007,0.12,-0.083,0.026,-0.056,0.04,-0.094,0.01,-0.051,0.099,-0.035,-0.15,0.087,0.00023,0.015,-0.011,-0.011
lights,-0.19,0.2,1.0,-0.024,0.11,-0.0056,0.051,-0.097,0.13,-0.0089,0.11,-0.079,0.14,-0.079,0.15,-0.14,0.035,-0.071,0.013,-0.16,-0.0088,-0.074,-0.011,0.069,0.06,0.02,-0.036,0.00052,0.00052
T1,0.7,0.055,-0.024,1.0,0.16,0.84,-0.0025,0.89,-0.029,0.88,0.098,0.89,-0.015,0.65,-0.62,0.84,0.14,0.83,-0.0064,0.84,0.072,0.68,-0.15,-0.35,-0.088,-0.076,0.57,-0.0062,-0.0062
RH_1,-0.1,0.086,0.11,0.16,1.0,0.27,0.8,0.25,0.84,0.11,0.88,0.21,0.3,0.32,0.25,0.021,0.8,-0.03,0.74,0.12,0.76,0.34,-0.29,0.27,0.2,-0.021,0.64,-0.0007,-0.0007
T2,0.52,0.12,-0.0056,0.84,0.27,1.0,-0.17,0.74,0.12,0.76,0.23,0.72,0.03,0.8,-0.58,0.66,0.23,0.58,0.069,0.68,0.16,0.79,-0.13,-0.51,0.052,-0.07,0.58,-0.011,-0.011
RH_2,-0.093,-0.06,0.051,-0.0025,0.8,-0.17,1.0,0.14,0.68,-0.047,0.72,0.11,0.25,-0.0097,0.39,-0.051,0.69,-0.041,0.68,0.055,0.68,0.034,-0.26,0.58,0.069,-0.0054,0.5,0.0063,0.0063
T3,0.79,0.085,-0.097,0.89,0.25,0.74,0.14,1.0,-0.011,0.85,0.12,0.89,-0.066,0.69,-0.65,0.85,0.17,0.8,0.044,0.9,0.13,0.7,-0.19,-0.28,-0.1,-0.1,0.65,-0.0052,-0.0052
RH_3,-0.43,0.036,0.13,-0.029,0.84,0.12,0.68,-0.011,1.0,-0.14,0.9,-0.05,0.38,0.077,0.51,-0.25,0.83,-0.28,0.83,-0.2,0.83,0.12,-0.23,0.36,0.26,0.017,0.41,-0.00048,-0.00048
T4,0.78,0.04,-0.0089,0.88,0.11,0.76,-0.047,0.85,-0.14,1.0,-0.049,0.87,-0.076,0.65,-0.7,0.88,0.044,0.8,-0.095,0.89,-0.026,0.66,-0.075,-0.39,-0.19,-0.1,0.52,-0.0018,-0.0018


In [19]:
FeatureIndices = scoreTable.index.tolist()

In [20]:
def sortDic(UnsortedDictionary):
    dicItems = [(value,key) for key,value in UnsortedDictionary.items()]
    dicItems.sort()
    dicItems.reverse()
    dicItems = {key:value for value,key in dicItems}
    return dicItems

In [21]:
FeatureScoresForTargetVariable = {}

In [22]:
for i in FeatureIndices[:-1]:
    FeatureScoresForTargetVariable[i] = scoreTable.loc[i][FeatureIndices[-1]]

In [23]:
FeatureScoresForTargetVariable = sortDic(FeatureScoresForTargetVariable)
print(FeatureScoresForTargetVariable)

{'rv1': 1.0, 'RH_out': 0.020440677989887194, 'RH_6': 0.011671356607227864, 'RH_2': 0.006274721420446199, 'RH_8': 0.004479663880348605, 'RH_7': 0.0018197097244170742, 'Press_mm_hg': 0.0006994628686143647, 'lights': 0.0005211145475995473, 'Unnamed: 0': 0.0002767759275834023, 'RH_3': -0.00047746661879620233, 'RH_1': -0.0006993158716300153, 'T9': -0.001227152784178838, 'RH_4': -0.0017865022666929298, 'T4': -0.001814683501881885, 'RH_9': -0.0029545009141303926, 'T8': -0.003210299136853345, 'T7': -0.003899654480701556, 'Tdewpoint': -0.003950344994781031, 'T3': -0.0051944444404550035, 'T5': -0.005490221143687724, 'Visibility': -0.0058885619417277804, 'T1': -0.006203015240115718, 'RH_5': -0.011056224233276946, 'T2': -0.01108735158024511, 'Appliances': -0.011144917533741548, 'Windspeed': -0.01134182833285642, 'T6': -0.015086186713735456, 'T_out': -0.015258810905309127}


In [30]:
errorRate = []
#FOR CHECKING THE SCORE FOR EACH SELECTED FEATURE SUBSET
def trainTestScore(X_train, y_train, X_test, y_test):
    startTime = timeit.default_timer()
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    scaler = StandardScaler()
    scaler.fit(X_test)
    X_test = scaler.transform(X_test)
    lg = LinearRegression()
    lg.fit(X_train, y_train)
    predictions = lg.predict(X_test)
    elapsedTime = timeit.default_timer() - startTime
    print("time taken : ",elapsedTime)
    error = metrics.mean_squared_error(y_test, predictions)
    print('\tError rate : ',error)
    errorRate.append(error)
    return error

In [31]:
Features = list(FeatureScoresForTargetVariable.keys())
def selectNextBestFeature(FeatureSubset, featureIndex) :
    while featureIndex<len(Features):
        checkFeature = Features[featureIndex]
        noMatchWithAnyone = True
        if len(FeatureSubset) == 0:
            noMatchWithAnyone = True
        else:
            for feature in FeatureSubset:
                print('feature:',feature,'\tkey:',checkFeature)
                if(scoreTable.loc[str(feature)][checkFeature] >= 0.7):
                    noMatchWithAnyone = False
                    break          
        if(noMatchWithAnyone == True):
            return checkFeature, featureIndex
        featureIndex += 1

In [32]:
def scoresForFeatures():
    noOfFeatures = 3
#     while noOfFeatures <=36:
    i = 0
    scores = []
    FinalSubset = []
    LeastError = 100
    featureIndex = -1
    while i<noOfFeatures and featureIndex<len(Features):
        Subset = FinalSubset[:]
        feature, featureIndex = selectNextBestFeature(FinalSubset, featureIndex+1 )
        Subset.append(feature)
        error = trainTestScore(X_train.loc[:,Subset], y_train, X_test.loc[:, Subset], y_test)
        if(error < LeastError):
            FinalSubset = Subset[:]
            LeastError = error
            i += 1
        print("feature subset : ",Subset, "error : ", error)
        print("\n\n\nBest feature subset : ",FinalSubset, "error : ", LeastError)
    return FinalSubset

In [33]:
bestFeatureSubset = scoresForFeatures()

time taken :  0.12487662239777819
	Error rate :  0.009151267198559571
feature subset :  ['rv1'] error :  0.009151267198559571



Best feature subset :  ['rv1'] error :  0.009151267198559571
feature: rv1 	key: RH_out
time taken :  0.006930763476248103
	Error rate :  0.0089255530169162
feature subset :  ['rv1', 'RH_out'] error :  0.0089255530169162



Best feature subset :  ['rv1', 'RH_out'] error :  0.0089255530169162
feature: rv1 	key: RH_6
feature: RH_out 	key: RH_6
feature: rv1 	key: RH_2
feature: RH_out 	key: RH_2
time taken :  0.00848667237448808
	Error rate :  0.008914905963070978
feature subset :  ['rv1', 'RH_out', 'RH_2'] error :  0.008914905963070978



Best feature subset :  ['rv1', 'RH_out', 'RH_2'] error :  0.008914905963070978


In [48]:
getScore(X_train.loc[:,bestFeatureSubset], y_train, X_test.loc[:, bestFeatureSubset], y_test)

time taken :  0.004402055961691076
Mean Absolute Error :  0.055304303511886393
Mean Squared Error :  0.008914905963070978
Root Mean Squared Error :  0.09441877971606591


<h5>Here comparatively less time(0.0044ms compared to 0.0667ms taking only 7% of the time taken by the whole dataset)  is taken by the algorithm with the cost of slightly higher error (0.0944 compared to 0.0874 from the whole dataset).</h5>