In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import sklearn

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
# Note - you will need version 0.24.1 of scikit-learn to load this library (SequentialFeatureSelector)
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('datasets/TTS_data.csv')
data = data.where(data['totalInstalledCost___'] != -1).dropna()
X = data[['systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 
        'mod_nameplate_capacity1', 'inverterQuantity_1', 'inv_outputcapacity1', 
        'ILR', 'TotalModuleQty', 'latitude', 'longitude','mod_efficiency1']]

y = data['totalInstalledCost___']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.21, random_state=9, shuffle = True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Stepwise Selection

def forward_stepwise_selection(X, y, f_names):
    sfs_forward = SequentialFeatureSelector(linear_model.LinearRegression(), 
                                        n_features_to_select=5, 
                                        direction = 'forward').fit(X,y)
    print('support: ', sfs_forward.get_support(), "\n")
    selected = sfs_forward.get_support(indices = True)
    print(selected)
    print('Selected input features using Forward Stepwise Selection: \n', f_names[selected])
    
    return f_names[selected]

def backward_stepwise_selection(X, y, f_names):
    sfs_backward = SequentialFeatureSelector(linear_model.LinearRegression(), 
                                        n_features_to_select=5, 
                                        direction = 'backward').fit(X,y)
    print('support: ', sfs_backward.get_support(), "\n")
    selected = sfs_backward.get_support(indices = True)
    print(selected)
    print('Selected input features using Forward Stepwise Selection: \n', f_names[selected])
    
    return f_names[selected]

In [7]:
def different_values(list1, list2):
    return list(set(list1).symmetric_difference(set(list2)))

def unique_values(list1, list2):
    return list(set(list1) | (set(list2)))

In [8]:
# Regressors

# Random Forest Regressor
def random_forest_reg(X_train, y_train, X_test, y_test):
    clf_RF = RandomForestRegressor(n_estimators = 100, max_depth = 4, max_features = 2, random_state = 7)
    clf_RF = clf_RF.fit(X_train, y_train)
    y_predict = clf_RF.predict(X_test)
    mse_RF = mean_squared_error(y_test, y_predict)
    r2_RF = r2_score(y_test, y_predict)
    print(mse_RF)
    print(r2_RF)
    return r2_RF, y_predict

# Gradient Boosting Regressor
def gradient_boosting_reg(X_train, y_train, X_test, y_test):
    clf_GB = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=1, random_state=7)
    clf_GB = clf_GB.fit(X_train, y_train)
    y_predict = clf_GB.predict(X_test)
    mse_GB = mean_squared_error(y_test, y_predict)
    r2_GB = r2_score(y_test, y_predict)
    print(mse_GB)
    print(r2_GB)
    
    return r2_GB
    
# Bagging Regressor
def bagging_reg():
    estimator = DecisionTreeRegressor(max_depth=4)
    clf_bag = BaggingRegressor(base_estimator=estimator, n_estimators=100, random_state=7)
    clf_bag = clf_bag.fit(X_train, y_train)
    y_predict = clf_bag.predict(X_test)
    mse_bag = mean_squared_error(y_test, y_predict)
    r2_bag = r2_score(y_test, y_predict)
    print(mse_bag)
    print(r2_bag)
    
    return r2_bag

In [6]:
names = 'systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 'mod_nameplate_capacity1', 'inverterQuantity_1', 'inv_outputcapacity1', 'ILR', 'TotalModuleQty', 'latitude', 'longitude','mod_efficiency1'
f_names = np.array(names)

list1 = forward_stepwise_selection(X, y, f_names)
list2 = backward_stepwise_selection(X, y, f_names)
diff = different_values(list1, list2)
print(diff)

support:  [ True False False False  True False False False  True  True False  True] 

[ 0  4  8  9 11]
Selected input features using Forward Stepwise Selection: 
 ['systemSizeInDCSTC_KW_' 'mod_nameplate_capacity1' 'TotalModuleQty'
 'latitude' 'mod_efficiency1']
support:  [ True False False False  True False False  True  True False False  True] 

[ 0  4  7  8 11]
Selected input features using Forward Stepwise Selection: 
 ['systemSizeInDCSTC_KW_' 'mod_nameplate_capacity1' 'ILR' 'TotalModuleQty'
 'mod_efficiency1']


NameError: name 'difference' is not defined

In [12]:
import pandas as pd
import numpy as np
import sklearn

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
# Note - you will need version 0.24.1 of scikit-learn to load this library (SequentialFeatureSelector)
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

import unittest

class TestCost(unittest.TestCase):
    """
    test cases
    """
    
    # tests if efficiency column contains any missing data
    def test_eff_missing_data(self):
        """
        Checks for missing data in efficiency column (indicted with -1)
        """
        missing_val = -1
        df = pd.read_csv('tests/TTS_sample.csv')
        if missing_val in df['mod_efficiency1']:
            self.assertRaises(ValueError)
        return
    
    # stepwise selection
    
    # forward stepwise selection
    def test_one_shot_test_forward_stepwise(self):
        """
        Forward stepwise selection: one shot test
        """
        df = pd.read_csv('tests/TTS_sample.csv')
        X = df[['systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 
        'mod_nameplate_capacity1', 'inverterQuantity_1', 'inv_outputcapacity1', 
        'ILR', 'TotalModuleQty', 'latitude', 'longitude','mod_efficiency1']]
        y = df['totalInstalledCost___']
        
        names = 'systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 'mod_BIPV1', 'mod_bifacial1', 'mod_nameplate_capacity1', 'mod_efficiency1', 'inverterQuantity_1', 'inv_microinv1', 'inv_battery_hybrid1', 'inv_builtin_meter1', 'inv_outputcapacity1', 'dc_optimizer', 'ILR', 'TotalModuleQty', 'latitude', 'longitude'
        f_names = np.array(names)
        forward_stepwise_selection(X, y, f_names)
        return
    
    def test_forward_stepwise_input(self):
        """
        Forward stepwise selection: Checks input type; only accepts int/float values
        """
        df = pd.read_csv('tests/TTS_sample.csv')
        df = df[['systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 
        'mod_nameplate_capacity1', 'inverterQuantity_1', 'inv_outputcapacity1', 
        'ILR', 'TotalModuleQty', 'latitude', 'longitude','mod_efficiency1', 'totalInstalledCost___']]

        # check if string is present
        self.assertTrue(isinstance(x, (int, float)) for x in df)
        return
        
    # backward stepwise selection
    def test_one_shot_test_backward_stepwise(self):
        """
        Backward stepwise selection: one shot test
        """
        df = pd.read_csv('tests/TTS_sample.csv')
        X = df[['systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 
        'mod_nameplate_capacity1', 'inverterQuantity_1', 'inv_outputcapacity1', 
        'ILR', 'TotalModuleQty', 'latitude', 'longitude','mod_efficiency1']]
        y = df['totalInstalledCost___']
        
        names = 'systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 'mod_BIPV1', 'mod_bifacial1', 'mod_nameplate_capacity1', 'mod_efficiency1', 'inverterQuantity_1', 'inv_microinv1', 'inv_battery_hybrid1', 'inv_builtin_meter1', 'inv_outputcapacity1', 'dc_optimizer', 'ILR', 'TotalModuleQty', 'latitude', 'longitude'
        f_names = np.array(names)
        backward_stepwise_selection(X, y, f_names)
        return
    
    def test_backward_stepwise_input(self):
        """
        Backward stepwise selection: Checks input type; only accepts int/float values
        """
        df = pd.read_csv('tests/TTS_sample.csv')
        df = df[['systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 
        'mod_nameplate_capacity1', 'inverterQuantity_1', 'inv_outputcapacity1', 
        'ILR', 'TotalModuleQty', 'latitude', 'longitude','mod_efficiency1', 'totalInstalledCost___']]

        self.assertTrue(isinstance(x, (int, float)) for x in df)
        return
    
    # different values
    def test_one_shot_test_different(self):
        """
        Different values: one shot test
        """
        alist = [1,3,5]
        blist = [1,3,6]
        self.assertTrue(different_values(alist, blist))
        
        return
    
    def test_pattern_test_different(self):
        """
        Different values: pattern test
        """
        alist = [1,3,5]
        blist = [1,3,6]
        diff_list = [5,6]
        
        if np.allclose(different_values(alist, blist), diff_list):
            print('Passed Pattern Test for Different Values')
        
        return
    
    # unique values
    def test_one_shot_test_unique(self):
        """
        Unique values: one shot test
        """
        alist = [1,3,5]
        blist = [1,3,6]
        self.assertTrue(unique_values(alist, blist))
        
        return
    
    def test_pattern_test_unique(self):
        """
        Unique values: pattern test
        """
        alist = [1,3,5]
        blist = [1,3,6]
        diff_list = [1,3,5,6]
        
        if np.allclose(unique_values(alist, blist), diff_list):
            print('Passed Pattern Test for Unique Values')
        
        return
    
    # random forest regressor
    def test_one_shot_rf(self):
        """
        Random Forest Regressor: One shot test
        """
        df = pd.read_csv('tests/TTS_sample.csv')
        X = df[['systemSizeInDCSTC_KW_', 'Up_FrontCashIncentive___', 'azimuth_1', 'tilt_1', 
        'mod_nameplate_capacity1', 'inverterQuantity_1', 'inv_outputcapacity1', 
        'ILR', 'TotalModuleQty', 'latitude', 'longitude','mod_efficiency1']]
        y = df['totalInstalledCost___']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.21, random_state=9, shuffle = True)
        
        self.assertTrue(random_forest_reg(X_train, y_train, X_test, y_test))
        return

suite = unittest.TestLoader().loadTestsFromTestCase(TestCost)
_ = unittest.TextTestRunner().run(suite)

....

11029149901.469948
0.2667907142955924


..

support:  [ True  True False False False  True  True False  True False False False] 

[0 1 5 6 8]
Selected input features using Forward Stepwise Selection: 
 ['systemSizeInDCSTC_KW_' 'Up_FrontCashIncentive___' 'mod_bifacial1'
 'mod_nameplate_capacity1' 'inverterQuantity_1']


....

support:  [False  True  True False False False  True False  True  True False False] 

[1 2 6 8 9]
Selected input features using Forward Stepwise Selection: 
 ['Up_FrontCashIncentive___' 'azimuth_1' 'mod_nameplate_capacity1'
 'inverterQuantity_1' 'inv_microinv1']
Passed Pattern Test for Different Values
Passed Pattern Test for Unique Values



----------------------------------------------------------------------
Ran 10 tests in 0.934s

OK
