## Get all imports

In [1]:
import math
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

from PPFS import PPIMBR

warnings.filterwarnings("ignore")

## Load data

In [2]:
# Read the Desharnais CSV and discard the "id" column.
data = pd.read_csv("../data/desharnais/desharnais.csv").drop(columns=["id"])
display(data.head())

# Separate the regression target ("Effort") from the predictor columns.
Y = data["Effort"].values
data = data.drop(columns=["Effort"])

print("Data shape: ", data.shape, "Target Variable shape: ", Y.shape)

Unnamed: 0,Project,TeamExp,ManagerExp,YearEnd,Length,Effort,Transactions,Entities,PointsNonAdjust,Adjustment,PointsAjust,Language
0,1,1,4,85,12,5152,253,52,305,34,302,1
1,2,0,0,86,4,5635,197,124,321,33,315,1
2,3,4,4,85,1,805,40,60,100,18,83,1
3,4,0,0,86,5,3829,200,119,319,30,303,1
4,5,0,0,86,4,2149,140,94,234,24,208,1


Data shape:  (81, 11) Target Variable shape:  (81,)


## Inner CV for DecisionTree

In [3]:
def inner_cv_dt(X, Y):
    """Return the mean 5-fold cross-validated RMSE of a decision tree regressor.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Predictor matrix. Converted with ``np.asarray`` so that positional
        fold indices can be applied directly.
    Y : array-like of shape (n_samples,)
        Continuous regression target, on whatever scale the caller supplies.

    Returns
    -------
    float
        Average root-mean-squared error over the 5 folds (lower is better),
        expressed on the same scale as ``Y``.

    Notes
    -----
    The previous body fitted an ``SVC`` classifier and scored it with
    ``accuracy_score``; neither name is imported in this notebook and a
    classifier/accuracy pair is not meaningful for a continuous target.
    """
    # Positional indexing below requires plain arrays, not DataFrames/Series.
    X, Y = np.asarray(X), np.asarray(Y)

    kfold = KFold(n_splits=5, random_state=27, shuffle=True)
    scores = list()
    for train, test in kfold.split(X):
        x_train, x_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]

        # Fit the scaler on the training fold only so no statistics leak
        # from the held-out fold.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        model = DecisionTreeRegressor(random_state=27)
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        # RMSE for this fold
        score = np.sqrt(mean_squared_error(y_test, preds))
        scores.append(score)
    return sum(scores)/len(scores)

## Check score with DecisionTree

In [45]:
# Outer evaluation: 5-fold cross validation with a fixed shuffle seed
kfold = KFold(n_splits=5, shuffle=True, random_state=27)
# Per-fold RMSE values, computed against the untransformed target
scores = []
for train_idx, test_idx in kfold.split(data):
    # Slice predictors and target for the current fold
    fold_train, fold_test = data.iloc[train_idx], data.iloc[test_idx]
    y_test = Y[test_idx]
    # Models are fitted on the natural log of the target; predictions are
    # mapped back with exp() before the error is computed.
    y_train = np.log(Y[train_idx])

    # Standardise the predictors with statistics from the training fold only,
    # preserving column names and row index.
    scaler = StandardScaler()
    fold_train = pd.DataFrame(
        scaler.fit_transform(fold_train.values),
        columns=fold_train.columns,
        index=fold_train.index,
    )
    fold_test = pd.DataFrame(
        scaler.transform(fold_test.values),
        columns=fold_test.columns,
        index=fold_test.index,
    )

    # Feature selection with the proposed algorithm, fitted on the train fold
    # only. Any estimator can be plugged in as the wrapped model.
    selector = PPIMBR(
        model=LGBMRegressor(random_state=27),
        p_val_thresh=0.05,
        verbose=0,
        random_state=27,
        num_simul=50,
        cv=0,
        simul_size=0.1,
        sig_test_type="non-parametric",
    )
    train_selected = selector.fit_transform(fold_train, y_train)
    # Apply the same feature subset to the held-out fold
    test_selected = selector.transform(fold_test)
    # Report how many features ended up in the Markov blanket
    print("Markov Blanket: ", len(selector.MB))

    # Fit a decision tree on the selected features only
    tree = DecisionTreeRegressor(random_state=27)
    tree.fit(train_selected.values, y_train)
    log_preds = tree.predict(test_selected.values)
    # RMSE after undoing the log transform on the predictions
    fold_rmse = np.sqrt(mean_squared_error(y_test, np.exp(log_preds)))
    scores.append(fold_rmse)
    print("Score: ", fold_rmse)
print("\n\nAverage Prediction Error: ", sum(scores)/len(scores))

 4
Score:  3262.277885159386
 4
Score:  3842.0535931321924
 5
Score:  2347.015551716689
 3
Score:  1919.2361761909374
 4
Score:  2292.313882848507


Average Prediction Error:  2732.579417809543


In [5]:
# Average Prediction Error:  2732.579417809543