In [1]:
%load_ext autoreload

In [2]:
 %autoreload 2

In [15]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from models import MetaRegression, BSS, SVDPortfolio
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html
from scipy.linalg import svd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.base import clone
from statistics import *

In [None]:
def VBS(performance, X_test):
    """ chooses the best solver for each dataset"""
    best_perf = performance.idxmax(axis="columns")
    return best_perf.loc[best_perf.index.isin(X_test.index)]

# functie om van predictions de accuracies te berekenen
def avg_accuracy(predicted_solvers, performance):
    """ Calculates the average accuracy on dataset given the solvers for each dataset.
    input:
        predicted_solvers: dataframe with each row a dataset and predicted solver in column
        performance: dataframe with each row a dataset and in each column the accuracy of a solver
    return:
        float avg accuracy"""
    return np.mean(performance.lookup(predicted_solvers.index, predicted_solvers.values))


In [None]:
from sklearn.feature_selection import chi2
from sklearn import preprocessing

def find_largest_drop(df):
    """Find the biggest drop, returns cutoff value indicating
    where the array should be cut off. Used for Chi squared filtering"""
    drop = 0
    cutoff_value = 0
    names = df.index.names
    for i in range(df.shape[0] - 1):
        if df.iloc[i][0] - df.iloc[i+1][0] > drop:
            cutoff_value = i
            drop = df.iloc[i][0] - df.iloc[i+1][0]
            
    return cutoff_value


def feature_filtering(X_train, X_test, y_train, model):
    """Returns X_train and X_test filtered using chi squared filtering """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # scale all values of features to range 0-1 because chi squared 
    # cannot handle negative values
    min_max_scaler = preprocessing.MinMaxScaler()

    #get feature names
    chi2_val, pval = chi2(pd.DataFrame(min_max_scaler.fit_transform(X_test.values)), predicted)
    
    #transform numpy array to dataframe to easily get the feature names
    df = pd.DataFrame(data=pval, index=X_test.columns)    
    #sort dataframe
    df.sort_values(by=0, ascending=False, inplace=True)
    
    cutoff_value = find_largest_drop(df)
    features = df.iloc[0:(cutoff_value+1)].index
    
    X_train_filtered = X_train[features]
    X_test_filtered = X_test[features]
    return  X_train_filtered,  X_test_filtered


In [6]:
def crossval10fold(X, y, model, filtering=False):
    kfold = KFold(n_splits=10)
    accuracy = []
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X.iloc[train_index], metafeatures.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        if filtering:
            X_train, X_test = feature_filtering(X_train, X_test, y_train, model)
        model.fit(X_train, y_train)
        accuracy.append(avg_accuracy(model.predict(X_test), y))
    return mean(accuracy)

In [7]:
metadata = pd.read_csv("metadata.csv",usecols=["task_id","flow_id","value"]).set_index("task_id")
metadata = metadata.pivot_table('value', ['task_id'],'flow_id')
data= arff.loadarff('metafeatures-CC18.arff')
metafeatures = pd.DataFrame(data[0]).set_index("task_id")
metafeatures = metafeatures.loc[metafeatures.index.isin(metadata.index)]

In [8]:
regression_models = {
    "Tree":DecisionTreeRegressor(random_state=0),
    "Linear":LinearRegression(),
    "SVM":SVR(),
    "KNN":KNeighborsRegressor(),
    "Forest-500":RandomForestRegressor(n_estimators=500)
}

In [81]:
kfold = KFold(n_splits=10)
solver_list = metadata.columns.to_list()

print("BSS: ", crossval10fold(metafeatures,metadata,BSS(solver_list)))

for model in regression_models:
    print(model, ": ", crossval10fold(metafeatures,metadata,MetaRegression(solver_list,clone(regression_models[model]))))

vbs_accuracy=[]
for train_index, test_index in kfold.split(metafeatures):
    X_train, X_test = metafeatures.iloc[train_index], metafeatures.iloc[test_index]
    vbs_accuracy.append(avg_accuracy(VBS(metadata,X_test), metadata))
print("VBS: ", mean(vbs_accuracy))

# hmm bss is niet de slechtste, dat is vreemd

BSS:  0.8600244008716666
Tree :  0.84309526655
Linear :  0.8144249023483333
SVM :  0.8585198156566667
KNN :  0.85774618276
Forest-500 :  0.843052682075
VBS:  0.87159010936


<h2> Feature filtering </h2>

In [14]:
print("Retraining of models using chi squared filtering")
for model in regression_models:

    print(model, ": ", crossval10fold(metafeatures,metadata,MetaRegression(solver_list,clone(regression_models[model])), True))


Retraining of models using chi squared filtering


NameError: name 'mean' is not defined

<h2> Singular value decomposition </h2>

In [None]:
def latent_crossval(X,model,filtering=0):
    accuracy = []
    for i in range(10):
        X_train = X.sample(frac = 0.9)
        U, s, Vh = svd(X_train)
        df_U = pd.DataFrame(index = X_train.index)
        #m > n dus alleen eerste n cols zijn nodig
        U = U[:,:X_train.shape[1]-0]
        for i in range(U.shape[1]):
            df_U[i] = U[:,i]
        X_test = X.dot(Vh.T).dot(np.linalg.inv(np.diag(s)))

        y_train, y_test = X.loc[df_U.index,:], X.loc[df_U.index,:]
        model.fit(X_train, y_train)
        accuracy.append(avg_accuracy(model.predict(X_test), X))
    return mean(accuracy)

<h2> SVD feature filtering </h2>

In [74]:
# feature filtering with SVD
print("Total number of latent features: ", metadata.shape[1])

for i in range(9):
    print("Accuracy results after filtering out ",i," latent feature(s)")
    print("Tree: ", latent_crossval(metadata,MetaRegression(solver_list,DecisionTreeRegressor(random_state=0)),i))
    print("Linear: ", latent_crossval(metadata,MetaRegression(solver_list,LinearRegression()),i))
    print("SVM: ", latent_crossval(metadata,MetaRegression(solver_list,SVR())))
    print("KNN: ", latent_crossval(metadata,MetaRegression(solver_list,KNeighborsRegressor()),i))
    print("Forest-500: ", latent_crossval(metadata,MetaRegression(solver_list,RandomForestRegressor(n_estimators=500)),i))
    print("")




Total number of latent features:  10
Accuracy results after filtering out  0  latent feature(s)
Tree:  0.7234742424976667
Linear:  0.8112901873168333
SVM:  0.8597770403643332
KNN:  0.8534350029233334
Forest-500:  0.8473332460838333
Accuracy results after filtering out  1  latent feature(s)
Tree:  0.7237499172796666
Linear:  0.8336603551245
SVM:  0.8593654610768332
KNN:  0.8524398226028334
Forest-500:  0.8406587364626666
Accuracy results after filtering out  2  latent feature(s)
Tree:  0.7373680603358334
Linear:  0.8210012501315
SVM:  0.8598104410166666
KNN:  0.8534350029233334
Forest-500:  0.8423247914655
Accuracy results after filtering out  3  latent feature(s)
Tree:  0.724755351571
Linear:  0.8245324341415
SVM:  0.8590092067611665
KNN:  0.8534350029233334
Forest-500:  0.840630959301
Accuracy results after filtering out  4  latent feature(s)
Tree:  0.7362343988765
Linear:  0.8187939527221667
SVM:  0.8599196800064999
KNN:  0.8534350029233334
Forest-500:  0.8488456072781667
Accuracy re

<h2> Estimating latent features </h2>

In [49]:
# U& Vh unitary matrices, s is 1d array met positieve scalars
# a == U @ S@ Vh met S matrix met nullen en s op diag

U, s, Vh = svd(metadata)
df_U = pd.DataFrame(index = metadata.index)
#m > n dus alleen eerste n cols zijn nodig
for i in range(metadata.shape[1]):
    df_U[i] = U[:,i]

print("Latent feature estimation with regression R^2 scores")

cv = KFold(n_splits=10)

print("Tree: ", mean(cross_val_score(DecisionTreeRegressor(random_state=0), metafeatures, df_U, cv=cv, n_jobs=1)))

print("Linear: ", mean(cross_val_score(LinearRegression(), metafeatures, df_U, cv=cv, n_jobs=1)))

print("SVM: ", mean(cross_val_score(SVR(), metafeatures.to_numpy(), df_U.to_numpy(), cv=cv, n_jobs=1,scoring="r2")))

print("KNN: ", mean(cross_val_score(KNeighborsRegressor(), metafeatures.to_numpy(), df_U.to_numpy(), cv=cv, n_jobs=1)))

print("Forest-500: ", mean(cross_val_score(RandomForestRegressor(n_estimators=500), metadata, df_U, cv=cv, n_jobs=1)))

Latent feature estimation with regression crossval scores
Tree:  -2.9955405689906516
Linear:  -262991.03787518444
SVM:  nan
KNN:  -1.0031830926869196
Forest-500:  -0.5114764117777918


In [19]:

print("SVDportfolio, no filtering: ", crossval10fold(metafeatures,metadata, SVDPortfolio(metadata.columns.to_list(), LinearRegression(),RandomForestRegressor(n_estimators=500))))


SVDportfolio, no filtering:  0.8629734388166667
