In [9]:
import itertools

import numpy as np
from sklearn import cross_decomposition
from sklearn import ensemble
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.base import clone
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Sum, ConstantKernel, DotProduct, ExpSineSquared, Matern, PairwiseKernel, RationalQuadratic, RBF, WhiteKernel
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

import union

<h2> All-purpose </h2>

In [24]:
def display_scores(scores:list):
    """
    Displays scores, mean, and std.

    Parameters
    ----------
    scores : list, tuple or array-like
        Numeric scores. A plain list/tuple is converted to a NumPy array
        before the statistics are computed, because those types have no
        ``.mean()`` / ``.std()`` methods. Any other object is used as-is,
        so it must provide ``.mean()`` and ``.std()`` itself.
    """
    print("Scores:", scores)

    # Only sequences without their own statistics methods are converted;
    # objects that already implement mean()/std() keep their own semantics.
    values = np.asarray(scores) if isinstance(scores, (list, tuple)) else scores

    print("Mean:", values.mean())
    print("Standard deviation:", values.std())

In [7]:
def attributes(attribute:dict={}) -> list:
    """
    Expands a grid of hyperparameter candidates into every combination.

    Each key of ``attribute`` is a hyperparameter name and each value is an
    iterable of candidate settings. The result is a list with one dictionary
    per element of the Cartesian product of those candidates, ready to be
    unpacked into a model constructor.

    input f.e.:
    attribute = {'loss': ['ls', 'lad', 'huber', 'quantile'],
     'criterion': ['friedman_mse', 'squared_error', 'mse'],
     'max_features': [None, 'auto', 'sqrt', 'log2']
                }
    """
    # Sorting the items fixes the order of the names (and therefore the key
    # order of every produced dictionary) independently of insertion order.
    ordered_items = sorted(attribute.items())
    names = [name for name, _ in ordered_items]
    candidates = [values for _, values in ordered_items]

    return [dict(zip(names, combination))
            for combination in itertools.product(*candidates)]

<h2> Regression </h2>

In [6]:
def regression_randomized_tuning(X:np.ndarray, y:np.ndarray, distributions:dict, model, n_iter:int=1, r_state:int=None, cv:int=4) -> tuple:
    """
    Hyperparameter tuning for regression model.

    Fits a ``RandomizedSearchCV`` built from ``model`` and ``distributions``
    and reports its error on the training data and under cross-validation.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix.
    y : np.ndarray
        Targets; flattened once with ``np.ravel`` so that fitting, the
        training score and the cross-validation all see the same 1-D target.
    distributions : dict
        Passed to ``RandomizedSearchCV`` as the parameter distributions.
    model
        Estimator to tune.
    n_iter : int
        Number of sampled parameter settings (``RandomizedSearchCV(n_iter=...)``).
    r_state : int, optional
        Passed to ``RandomizedSearchCV`` as ``random_state``.
    cv : int
        Passed to ``cross_val_score`` only; it is not forwarded to the search
        object itself.

    Returns
    -------
    tuple
        ``(rmse_training, rmse_cv, best_params)`` where ``best_params`` is
        ``best_params_`` of the search fitted on the full data.
    """
    y = np.ravel(y)

    reg = RandomizedSearchCV(model, distributions, n_iter=n_iter, random_state=r_state)
    reg.fit(X, y)

    predictions = reg.predict(X)
    rmse_training = np.sqrt(mean_squared_error(y, predictions))

    # The search object itself is what gets cross-validated here, so the
    # scores describe the tuning procedure rather than one fixed parameter set.
    scores = cross_val_score(reg, X, y,
                             scoring="neg_mean_squared_error", cv=cv)

    # Scores are negated MSE values, hence the sign flip before the root.
    rmse_cv = np.sqrt(-scores).mean()

    return rmse_training, rmse_cv, reg.best_params_

In [None]:
def regression_linear_models(train_set_ready:np.ndarray, train_set_labels:np.ndarray, cv:int=4):
    """
    Evaluation of linear models with some basic settings for training and cv set.

    Every estimator named in the local ``models`` table is fetched from
    ``linear_model`` by name, instantiated once per attribute dictionary,
    fitted on the whole training set and then scored with ``cross_val_score``.

    Parameters
    ----------
    train_set_ready : np.ndarray
        Feature matrix used for fitting, training predictions and
        cross-validation.
    train_set_labels : np.ndarray
        Targets; flattened with ``np.ravel`` before use.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{"<model name><attribute dict>": {'rmse_training': ..., 'rmse_cv': ...}}``
    """

    ridge_attributes = {'solver': ['svd', 'lsqr', 'sag', 'cholesky', 'sparse_cg', 'saga']}

    sgd_attributes = {'penalty': ['l1', 'l2', 'elasticnet'],
                      'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']}

    # NOTE(review): LogisticRegression is a classifier - confirm that it is
    # meant to be part of a regression sweep.
    models = {'LinearRegression': {'attr': [{}]},
               'Ridge': {'attr': attributes(ridge_attributes)},
               'Lasso': {'attr': [{'selection': 'cyclic'}, {'selection': 'random'}]},
               'ElasticNet': {'attr': [{'selection': 'cyclic'}, {'selection': 'random'}]},
               'Lars': {'attr': [{}]},
               'LassoLars': {'attr': [{}]},
               'OrthogonalMatchingPursuit': {'attr': [{}]},
               'BayesianRidge': {'attr': [{}]},
               'ARDRegression': {'attr': [{}]},
               'LogisticRegression': {'attr': [{'penalty': 'l1', 'solver': 'liblinear'},
                                              {'penalty': 'l1', 'solver': 'saga'},
                                              {'penalty': 'l2', 'solver': 'newton-cg'},
                                              {'penalty': 'l2', 'solver': 'lbfgs'},
                                              {'penalty': 'l2', 'solver': 'liblinear'},
                                              {'penalty': 'l2', 'solver': 'sag'},
                                              {'penalty': 'l2', 'solver': 'saga'},
                                              {'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio':0.5},
                                              {'penalty': 'none', 'solver': 'newton-cg'},
                                              {'penalty': 'none', 'solver': 'lbfgs'},
                                              {'penalty': 'none', 'solver': 'sag'},
                                              {'penalty': 'none', 'solver': 'saga'}]},
               'SGDRegressor': {'attr': attributes(sgd_attributes)},
               'PassiveAggressiveRegressor': {'attr': [{'loss': 'epsilon_insensitive'},
                                                       {'loss': 'squared_epsilon_insensitive'}]},
               'HuberRegressor': {'attr': [{}]},
               'TweedieRegressor': {'attr': [{'link': 'identity'}, {'link': 'log'}]},
               'TheilSenRegressor': {'attr': [{}]},
               'RANSACRegressor': {'attr': [{}]}
              }

    final_models = {}

    train_set_labels = np.ravel(train_set_labels)
    for model_key, spec in sorted(models.items()):
        print(model_key,': ')
        print(spec['attr'])
        estimator_class = getattr(linear_model, model_key)
        for params in spec['attr']:
            reg = estimator_class(**params)
            reg.fit(train_set_ready, train_set_labels)

            predictions = reg.predict(train_set_ready)
            rmse_training = np.sqrt(mean_squared_error(train_set_labels, predictions))

            # Cross-validate on the features passed to this function, not on
            # a variable that happens to exist in the notebook namespace.
            scores = cross_val_score(reg, train_set_ready, train_set_labels,
                                     scoring="neg_mean_squared_error", cv=cv)

            # Scores are negated MSE values, hence the sign flip before the root.
            rmse_cv = np.sqrt(-scores).mean()

            model_name = str(model_key) + str(params)

            final_models[model_name] = {'rmse_training': rmse_training, 'rmse_cv': rmse_cv}

    return final_models

In [12]:
def regression_kernelridge(train_set_ready:np.ndarray, train_set_labels:np.ndarray, cv:int=4):
    """
    Evaluation of kernel ridge with some basic settings for training and cv set.

    One ``KernelRidge`` is built per entry of the local ``kernels`` list,
    fitted on the whole training set and then scored with ``cross_val_score``.

    Parameters
    ----------
    train_set_ready : np.ndarray
        Feature matrix used for fitting, training predictions and
        cross-validation.
    train_set_labels : np.ndarray
        Targets, passed on unchanged.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{"KernelRidge <kernel>": {'rmse_training': ..., 'rmse_cv': ...}}``
    """
    kernels = [ConstantKernel(), DotProduct(), ExpSineSquared(), Matern(),
              PairwiseKernel(), RationalQuadratic(), RBF(), WhiteKernel(),
              'linear']

    final_models = {}

    for kernel in kernels:

        reg = KernelRidge(kernel=kernel)
        reg.fit(train_set_ready, train_set_labels)

        # Training error is measured on the same features the model was fitted on.
        predictions = reg.predict(train_set_ready)
        rmse_training = np.sqrt(mean_squared_error(train_set_labels, predictions))

        reg_scores = cross_val_score(reg, train_set_ready, train_set_labels,
                                     scoring="neg_mean_squared_error", cv=cv)

        # Scores are negated MSE values, hence the sign flip before the root.
        rmse_cv = np.sqrt(-reg_scores).mean()

        final_models['KernelRidge ' + str(kernel)] = {'rmse_training': rmse_training, 'rmse_cv': rmse_cv}

    return final_models

In [13]:
def regression_svm(train_set_ready:np.ndarray, train_set_labels:np.ndarray, cv:int=4):
    """
    Evaluation of svm models with some basic settings for training and cv set.

    Every estimator named in the local ``models`` table is fetched from
    ``svm`` by name, instantiated once per attribute dictionary, fitted on
    the whole training set and then scored with ``cross_val_score``.

    Parameters
    ----------
    train_set_ready : np.ndarray
        Feature matrix used for fitting, training predictions and
        cross-validation.
    train_set_labels : np.ndarray
        Targets; flattened with ``np.ravel`` before use.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{"<model name><attribute dict>": {'rmse_training': ..., 'rmse_cv': ...}}``
    """

    svr_attributes = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

    nusvr_attributes = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

    models = {'SVR': {'attr': attributes(svr_attributes)},
              'NuSVR': {'attr': attributes(nusvr_attributes)},
              'LinearSVR': {'attr': [{'loss': 'epsilon_insensitive'},
                                     {'loss': 'squared_epsilon_insensitive'}]},
             }

    final_models = {}

    train_set_labels = np.ravel(train_set_labels)
    for model_key, spec in sorted(models.items()):
        print(model_key,': ')
        print(spec['attr'])
        estimator_class = getattr(svm, model_key)
        for params in spec['attr']:
            reg = estimator_class(**params)
            reg.fit(train_set_ready, train_set_labels)

            predictions = reg.predict(train_set_ready)
            rmse_training = np.sqrt(mean_squared_error(train_set_labels, predictions))

            # Cross-validate on the features passed to this function, not on
            # a variable that happens to exist in the notebook namespace.
            scores = cross_val_score(reg, train_set_ready, train_set_labels,
                                     scoring="neg_mean_squared_error", cv=cv)

            # Scores are negated MSE values, hence the sign flip before the root.
            rmse_cv = np.sqrt(-scores).mean()

            model_name = str(model_key) + str(params)

            final_models[model_name] = {'rmse_training': rmse_training, 'rmse_cv': rmse_cv}

    return final_models

In [14]:
def regression_gaussianprocess(train_set_ready:np.ndarray, train_set_labels:np.ndarray, cv:int=4):
    """
    Evaluation of gaussian process model with some basic settings for training and cv set.

    One ``GaussianProcessRegressor`` is built per entry of the local
    ``kernels`` list, fitted on the whole training set and then scored with
    ``cross_val_score``. A kernel whose evaluation raises is reported on
    stdout and left out of the result; the remaining kernels are still run.

    Parameters
    ----------
    train_set_ready : np.ndarray
        Feature matrix used for fitting, training predictions and
        cross-validation.
    train_set_labels : np.ndarray
        Targets, passed on unchanged.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{"GaussianProcessRegressor <kernel>": {'rmse_training': ..., 'rmse_cv': ...}}``
        for every kernel that could be evaluated.
    """

    kernels = [ConstantKernel(), DotProduct(), ExpSineSquared(), Matern(),
              PairwiseKernel(), RationalQuadratic(), RBF(), WhiteKernel()]

    final_models = {}

    for kernel in kernels:
        try:
            reg = GaussianProcessRegressor(kernel=kernel)
            reg.fit(train_set_ready, train_set_labels)

            # Training error is measured on the same features the model was fitted on.
            predictions = reg.predict(train_set_ready)
            rmse_training = np.sqrt(mean_squared_error(train_set_labels, predictions))

            reg_scores = cross_val_score(reg, train_set_ready, train_set_labels,
                                         scoring="neg_mean_squared_error", cv=cv)

            # Scores are negated MSE values, hence the sign flip before the root.
            rmse_cv = np.sqrt(-reg_scores).mean()

            final_models['GaussianProcessRegressor ' + str(kernel)] = {'rmse_training': rmse_training, 'rmse_cv': rmse_cv}
        except Exception as e:
            # Best effort: name the failing kernel so that a skipped entry is
            # visible instead of silently missing from the result.
            print('GaussianProcessRegressor ' + str(kernel) + ' failed:', e)

    return final_models

In [None]:
def regression_crossdecomposition(train_set_ready:np.ndarray, train_set_labels:np.ndarray, cv:int=4):
    """
    Evaluation of cross decomposition models with some basic settings for training and cv set.

    Every estimator named in the local ``models`` table is fetched from
    ``cross_decomposition`` by name, instantiated once per attribute
    dictionary, fitted on the whole training set and then scored with
    ``cross_val_score``.

    Parameters
    ----------
    train_set_ready : np.ndarray
        Feature matrix used for fitting, training predictions and
        cross-validation.
    train_set_labels : np.ndarray
        Targets; flattened with ``np.ravel`` before use.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{"<model name><attribute dict>": {'rmse_training': ..., 'rmse_cv': ...}}``
    """

    models = {'PLSRegression': {'attr': [{}]},
              'PLSCanonical': {'attr': [{'algorithm': 'nipals'},
                                        {'algorithm': 'svd'}]},
              'CCA': {'attr': [{}]}
             }

    final_models = {}

    train_set_labels = np.ravel(train_set_labels)
    for model_key, spec in sorted(models.items()):
        print(model_key,': ')
        print(spec['attr'])
        estimator_class = getattr(cross_decomposition, model_key)
        for params in spec['attr']:
            reg = estimator_class(**params)
            reg.fit(train_set_ready, train_set_labels)

            predictions = reg.predict(train_set_ready)
            rmse_training = np.sqrt(mean_squared_error(train_set_labels, predictions))

            # Cross-validate on the features passed to this function, not on
            # a variable that happens to exist in the notebook namespace.
            scores = cross_val_score(reg, train_set_ready, train_set_labels,
                                     scoring="neg_mean_squared_error", cv=cv)

            # Scores are negated MSE values, hence the sign flip before the root.
            rmse_cv = np.sqrt(-scores).mean()

            model_name = str(model_key) + str(params)

            final_models[model_name] = {'rmse_training': rmse_training, 'rmse_cv': rmse_cv}

    return final_models

In [15]:
def regression_decisiontree(train_set_ready:np.ndarray, train_set_labels:np.ndarray, cv:int=4):
    """
    Evaluation of decision tree model with some basic settings for training and cv set.

    A ``tree.DecisionTreeRegressor`` is built for every combination of the
    local ``criterion``, ``splitter`` and ``max_features`` lists, fitted on
    the whole training set and then scored with ``cross_val_score``.

    Parameters
    ----------
    train_set_ready : np.ndarray
        Feature matrix used for fitting, training predictions and
        cross-validation.
    train_set_labels : np.ndarray
        Targets, passed on unchanged.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{"DecisionTreeRegressor <attribute dict>": {'rmse_training': ..., 'rmse_cv': ...}}``
    """

    criterion = ['mse', 'friedman_mse', 'mae', 'poisson']
    splitter = ['best','random']
    max_features = [None, 'auto', 'sqrt', 'log2']

    # Built locally (and under a name that does not hide the notebook's
    # attributes() helper) so that the key order of every dictionary, which
    # ends up in the result keys, stays criterion / splitter / max_features.
    param_grid = [{'criterion': crit, 'splitter': split, 'max_features': feats}
                  for crit in criterion
                  for split in splitter
                  for feats in max_features]

    final_models = {}

    for attr in param_grid:

        reg = tree.DecisionTreeRegressor(**attr)
        reg.fit(train_set_ready, train_set_labels)

        # Training error is measured on the same features the model was fitted on.
        predictions = reg.predict(train_set_ready)
        rmse_training = np.sqrt(mean_squared_error(train_set_labels, predictions))

        reg_scores = cross_val_score(reg, train_set_ready, train_set_labels,
                                     scoring="neg_mean_squared_error", cv=cv)

        # Scores are negated MSE values, hence the sign flip before the root.
        rmse_cv = np.sqrt(-reg_scores).mean()

        final_models['DecisionTreeRegressor ' + str(attr)] = {'rmse_training': rmse_training, 'rmse_cv': rmse_cv}

    return final_models

In [16]:
def regression_ensemble(train_set_ready:np.ndarray, train_set_labels:np.ndarray, cv:int=4):
    """
    Evaluation of ensemble models with some basic settings for training and cv set.

    Every estimator named in the local ``models`` table is fetched from
    ``ensemble`` by name, instantiated once per attribute dictionary, fitted
    on the whole training set and then scored with ``cross_val_score``.

    Parameters
    ----------
    train_set_ready : np.ndarray
        Feature matrix used for fitting, training predictions and
        cross-validation.
    train_set_labels : np.ndarray
        Targets; flattened with ``np.ravel`` before use.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{"<model name><attribute dict>": {'rmse_training': ..., 'rmse_cv': ...}}``
    """

    # Base learners handed to VotingRegressor and StackingRegressor
    estimators = [('ridge', RidgeCV()),
          ('lasso', LassoCV(random_state=42)),
          ('knr', KNeighborsRegressor(n_neighbors=20,
                                      metric='euclidean'))]

    abr_attributes = {'loss': ['linear', 'square', 'exponential']}

    etr_attributes = {'criterion': ['mse', 'mae'],
                      'max_features': ['sqrt', 'log2', None, 1]}

    gbr_attributes = {'loss': ['ls', 'lad', 'huber', 'quantile'],
     'criterion': ['friedman_mse', 'mse'],
     'max_features': [None, 'auto', 'sqrt', 'log2']}

    rf_attributes = {'criterion': ['mse', 'mae', 'poisson'],
                      'max_features': ['sqrt', 'log2', None, 1]}

    # NOTE(review): IsolationForest is an outlier detector rather than a
    # regressor - confirm that it is meant to be scored here.
    models = {'AdaBoostRegressor': {'attr': attributes(abr_attributes)},
              'BaggingRegressor': {'attr': [{}]},
              'ExtraTreesRegressor': {'attr': attributes(etr_attributes)},
              'GradientBoostingRegressor': {'attr': attributes(gbr_attributes)},
              'IsolationForest': {'attr': [{}]},
              'RandomForestRegressor': {'attr': attributes(rf_attributes)},
              'VotingRegressor': {'attr': [{'estimators': estimators}]},
              'StackingRegressor': {'attr': [{'estimators': estimators}]}
              }

    final_models = {}

    train_set_labels = np.ravel(train_set_labels)
    for model_key, spec in sorted(models.items()):
        print(model_key,': ')
        print(spec['attr'])
        estimator_class = getattr(ensemble, model_key)
        for params in spec['attr']:
            reg = estimator_class(**params)
            reg.fit(train_set_ready, train_set_labels)

            predictions = reg.predict(train_set_ready)
            rmse_training = np.sqrt(mean_squared_error(train_set_labels, predictions))

            # Cross-validate on the features passed to this function, not on
            # a variable that happens to exist in the notebook namespace.
            scores = cross_val_score(reg, train_set_ready, train_set_labels,
                                     scoring="neg_mean_squared_error", cv=cv)

            # Scores are negated MSE values, hence the sign flip before the root.
            rmse_cv = np.sqrt(-scores).mean()

            model_name = str(model_key) + str(params)

            final_models[model_name] = {'rmse_training': rmse_training, 'rmse_cv': rmse_cv}

    return final_models

In [17]:
def regression_neuralnetwork(train_set_ready:np.ndarray, train_set_labels:np.ndarray, cv:int=4):
    """
    Evaluation of neural network with some basic settings for training and cv set.

    An ``MLPRegressor`` is built for every combination of the local
    ``nn_attributes`` grid, fitted on the whole training set and then scored
    with ``cross_val_score``.

    Parameters
    ----------
    train_set_ready : np.ndarray
        Feature matrix used for fitting, training predictions and
        cross-validation.
    train_set_labels : np.ndarray
        Targets, passed on unchanged.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    dict
        ``{"MLPRegressor <attribute dict>": {'rmse_training': ..., 'rmse_cv': ...}}``
    """

    nn_attributes = {'activation': ['identity', 'logistic', 'relu', 'tanh'],
     'solver': ['lbfgs', 'sgd', 'adam'],
     'learning_rate': ['constant', 'invscaling', 'adaptive']}

    attributes_list = attributes(nn_attributes)

    final_models = {}

    for attr in attributes_list:

        reg = MLPRegressor(**attr)
        reg.fit(train_set_ready, train_set_labels)

        # Training error is measured on the same features the model was fitted on.
        predictions = reg.predict(train_set_ready)
        rmse_training = np.sqrt(mean_squared_error(train_set_labels, predictions))

        reg_scores = cross_val_score(reg, train_set_ready, train_set_labels,
                                     scoring="neg_mean_squared_error", cv=cv)

        # Scores are negated MSE values, hence the sign flip before the root.
        rmse_cv = np.sqrt(-reg_scores).mean()

        final_models['MLPRegressor ' + str(attr)] = {'rmse_training': rmse_training, 'rmse_cv': rmse_cv}

    return final_models

<h2> Classification </h2>

In [1]:
def pixels_to_image(x:np.ndarray, im_height:int, im_width:int, labels:dict=None, y:np.ndarray=None):
    """
    Displays image from a given row of pixels.

    ``x`` is reshaped to ``(im_height, im_width)`` and shown without axes
    using the binary colormap. When both ``labels`` and ``y`` are given,
    ``labels[y]`` is printed after the figure has been shown.
    """
    plt.imshow(x.reshape(im_height, im_width), cmap=mpl.cm.binary, interpolation='nearest')
    plt.axis('off')
    plt.show()

    # A caption needs the lookup table as well as the key into it
    if labels is not None and y is not None:
        print(labels[y])

In [18]:
def plot_image_class(instances:np.ndarray, images_per_row:int, im_height:int, im_width:int, **options):
    """
    Plots a grid of images for given instances.

    The rows of ``instances`` are laid out left to right, top to bottom, with
    at most ``images_per_row`` images per row; unused cells of the last row
    are filled with zeros. Extra keyword arguments are forwarded to
    ``plt.imshow``.
    """
    n_images = len(instances)
    per_row = min(n_images, images_per_row)

    # Smallest number of rows that can hold every image
    row_count = (n_images - 1) // per_row + 1

    # Fill the unused cells of the grid with all-zero images
    blanks = np.zeros((row_count * per_row - n_images, im_height * im_width))
    filled = np.concatenate([instances, blanks], axis=0)

    # (row, column, height, width) -> (row, height, column, width): after this
    # swap a plain reshape stitches the images of one grid row side by side.
    grid = filled.reshape((row_count, per_row, im_height, im_width))
    mosaic = grid.transpose(0, 2, 1, 3).reshape(row_count * im_height,
                                                per_row * im_width)

    plt.imshow(mosaic, cmap = mpl.cm.binary, **options)
    plt.axis("off")
In [23]:
def my_cross_val_score(X_train:np.ndarray, y_train:np.ndarray, model, n_splits:int, r_state:int=42):
    """
    Use to have more control in a CV than in sklearn's counterpart.

    The data is split with a shuffled ``StratifiedKFold``; for each split a
    clone of ``model`` is fitted on the training part and the fraction of
    correct predictions on the held-out part is printed. Nothing is returned.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=r_state)

    for fit_idx, eval_idx in splitter.split(X_train, y_train):
        # Fit a clone so that the estimator passed in is never fitted itself
        fold_model = clone(model)
        fold_model.fit(X_train[fit_idx], y_train[fit_idx])

        expected = y_train[eval_idx]
        predicted = fold_model.predict(X_train[eval_idx])

        # Accuracy of this fold
        hits = sum(predicted == expected)
        print(hits / len(predicted))

In [25]:
def plot_precision_recall_vs_threshold(precisions:list, recalls:list, thresholds:list, x_limits:tuple=(-50000, 50000)):
    """
    Precision-recall vs threshold plot.

    Parameters
    ----------
    precisions, recalls : sequence
        Curve values. The last entry of each is dropped before plotting so
        that the lengths line up with ``thresholds`` (presumably the output
        of sklearn's ``precision_recall_curve`` - confirm against the caller).
    thresholds : sequence
        Decision thresholds used as x values.
    x_limits : tuple, default (-50000, 50000)
        ``(x_min, x_max)`` of the visible threshold range. The default keeps
        the previously hard-coded range; pass e.g. ``(0, 1)`` for scores
        that are probabilities.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)

    # Precision and recall live in [0, 1]; only the x range is configurable.
    x_min, x_max = x_limits
    plt.axis([x_min, x_max, 0, 1])

In [21]:
def plot_precision_vs_recall(precisions:list, recalls:list):
    """
    Precision vs recall plot.

    Draws ``precisions`` (y) against ``recalls`` (x) as a solid blue line on
    axes fixed to the unit square, with a grid and labelled axes.
    """
    label_size = 16
    unit_square = [0, 1, 0, 1]

    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.axis(unit_square)
    plt.grid(True)
    plt.xlabel("Recall", fontsize=label_size)
    plt.ylabel("Precision", fontsize=label_size)

In [22]:
def plot_roc_curve(fpr:list, tpr:list, label=None):
    """
    Roc-curve plot.

    Draws ``tpr`` against ``fpr`` (optionally labelled with ``label``)
    together with a dashed black diagonal, on axes fixed to the unit square
    with a grid and labelled axes.
    """
    label_size = 16
    unit_square = [0, 1, 0, 1]

    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal for visual reference
    plt.plot([0, 1], [0, 1], 'k--')

    plt.axis(unit_square)
    plt.grid(True)
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=label_size)
    plt.ylabel('True Positive Rate (Recall)', fontsize=label_size)