# K-fold Cross-Validation and Validation Curve

The *K-fold Cross-Validation* algorithm is built from scratch in the form of the `ValidationCurve` Python class, which is used to tune the Ridge hyperparameter and the number of Principal Components.

In [None]:
# Validation curve with k-fold Cross-Validation algorithm from scratch

class ValidationCurve:
    """Validation curves computed with a hand-written k-fold cross-validation.

    The class tunes two quantities for a Ridge learner: the regularisation
    strength (``val_curve_ridge``) and, afterwards, the number of principal
    components kept by a PCA step (``val_curve_pca``).

    The names ``np``, ``plt``, ``Ridge``, ``PCA`` and
    ``features_transformation`` must be available in the enclosing module.

    Parameters
    ----------
    features, labels
        pandas objects with one row per observation and the same number of
        rows. Rows are selected by position (``.iloc``), so any index works.
    feat_scaling
        Value handed to ``features_transformation``; ``None`` disables the
        scaling step.
    include_intercept
        Forwarded to ``Ridge`` as its ``intercept`` argument.

    Attributes
    ----------
    best_alpha, dict_cv
        Set by ``val_curve_ridge``.
    best_npc, dict_cv_pca
        Set by ``val_curve_pca``.
    """

    # Create the ValidationCurve object
    def __init__(self, features, labels, feat_scaling, include_intercept = True):
        self.features = features
        self.labels = labels
        self.feat_scaling = feat_scaling
        self.include_intercept = include_intercept

    # Shuffle the row positions and split them into k (train, validation) pairs
    def _fold_positions(self, k_folds, seed):
        """Return a list of ``(train_positions, val_positions)`` arrays.

        The positions are integer row offsets, shuffled reproducibly with
        ``seed``. Working with positions instead of index labels keeps the
        split correct for non-default or duplicated DataFrame indexes.

        Raises
        ------
        ValueError
            If features and labels differ in length, or if ``k_folds`` is not
            between 2 and the number of observations.
        """
        n_samples = len(self.features)
        if len(self.labels) != n_samples:
            raise ValueError(
                "features and labels must have the same number of rows "
                f"({n_samples} != {len(self.labels)})"
            )
        if not 2 <= k_folds <= n_samples:
            raise ValueError(
                f"k_folds must be between 2 and the number of rows ({n_samples}), "
                f"got {k_folds}"
            )

        shuffled = np.random.RandomState(seed=seed).permutation(n_samples)
        chunks = np.array_split(shuffled, k_folds)
        return [
            (np.concatenate(chunks[:i] + chunks[i + 1:]), chunks[i])
            for i in range(k_folds)
        ]

    # Slice one fold out of the data and scale it
    def _fold_data(self, train_pos, val_pos):
        """Return ``(f_train, l_train, f_val, l_val)`` for one fold.

        When ``feat_scaling`` is set, the transformer is fitted on the
        training rows only and then applied to the validation rows, so no
        information from the validation fold leaks into training.
        """
        f_train, l_train = self.features.iloc[train_pos], self.labels.iloc[train_pos]
        f_val, l_val = self.features.iloc[val_pos], self.labels.iloc[val_pos]

        if self.feat_scaling is not None:
            transformer = features_transformation(self.feat_scaling)
            f_train = transformer.fit(f_train)
            f_val = transformer.test_transform(f_val)

        return f_train, l_train, f_val, l_val

    # Train on the training folds and score both splits with the square loss
    @staticmethod
    def _fit_and_score(learner, f_train, l_train, f_val, l_val):
        """Fit ``learner`` and return ``(training_error, validation_error)``."""
        learner.fit(f_train, l_train)

        train_pred = learner.predict(f_train)
        train_err = np.mean((train_pred - l_train) ** 2)

        # NOTE(review): pred_err() only receives the labels, so it presumably
        # scores the most recent predict() call. Keep this predict() directly
        # before it and after the training-set prediction.
        learner.predict(f_val)
        val_err = learner.pred_err(l_val)

        return train_err, val_err

    # Display the Validation Curve for the Hyperparameter Tuning of the Ridge learner
    def val_curve_ridge(self, grid_alpha, k_folds, seed, show_plot = True):
        """Cross-validate a Ridge learner over a grid of ``alpha`` values.

        Parameters
        ----------
        grid_alpha
            Sequence of candidate hyperparameter values.
        k_folds
            Number of cross-validation folds.
        seed
            Seed of the random shuffle applied before splitting.
        show_plot
            When true, draw the validation curve and print the summary.

        Returns
        -------
        tuple
            ``(train_errors, cross_val_errors)``, two arrays of shape
            ``(len(grid_alpha), k_folds)``.

        Notes
        -----
        Stores the winning value in ``self.best_alpha`` (first one on ties)
        and a summary dictionary in ``self.dict_cv``.
        """
        folds = self._fold_positions(k_folds, seed)

        # Arrays for tracking Training and Validation errors
        train_errors = np.zeros((len(grid_alpha), k_folds))
        cross_val_errors = np.zeros((len(grid_alpha), k_folds))

        for row, alpha in enumerate(grid_alpha):
            for col, (train_pos, val_pos) in enumerate(folds):
                # A fresh learner and freshly scaled data for every fit
                ridge_learner = Ridge(alpha = alpha, intercept = self.include_intercept)
                f_train, l_train, f_val, l_val = self._fold_data(train_pos, val_pos)
                train_errors[row, col], cross_val_errors[row, col] = self._fit_and_score(
                    ridge_learner, f_train, l_train, f_val, l_val)

        # Average and spread of the errors for each value in the grid
        val_score_mean = np.mean(cross_val_errors, axis=1)
        val_score_std = np.std(cross_val_errors, axis=1)
        train_score_mean = np.mean(train_errors, axis=1)
        train_score_std = np.std(train_errors, axis=1)

        # Best value of the Hyperparameter: lowest Risk Estimate, first on ties
        best = int(np.argmin(val_score_mean))
        self.best_alpha = grid_alpha[best]

        # Summary is stored even when nothing is plotted or printed
        self.dict_cv = {'Best Hyperparameter': self.best_alpha,
                        'Cross-Validation Risk Estimate': np.format_float_scientific(val_score_mean[best])}

        # Plot the Validation Curve
        if show_plot:
            plt.figure(figsize=(12,7))
            plt.plot(grid_alpha, val_score_mean, 'o-', label='CV error estimate')
            plt.plot(grid_alpha, train_score_mean, 'o-', label='Training error')

            plt.fill_between(grid_alpha, val_score_mean - val_score_std,
                             val_score_mean + val_score_std,
                             alpha=0.1, color='r', label = 'CV estimate std. deviation')
            plt.fill_between(grid_alpha, train_score_mean - train_score_std,
                             train_score_mean + train_score_std,
                             alpha=0.1, color='g', label = 'Train error std. deviation')

            plt.legend(loc = 'upper left')
            plt.xlabel('alpha')
            plt.ylabel('Average square loss')
            plt.annotate('min.', xy=(grid_alpha[best], val_score_mean[best]),
                         xytext=(grid_alpha[best], val_score_mean[best] + 0.01*val_score_mean[best]),
                         arrowprops=dict(arrowstyle="->"))
            plt.show()

            # Print the Best Hyperparameter and the corresponding Risk Estimate
            print(self.dict_cv)

        return train_errors, cross_val_errors

    # Select the best number of Principal Components according to the corresponding Risk Estimates
    def val_curve_pca(self, grid_princomp, k_pcafolds, seed, show_plot = True):
        """Cross-validate the number of principal components fed to Ridge.

        The Ridge learner uses ``self.best_alpha``, so ``val_curve_ridge``
        must have been run (or ``best_alpha`` set) beforehand.

        Parameters
        ----------
        grid_princomp
            Sequence of candidate numbers of principal components.
        k_pcafolds
            Number of cross-validation folds.
        seed
            Seed of the random shuffle applied before splitting.
        show_plot
            When true, draw the validation curve and print the summary.

        Returns
        -------
        tuple
            ``(train_pca_curve_err, val_pca_curve_err)``, two arrays of shape
            ``(len(grid_princomp), k_pcafolds)``.

        Raises
        ------
        AttributeError
            If ``best_alpha`` has not been set yet.

        Notes
        -----
        Stores the winning value in ``self.best_npc`` (first one on ties)
        and a summary dictionary in ``self.dict_cv_pca``.
        """
        if not hasattr(self, "best_alpha"):
            raise AttributeError(
                "best_alpha is not set: run val_curve_ridge() before val_curve_pca()"
            )

        folds = self._fold_positions(k_pcafolds, seed)

        # Track the Training and Validation errors for each number of Principal Components
        train_pca_curve_err = np.zeros((len(grid_princomp), k_pcafolds))
        val_pca_curve_err = np.zeros((len(grid_princomp), k_pcafolds))

        for row, n_comp in enumerate(grid_princomp):
            for col, (train_pos, val_pos) in enumerate(folds):
                ridge_learner = Ridge(alpha = self.best_alpha, intercept = self.include_intercept)
                f_train, l_train, f_val, l_val = self._fold_data(train_pos, val_pos)

                # PCA is computed on the Training folds only; the Validation
                # fold is projected with that same PCA object
                pca = PCA(f_train)
                pca.singular_values()
                f_train = pca.projected_features(n_comp, False)
                f_val = pca.project_test(f_val)

                train_pca_curve_err[row, col], val_pca_curve_err[row, col] = self._fit_and_score(
                    ridge_learner, f_train, l_train, f_val, l_val)

        # Average Training and Validation Errors for each number of Principal Components
        train_score_pca_mean = np.mean(train_pca_curve_err, axis = 1)
        val_score_pca_mean = np.mean(val_pca_curve_err, axis = 1)

        # Best number of Principal Components: lowest Risk Estimate, first on ties
        best = int(np.argmin(val_score_pca_mean))
        self.best_npc = grid_princomp[best]

        # Summary is stored even when nothing is plotted or printed
        self.dict_cv_pca = {'Best Number of Principal Components': self.best_npc,
                            'Cross-Validation Risk Estimate': np.format_float_scientific(val_score_pca_mean[best])}

        # Plot the Validation Curve over the number of Principal Components
        if show_plot:
            plt.figure(figsize=(12,7))
            plt.plot(grid_princomp, val_score_pca_mean, 'o-', label='CV error estimate')
            plt.plot(grid_princomp, train_score_pca_mean, 'o-', label='Training error')

            plt.legend(loc = 'upper left')
            plt.xlabel('principal components')
            plt.ylabel('average square loss')
            plt.annotate('min.', xy=(grid_princomp[best], val_score_pca_mean[best]),
                         xytext=(grid_princomp[best], 2.5*val_score_pca_mean[best]),
                         arrowprops=dict(arrowstyle="->"))
            plt.show()

            print(self.dict_cv_pca)

        return train_pca_curve_err, val_pca_curve_err