In [1]:
import sys
import os
import pandas as pd
import warnings
import numpy as np
from datetime import date
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
sys.path.append(os.path.abspath("/projects/genomic-ml/da2343/ml_project_1/shared"))
from model_header import *
from constants import *

# Suppress all warnings and let numpy print arrays without truncation.
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=np.inf)

# One row of params.csv supplies the settings that are read below.
params_df = pd.read_csv("params.csv")

# Exactly one command-line argument selects the row to use; with any other
# argument count the first row is used.
if len(sys.argv) != 2:
    print("len(sys.argv)=%d so trying first param" % len(sys.argv))
    param_row = 0
else:
    prog_name, task_str = sys.argv
    param_row = int(task_str)

# Pull the three settings out of the chosen row.
param_dict = dict(params_df.iloc[param_row, :])
data_set_name, n_sub_samples, index_of_pred_col = (
    param_dict[column]
    for column in ("Dataset", "# of Total Samples", "Index of Prediction Col")
)




class SpearmanRankRegressor(BaseEstimator, RegressorMixin):
    """Regressor that averages per-feature straight-line fits on scaled ranks.

    ``fit`` replaces every feature column and the target by their ranks,
    rescales the ranks (min-max scaling followed by standardisation) and
    derives, for each feature on its own, a line that maps the scaled
    feature rank to the scaled target rank.  A feature only gets its own
    line when the absolute correlation between the two scaled rank vectors
    is strictly greater than ``threshold``.

    ``predict`` ranks and rescales the rows it is given, applies each
    feature's line, converts the predicted scaled target ranks back to the
    target scale by interpolating through the training
    ``(scaled rank, target)`` pairs, and returns the mean over features.

    NOTE(review): ``ss`` and ``interpolate`` are not defined in this block;
    they are presumably ``scipy.stats`` and ``scipy.interpolate`` supplied
    by the star imports at the top of the file -- confirm.

    Parameters
    ----------
    threshold : float, default=0.0
        Minimum absolute correlation (exclusive) a feature needs in order
        to receive its own slope and intercept.

    Attributes set by ``fit``
    -------------------------
    y_train : numpy.ndarray
        Training targets as a flat array.
    y_train_ranked_transf : numpy.ndarray
        Scaled ranks of ``y_train``.
    slope_list, intercept_list : list
        One entry per feature column; ``None`` marks a feature whose
        correlation did not exceed ``threshold``.
    """

    def __init__(self, threshold=0.0):
        self.threshold = threshold
        # preprocessor1 scales feature ranks, preprocessor2 scales target ranks.
        self.preprocessor1 = self._make_rank_scaler()
        self.preprocessor2 = self._make_rank_scaler()

    @staticmethod
    def _make_rank_scaler():
        """Return an unfitted min-max + standardisation pipeline."""
        return make_pipeline(
            MinMaxScaler(),
            StandardScaler(),
        )

    @staticmethod
    def _mean_or_zero(values):
        """Return the mean of the non-``None`` entries, or 0 if there are none."""
        fitted = [value for value in values if value is not None]
        return np.mean(fitted) if fitted else 0

    def fit(self, X, y):
        """Fit one line per feature column on the scaled ranks.

        Parameters
        ----------
        X : array-like, two-dimensional (rows are samples)
        y : array-like with one target value per row of ``X``

        Returns
        -------
        self
        """
        # predict() indexes the stored targets by position, so keep a flat
        # ndarray instead of whatever container the caller passed in.
        self.y_train = np.asarray(y).ravel()
        self.y_train_ranked_transf = self.preprocessor2.fit_transform(
            ss.rankdata(self.y_train).reshape(-1, 1)).flatten()
        X_train_ranked_transf = self.preprocessor1.fit_transform(
            ss.rankdata(X, axis=0))

        params = [
            self.find_model_params(
                X_train_ranked_transf[:, index_col], self.y_train_ranked_transf)
            for index_col in range(X_train_ranked_transf.shape[1])
        ]
        self.slope_list = [slope for slope, _ in params]
        self.intercept_list = [intercept for _, intercept in params]
        return self

    def find_model_params(self, X_col, y_col):
        """Return ``(slope, intercept)`` of the line for one feature column.

        The slope is ``cor * std(y_col) / std(X_col)`` and the intercept is
        ``mean(y_col) - slope * mean(X_col)``, where ``cor`` is the
        correlation coefficient of the two vectors.  When ``abs(cor)`` is
        not strictly greater than ``self.threshold`` the result is
        ``(None, None)``; that includes a NaN correlation, because a
        comparison with NaN is false.
        """
        calc_cor = np.corrcoef(X_col, y_col)[0, 1]
        if abs(calc_cor) > self.threshold:
            calc_slope = calc_cor * np.std(y_col) / np.std(X_col)
            calc_intercept = np.mean(y_col) - calc_slope * np.mean(X_col)
        else:
            calc_slope = None
            calc_intercept = None
        return calc_slope, calc_intercept

    def _build_rank_to_target(self):
        """Return a callable mapping scaled target ranks to target values.

        The callable interpolates linearly through the distinct scaled
        training ranks and extrapolates outside their range.  ``None`` is
        returned when the interpolator cannot be constructed; the error is
        printed and the caller falls back to the training mean.
        """
        # Tied targets share one rank; keep the first occurrence of each so
        # the interpolation nodes are distinct.
        unique_ranks, first_indexes = np.unique(
            self.y_train_ranked_transf, return_index=True)
        try:
            return interpolate.interp1d(
                unique_ranks, self.y_train[first_indexes],
                fill_value="extrapolate")
        except Exception as e:
            # Deliberately best-effort: predictions degrade to the mean.
            print(e)
            return None

    @staticmethod
    def _ranks_to_targets(rank_to_target, y_ranked, fallback):
        """Map predicted scaled ranks to targets, or return ``fallback``.

        ``fallback`` is used when no interpolator is available, when
        evaluating it raises, or when any mapped value is NaN.
        """
        if rank_to_target is None:
            return fallback
        try:
            calc_y = rank_to_target(y_ranked)
        except Exception as e:
            print(e)
            return fallback
        if np.isnan(calc_y).any():
            return fallback
        return calc_y

    def predict(self, X):
        """Predict targets for the rows of ``X``.

        Parameters
        ----------
        X : array-like, two-dimensional, with the same number of columns
            as the matrix passed to ``fit``

        Returns
        -------
        numpy.ndarray
            Mean over feature columns of the per-feature predictions.
        """
        # The rows of X are ranked among themselves and those ranks are
        # scaled with their own statistics (not the training ones).  A fresh
        # scaler gives the same numbers as refitting preprocessor1 would,
        # without overwriting the scaler fitted on the training data.
        X_test_ranked_transf = self._make_rank_scaler().fit_transform(
            ss.rankdata(X, axis=0))

        # Everything below up to the loop is identical for every column.
        default_slope = self._mean_or_zero(self.slope_list)
        default_intercept = self._mean_or_zero(self.intercept_list)
        rank_to_target = self._build_rank_to_target()
        fallback = np.full(
            X_test_ranked_transf.shape[0], np.mean(self.y_train))

        pred_y_list = []
        for index_col in range(X_test_ranked_transf.shape[1]):
            # Features without their own line borrow the average line of
            # the features that have one.
            calc_slope = self.slope_list[index_col]
            if calc_slope is None:
                calc_slope = default_slope
            calc_intercept = self.intercept_list[index_col]
            if calc_intercept is None:
                calc_intercept = default_intercept

            calc_y_ranked = (
                calc_slope * X_test_ranked_transf[:, index_col] + calc_intercept)
            pred_y_list.append(
                self._ranks_to_targets(rank_to_target, calc_y_ranked, fallback))

        # Average the per-feature predictions row by row.
        pred_y = np.mean(np.array(pred_y_list), axis=0)
        return pred_y





# Number of cross-validation folds used for every sub-sample
n_splits = 3

# Where the result table is meant to go; the file name carries today's date
out_dir = "/scratch/da2343/cs685fall22/data"
out_file = out_dir + f'/my_algos_{str(date.today())}_results.csv'

# Look up the csv path of the selected data set and load it
# (the first line of the file holds the column names)
dataset_path = dataset_dict[data_set_name]
dataset_pd = pd.read_csv(dataset_path, header=0)

# Candidate correlation thresholds for SpearmanRankRegressor, one
# single-value parameter grid per candidate: a coarse grid on [0, 0.4],
# a fine grid on [0.41, 0.6] and a coarse grid from 0.7 upwards
threshold_grid = np.concatenate((
    np.linspace(0, 0.4, 5),
    np.linspace(0.41, 0.6, 21),
    np.arange(0.7, 1.01, 0.1),
))
threshold_param_dict = [{'threshold': [threshold]}
                        for threshold in threshold_grid]

# Learners that are compared, keyed by the name written to the result table.
# MyPearsonRegressor, LassoCV(random_state=1) and GaussianGraphicalModel
# were listed here as well but are currently switched off.
learner_dict = {}
learner_dict["Featureless"] = Featureless()
learner_dict['Spearman'] = SpearmanRankRegressor()


# One single-row DataFrame per (sub-sample, fold, learner) combination
test_err_list = []

# Shuffle the data set once (fixed seed) and cut it into consecutive
# sub-samples of n_sub_samples rows each; rows that do not fill a complete
# sub-sample are dropped.
n_sections = int(np.floor(dataset_pd.shape[0] / n_sub_samples))
if n_sections < 1:
    raise ValueError(
        "dataset has %d rows, fewer than the requested sub-sample size %s"
        % (dataset_pd.shape[0], n_sub_samples))
shuffled_df = dataset_pd.sample(frac=1, random_state=1)
total_samples = n_sub_samples * n_sections
shuffled_df_updated = shuffled_df.iloc[:total_samples, :]
# Positional slicing instead of np.split: np.split on a DataFrame goes
# through DataFrame.swapaxes, which pandas has deprecated.
shuffled_arr = [
    shuffled_df_updated.iloc[start:start + n_sub_samples, :]
    for start in range(0, total_samples, n_sub_samples)
]

# The splitter is seeded with a fixed integer, so one instance can be reused
# for every sub-sample.
k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for ss_index, sub_section in enumerate(shuffled_arr):
    # The column at index_of_pred_col is the output to predict;
    # all remaining columns form the input matrix.
    output_vec = sub_section.iloc[:, index_of_pred_col].to_numpy().ravel()
    input_mat = sub_section.drop(
        sub_section.columns[index_of_pred_col], axis=1).to_numpy()

    for fold_id, indices in enumerate(k_fold.split(input_mat)):
        index_dict = dict(zip(["train", "test"], indices))
        set_data_dict = {
            set_name: {"X": input_mat[index_vec], "y": output_vec[index_vec]}
            for set_name, index_vec in index_dict.items()
        }

        # Fit every learner on the train split and record its mean squared
        # error on the test split.
        for learner_name, learner in learner_dict.items():
            learner.fit(**set_data_dict["train"])
            pred_y = learner.predict(set_data_dict["test"]["X"])
            actual_y = set_data_dict["test"]["y"]

            mse = mean_squared_error(actual_y, pred_y)

            test_err_list.append(pd.DataFrame({
                "Mean Squared Error": mse,
                "FoldID": fold_id,
                "# of Total Samples": n_sub_samples,
                "Index of Subsample": ss_index,
                "Dataset": data_set_name,
                "Index of Predicted Column": index_of_pred_col,
                "Algorithm": learner_name,
            }, index=[0]))

main_test_err_df = pd.concat(test_err_list)
print(main_test_err_df)


len(sys.argv)=11 so trying first param
   Mean Squared Error  FoldID  # of Total Samples  Index of Subsample  \
0            1.307276       0                  10                   0   
0            1.223761       0                  10                   0   
0            0.989038       1                  10                   0   
0            0.971309       1                  10                   0   
0            0.463605       2                  10                   0   
0            0.988257       2                  10                   0   
0            0.583977       0                  10                   1   
0            0.524363       0                  10                   1   
0            0.146721       1                  10                   1   
0            0.540605       1                  10                   1   
0            2.217096       2                  10                   1   
0            2.208465       2                  10                   1   
0           