In [None]:
import sys
import os
import pandas as pd
import warnings
import numpy as np
from datetime import date
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
sys.path.append(os.path.abspath("/projects/genomic-ml/da2343/ml_project_1/shared"))
from model_header import *
from constants import *

In [None]:
# ---------------------------------------------------------------------------
# Experiment setup for the regression comparison (Featureless vs. LassoCV).
# ---------------------------------------------------------------------------

# Silence every warning and make numpy print arrays in full instead of
# eliding the middle with "...".
np.set_printoptions(threshold=np.inf)
warnings.filterwarnings("ignore")

# Number of cross-validation folds used by the loop further down.
n_splits = 3

# params.csv holds one experiment per row; `param_row` picks which one to run.
param_row = 0
params_df = pd.read_csv("params.csv")
param_dict = dict(params_df.iloc[param_row, :])
index_of_pred_col = param_dict["Index of Prediction Col"]
data_set_name = param_dict["Dataset"]

# Look up the CSV path for the chosen dataset and load it (first row = header).
# NOTE(review): `dataset_dict` is not defined in this notebook; presumably it
# comes from one of the star-imports (constants / model_header) - confirm.
dataset_path = dataset_dict[data_set_name]
df = pd.read_csv(dataset_path, header=0)

# Learners under comparison, keyed by the name written to the results table.
learner_dict = {
    "Featureless": Featureless(),
    "LassoCV": LassoCV(random_state=1),
}

# Collects one single-row results frame per (fold, learner) pair.
test_err_list = []

# Split the frame into the target vector and the feature matrix: the target is
# the column at position `index_of_pred_col`, the features are everything else.
pred_col_name = df.columns[index_of_pred_col]
input_mat = df.drop(columns=pred_col_name).to_numpy()
output_vec = df.iloc[:, index_of_pred_col].to_numpy().ravel()

In [None]:
# Display the name of the column selected as the prediction target.
pred_col_name

In [None]:
# Shuffled K-fold cross-validation of every learner in `learner_dict`.
# For each (fold, learner) pair the learner is fit on the train split, its
# test-split predictions are scored with mean squared error, and one
# single-row frame is appended to `test_err_list`.

# Total number of rows in the dataset. This name was referenced below but never
# assigned in the notebook, so the cell failed with NameError on a fresh
# kernel; define it here so the cell does not depend on hidden state.
n_samples = input_mat.shape[0]

k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for fold_id, indices in enumerate(k_fold.split(input_mat)):
    # KFold.split yields (train_indices, test_indices) for each fold.
    index_dict = dict(zip(["train", "test"], indices))
    set_data_dict = {}
    for set_name, index_vec in index_dict.items():
        set_data_dict[set_name] = {
            "X": input_mat[index_vec],
            "y": output_vec[index_vec],
        }
    for learner_name, learner in learner_dict.items():
        # Fit the learner to the training data
        learner.fit(**set_data_dict["train"])
        # Predict the test data
        pred_y = learner.predict(set_data_dict["test"]["X"])
        actual_y = set_data_dict["test"]["y"]
        # Calculate the test error
        mse = mean_squared_error(actual_y, pred_y)
        test_err_list.append(
            pd.DataFrame(
                {
                    "Mean Squared Error": mse,
                    "FoldID": fold_id,
                    "# of Total Samples": n_samples,
                    "Dataset": data_set_name,
                    "Index of Predicted Column": index_of_pred_col,
                    "pred_col_name": pred_col_name,
                    "Algorithm": learner_name
                },
                index=[0],
            )
        )
# Stack the per-(fold, learner) rows into one results table.
test_err_df = pd.concat(test_err_list)

In [None]:
# Save dataframe as a csv to output directory
out_file = f"results/{param_row}.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs(os.path.dirname(out_file), exist_ok=True)
test_err_df.to_csv(out_file, encoding="utf-8", index=False)
print("Done!!")

In [None]:
import sys
import os
import pandas as pd
import warnings
import numpy as np
from datetime import date
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.metrics import *
sys.path.append(os.path.abspath("/projects/genomic-ml/da2343/ml_project_1/shared"))
from model_header import *
from constants import *
from sklearn.dummy import *


# ---------------------------------------------------------------------------
# Binary-classification experiment: is the target column > 0 ?
# Compares a most-frequent-class baseline against cross-validated logistic
# regression using shuffled K-fold CV, recording accuracy, AUC and the ROC
# curve points for every (fold, learner) pair.
# ---------------------------------------------------------------------------
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)
params_df = pd.read_csv("params.csv")
param_row = 0
param_dict = dict(params_df.iloc[param_row, :])
data_set_name = param_dict["Dataset"]
index_of_pred_col = param_dict["Index of Prediction Col"]

dataset_path = dataset_dict[data_set_name]
n_splits = 3
# Import the csv file of the dataset
df = pd.read_csv(dataset_path, header=0)
# Regression learners; defined here but not used by the loop in this cell.
reg_learner_dict = {
    "Featureless": Featureless(),
    "LassoCV": LassoCV(random_state=1),
}
classifier_dict = {
    "FeaturelessClassifier": DummyClassifier(strategy="most_frequent"),
    "LogisticRegression": LogisticRegressionCV(random_state=1),
}

test_err_list = []
pred_col_name = df.columns[index_of_pred_col]

# two output vectors
# one will just be the output of the regression
output_vec_for_reg = df.iloc[:, index_of_pred_col].to_numpy().ravel()
# the other is the binary classification target: 1 where the regression
# target is greater than 0, otherwise 0
output_vec_for_class = np.where(output_vec_for_reg > 0, 1, 0)
input_mat = df.drop(pred_col_name, axis=1).to_numpy()


def positive_class_scores(fitted_classifier, X):
    """Return the predicted probability of class 1 for every row of X.

    ROC curves and AUC need a continuous score; feeding them hard 0/1
    predictions collapses the curve to a single operating point.
    If class 1 was never seen during fitting, the probability is 0 everywhere.
    """
    class_labels = list(fitted_classifier.classes_)
    if 1 not in class_labels:
        return np.zeros(X.shape[0])
    return fitted_classifier.predict_proba(X)[:, class_labels.index(1)]


k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for fold_id, indices in enumerate(k_fold.split(input_mat)):
    # KFold.split yields (train_indices, test_indices) for each fold.
    index_dict = dict(zip(["train", "test"], indices))
    set_data_dict = {}
    for set_name, index_vec in index_dict.items():
        set_data_dict[set_name] = {
            "X": input_mat[index_vec],
            "y": output_vec_for_class[index_vec],
        }
    for learner_name, learner in classifier_dict.items():
        # Fit the learner to the training data
        learner.fit(**set_data_dict["train"])
        # Predict the test data (hard labels for accuracy, scores for ROC/AUC)
        pred_y = learner.predict(set_data_dict["test"]["X"])
        score_y = positive_class_scores(learner, set_data_dict["test"]["X"])
        actual_y = set_data_dict["test"]["y"]
        # Calculate the test accuracy
        accuracy = np.mean(pred_y == actual_y)

        fpr, tpr, _ = roc_curve(actual_y, score_y)
        # AUC is undefined when the test fold contains a single class;
        # report NaN instead of letting roc_auc_score fail.
        if len(np.unique(actual_y)) > 1:
            auc = roc_auc_score(actual_y, score_y)
        else:
            auc = np.nan

        test_err_list.append(
            pd.DataFrame(
                {
                    "Test Accuracy": accuracy,
                    "FoldID": fold_id,
                    "Dataset": data_set_name,
                    # Previously this field held the column *name*; store the
                    # index here and the name in its own field, matching the
                    # other experiment cells.
                    "Index of Predicted Column": index_of_pred_col,
                    "Predicted Column Name": pred_col_name,
                    "Algorithm": learner_name,
                    "AUC": auc,
                    # Arrays are stored as strings so each result fits in one cell.
                    "FPR": np.array2string(fpr),
                    "TPR": np.array2string(tpr),
                },
                index=[0],
            )
        )
# Stack the per-(fold, learner) rows into one results table.
test_err_df = pd.concat(test_err_list)

In [None]:
# Display the results table built by the cross-validation loop.
test_err_df

In [None]:
# Serialise `fpr` (whatever the cross-validation loop last assigned to it)
# to its string form, the same way it is stored in the results table.
fpr_string = np.array2string(fpr)
fpr_string

In [None]:
# string to numpy array
fpr = np.fromstring(fpr_string[1:-1], sep=" ")
fpr

In [None]:
# Inspect the Python type of `fpr`.
type(fpr)

In [None]:
# Inspect what container roc_curve returns its results in, using the
# `actual_y` / `pred_y` left over from the cross-validation loop.
output = roc_curve(actual_y, pred_y)
type(output)

In [3]:
import sys
import os
import pandas as pd
import warnings
import numpy as np
from datetime import date
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.metrics import *
sys.path.append(os.path.abspath("/projects/genomic-ml/da2343/ml_project_1/shared"))
from model_header import *
from constants import *
from sklearn.dummy import *

# ---------------------------------------------------------------------------
# Two-stage ("classifier then regressor") experiment.
# Each entry of `classifier_reg_dict` is either a plain regressor
# (classifier is None) or a pair: a classifier deciding whether the target is
# positive, and a regressor fit only on the positive training rows. Per-sample
# actual/predicted values are collected for every (fold, learner) pair.
# ---------------------------------------------------------------------------
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)
params_df = pd.read_csv("params.csv")


# Row of params.csv selecting the (dataset, target column) to run.
param_row = 146

param_dict = dict(params_df.iloc[param_row, :])
data_set_name = param_dict["Dataset"]
index_of_pred_col = param_dict["Index of Prediction Col"]

dataset_path = dataset_dict[data_set_name]
n_splits = 3
# Import the csv file of the dataset
df = pd.read_csv(dataset_path, header=0)

classifier_reg_dict = {
    "Featureless": {
        "classifier": None,
        "regressor": Featureless(),
     },
    "LassoCV": {
        "classifier": None,
        "regressor": LassoCV(random_state=1),
    },
    "LogisticRegLassoCV": {
        "classifier": LogisticRegressionCV(),
        "regressor": LassoCV(random_state=1, cv=3),
    },
}


test_err_list = []
pred_col_name = df.columns[index_of_pred_col]

# two output vectors
# one will just be the output of the regression
output_vec_for_reg = df.iloc[:, index_of_pred_col].to_numpy().ravel()
# the other is the binary target: 1 where the regression target is > 0, else 0
output_vec_for_class = np.where(output_vec_for_reg > 0, 1, 0)
input_mat = df.drop(pred_col_name, axis=1).to_numpy()


def predict_fold(learner, train_data, test_data):
    """Fit one entry of `classifier_reg_dict` on a fold and predict its test set.

    Parameters
    ----------
    learner : dict with keys "classifier" (estimator or None) and "regressor".
    train_data, test_data : dicts with keys "X", "y_class" and "y_reg".

    Returns
    -------
    Array of predicted regression values, one per test row.
    """
    regressor = learner["regressor"]
    classifier = learner["classifier"]

    # Plain regressors never look at the class labels, so the single-class
    # special cases below must not apply to them.
    if classifier is None:
        regressor.fit(train_data["X"], train_data["y_reg"])
        return regressor.predict(test_data["X"])

    is_positive = train_data["y_class"] == 1
    # No positive training rows: nothing to fit the regressor on, and the only
    # class seen is 0, so predict 0 everywhere.
    if not is_positive.any():
        return np.zeros(test_data["X"].shape[0])

    # fit regressor only on rows where y_class is 1
    regressor.fit(train_data["X"][is_positive], train_data["y_reg"][is_positive])
    regressor_pred_y = regressor.predict(test_data["X"])

    # Every training row is positive: a classifier cannot be fit on a single
    # class, and the only class seen is 1, so use the regressor output as is
    # (rather than the constant class label 1).
    if is_positive.all():
        return regressor_pred_y

    classifier.fit(train_data["X"], train_data["y_class"])
    classifier_pred_y = classifier.predict(test_data["X"])
    # Rows classified as 0 get prediction 0; the rest get the regressor output.
    return np.where(classifier_pred_y == 0, 0, regressor_pred_y)


k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for fold_id, indices in enumerate(k_fold.split(input_mat)):
    # KFold.split yields (train_indices, test_indices) for each fold.
    index_dict = dict(zip(["train", "test"], indices))
    set_data_dict = {}
    for set_name, index_vec in index_dict.items():
        set_data_dict[set_name] = {
            "X": input_mat[index_vec],
            "y_class": output_vec_for_class[index_vec],
            "y_reg": output_vec_for_reg[index_vec],
        }
    for learner_name, learner in classifier_reg_dict.items():
        pred_y = predict_fold(learner, set_data_dict["train"], set_data_dict["test"])

        actual_y = set_data_dict["test"]["y_reg"]
        # One row per test sample, tagged with the fold / dataset / learner.
        actual_pred_df = pd.DataFrame(
            {
                "Actual": actual_y,
                "Predicted": pred_y,
            }
        )
        actual_pred_df["FoldID"] = fold_id
        actual_pred_df["Dataset"] = data_set_name
        actual_pred_df["Index of Predicted Column"] = index_of_pred_col
        actual_pred_df["Predicted Column Name"] = pred_col_name
        actual_pred_df["Algorithm"] = learner_name

        test_err_list.append(actual_pred_df)
test_err_df = pd.concat(test_err_list)
# Save dataframe as a csv to output directory
# out_file = f"results/{param_row}.csv"
# test_err_df.to_csv(out_file, encoding="utf-8", index=False)
# print("Done!!")

In [4]:
# Display the per-sample actual vs. predicted table built by the loop.
test_err_df

Unnamed: 0,Actual,Predicted,FoldID,Dataset,Index of Predicted Column,Predicted Column Name,Algorithm
0,2.517898,1.628575,0,ioral,14,Enterococcus,Featureless
1,1.768950,1.628575,0,ioral,14,Enterococcus,Featureless
2,0.778986,1.628575,0,ioral,14,Enterococcus,Featureless
3,0.000000,1.628575,0,ioral,14,Enterococcus,Featureless
4,1.411672,1.628575,0,ioral,14,Enterococcus,Featureless
...,...,...,...,...,...,...,...
23,2.543781,1.227253,2,ioral,14,Enterococcus,LogisticRegLassoCV
24,2.013784,1.802523,2,ioral,14,Enterococcus,LogisticRegLassoCV
25,1.361567,1.660577,2,ioral,14,Enterococcus,LogisticRegLassoCV
26,2.576519,1.712878,2,ioral,14,Enterococcus,LogisticRegLassoCV


In [None]:
# Inspect the dataset that was loaded for this experiment.
df