In [None]:
import sys
import os
import pandas as pd
import warnings
import numpy as np
from datetime import date
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
sys.path.append(os.path.abspath("/projects/genomic-ml/da2343/ml_project_1/shared"))
from model_header import *
from constants import *

In [None]:
# ---------------------------------------------------------------------------
# Experiment setup: global options, parameter row, dataset and learners.
# ---------------------------------------------------------------------------
# Silence all warnings and make numpy print arrays without truncation.
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)

# Number of cross-validation folds and the row of params.csv to run.
n_splits = 3
param_row = 0

# One row of params.csv describes one experiment.
params_df = pd.read_csv("params.csv")
param_dict = dict(params_df.iloc[param_row])
data_set_name = param_dict["Dataset"]
n_samples = param_dict["# of Total Samples"]
index_of_pred_col = param_dict["Index of Prediction Col"]

# `dataset_dict` is not defined in this notebook; presumably it comes from
# one of the star imports (`constants` / `model_header`) -- TODO confirm.
dataset_path = dataset_dict[data_set_name]
# Load the dataset CSV, using its first line as the header.
df = pd.read_csv(dataset_path, header=0)

# Learners to compare, keyed by the name reported in the results table.
# `Featureless` is likewise expected to come from the star imports.
learner_dict = {}
learner_dict["Featureless"] = Featureless()
learner_dict["LassoCV"] = LassoCV(random_state=1)

# Accumulates one single-row DataFrame per (fold, learner) pair.
test_err_list = []

# Target = the column at `index_of_pred_col`; features = every other column.
pred_col_name = df.columns[index_of_pred_col]
output_vec = np.ravel(df.iloc[:, index_of_pred_col].to_numpy())
input_mat = df.drop(columns=pred_col_name).to_numpy()

In [None]:
# Inspect the shape of the target vector as a quick sanity check.
output_vec.shape

In [None]:
# Evaluate every learner in `learner_dict` with shuffled K-fold
# cross-validation and record one row of test mean squared error per
# (fold, learner) pair in `test_err_df`.

# Start from an empty list so that re-running this cell does not append
# duplicate rows on top of results left over from a previous execution.
test_err_list = []

k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for fold_id, indices in enumerate(k_fold.split(input_mat)):
    # `indices` is the (train_indices, test_indices) pair for this fold.
    index_dict = dict(zip(["train", "test"], indices))
    set_data_dict = {}
    for set_name, index_vec in index_dict.items():
        set_data_dict[set_name] = {
            "X": input_mat[index_vec],
            "y": output_vec[index_vec],
        }
    for learner_name, learner in learner_dict.items():
        # Fit on the training split; the "X"/"y" keys are passed as keywords.
        learner.fit(**set_data_dict["train"])
        # Predict the held-out split and score it.
        pred_y = learner.predict(set_data_dict["test"]["X"])
        actual_y = set_data_dict["test"]["y"]
        mse = mean_squared_error(actual_y, pred_y)
        test_err_list.append(
            pd.DataFrame(
                {
                    "Mean Squared Error": mse,
                    "FoldID": fold_id,
                    "# of Total Samples": n_samples,
                    "Dataset": data_set_name,
                    "Index of Predicted Column": index_of_pred_col,
                    "pred_col_name": pred_col_name,
                    "Algorithm": learner_name
                },
                index=[0],
            )
        )
# Stack the single-row frames into one results table.
test_err_df = pd.concat(test_err_list)

In [None]:
# Save the results table as a CSV file in the output directory.
out_file = f"results/{param_row}.csv"
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists first (no error if it is already there).
os.makedirs(os.path.dirname(out_file), exist_ok=True)
test_err_df.to_csv(out_file, encoding="utf-8", index=False)
print("Done!!")

In [None]:
import sys
import os
import pandas as pd
import warnings
import numpy as np
from datetime import date
from sklearn.linear_model import *
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
sys.path.append(os.path.abspath("/projects/genomic-ml/da2343/ml_project_1/shared"))
from model_header import *
from constants import *
from sklearn.dummy import *


# ---------------------------------------------------------------------------
# Binary-classification experiment: predict whether the target column is
# positive, and report the test accuracy of each classifier per fold.
# ---------------------------------------------------------------------------
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)
params_df = pd.read_csv("params.csv")
param_row = 0
param_dict = dict(params_df.iloc[param_row, :])
data_set_name = param_dict["Dataset"]
index_of_pred_col = param_dict["Index of Prediction Col"]

# `dataset_dict` is not defined in this notebook; presumably it comes from
# one of the star imports (`constants` / `model_header`) -- TODO confirm.
dataset_path = dataset_dict[data_set_name]
n_splits = 3
# Import the csv file of the dataset
df = pd.read_csv(dataset_path, header=0)
# Regression learners; defined here but not used by the loop below.
reg_learner_dict = {
    "Featureless": Featureless(),
    "LassoCV": LassoCV(random_state=1),
}
# Classifiers evaluated below, keyed by the name reported in the results.
classifier_dict = {
    "FeaturelessClassifier": DummyClassifier(strategy="most_frequent"),
    "LogisticRegression": LogisticRegressionCV(random_state=1),
}

test_err_list = []
pred_col_name = df.columns[index_of_pred_col]

# Two output vectors are derived from the target column:
# the raw values, as used for regression ...
output_vec_for_reg = df.iloc[:, index_of_pred_col].to_numpy().ravel()
# ... and a binary label for classification: 1 where the corresponding
# element of output_vec_for_reg is greater than 0, otherwise 0.
output_vec_for_class = np.where(output_vec_for_reg > 0, 1, 0)
input_mat = df.drop(pred_col_name, axis=1).to_numpy()


k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for fold_id, indices in enumerate(k_fold.split(input_mat)):
    # `indices` is the (train_indices, test_indices) pair for this fold.
    index_dict = dict(zip(["train", "test"], indices))
    set_data_dict = {}
    for set_name, index_vec in index_dict.items():
        set_data_dict[set_name] = {
            "X": input_mat[index_vec],
            "y": output_vec_for_class[index_vec],
        }
    for learner_name, learner in classifier_dict.items():
        # Fit on the training split; the "X"/"y" keys are passed as keywords.
        learner.fit(**set_data_dict["train"])
        # Predict the held-out split and compute the fraction of exact matches.
        pred_y = learner.predict(set_data_dict["test"]["X"])
        actual_y = set_data_dict["test"]["y"]
        accuracy = np.mean(pred_y == actual_y)
        test_err_list.append(
            pd.DataFrame(
                {
                    "Test Accuracy": accuracy,
                    "FoldID": fold_id,
                    "Dataset": data_set_name,
                    # Store the integer index under the "Index" label and the
                    # column name under its own label, consistent with the
                    # other result tables in this notebook.
                    "Index of Predicted Column": index_of_pred_col,
                    "Predicted Column Name": pred_col_name,
                    "Algorithm": learner_name
                },
                index=[0],
            )
        )
# Stack the single-row frames into one results table.
test_err_df = pd.concat(test_err_list)

In [7]:
import sys
import os
import pandas as pd
import warnings
import numpy as np
from datetime import date
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.metrics import *
sys.path.append(os.path.abspath("/projects/genomic-ml/da2343/ml_project_1/shared"))
from model_header import *
from constants import *
from sklearn.dummy import *

# ---------------------------------------------------------------------------
# Two-stage experiment: a classifier decides whether the target is positive
# and a regressor supplies the value; wherever the classifier predicts 0 the
# final prediction is forced to 0. Test MSE is reported per (fold, learner).
# ---------------------------------------------------------------------------
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)
params_df = pd.read_csv("params.csv")

# if len(sys.argv) == 2:
#     prog_name, task_str = sys.argv
#     param_row = int(task_str)
# else:
#     print("len(sys.argv)=%d so trying first param" % len(sys.argv))
#     param_row = 0

param_row = 0
param_dict = dict(params_df.iloc[param_row, :])
data_set_name = param_dict["Dataset"]
index_of_pred_col = param_dict["Index of Prediction Col"]

# `dataset_dict` is not defined in this notebook; presumably it comes from
# one of the star imports (`constants` / `model_header`) -- TODO confirm.
dataset_path = dataset_dict[data_set_name]
n_splits = 3
# Import the csv file of the dataset
df = pd.read_csv(dataset_path, header=0)

# Each learner is a (classifier, regressor) pair, keyed by the name reported
# in the results table.
classifier_reg_dict = {
    "Featureless": {
        "classifier": DummyClassifier(strategy="most_frequent"),
        "regressor": Featureless(),
    },
    "LogisticRegLassoCV": {
        "classifier": LogisticRegressionCV(random_state=1),
        "regressor": LassoCV(random_state=1),
    }
}

test_err_list = []
pred_col_name = df.columns[index_of_pred_col]

# Two output vectors are derived from the target column:
# the raw values for the regressor ...
output_vec_for_reg = df.iloc[:, index_of_pred_col].to_numpy().ravel()
# ... and a binary label (1 where the value is > 0, else 0) for the classifier.
output_vec_for_class = np.where(output_vec_for_reg > 0, 1, 0)
input_mat = df.drop(pred_col_name, axis=1).to_numpy()


k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for fold_id, indices in enumerate(k_fold.split(input_mat)):
    # `indices` is the (train_indices, test_indices) pair for this fold.
    index_dict = dict(zip(["train", "test"], indices))
    set_data_dict = {}
    for set_name, index_vec in index_dict.items():
        set_data_dict[set_name] = {
            "X": input_mat[index_vec],
            "y_class": output_vec_for_class[index_vec],
            "y_reg": output_vec_for_reg[index_vec],
        }
    train_set = set_data_dict["train"]
    test_set = set_data_dict["test"]
    # Classes present in the training labels of this fold.
    train_classes = np.unique(train_set["y_class"])

    for learner_name, learner in classifier_reg_dict.items():
        if len(train_classes) == 1:
            # A classifier cannot be trained on a single class, so use the
            # only class seen in training as the class prediction for every
            # test row. Only the classifier step is replaced: the class label
            # itself must not be used as the regression prediction.
            classifier_pred_y = np.full(
                test_set["y_class"].shape, train_classes[0]
            )
        else:
            classifier = learner["classifier"]
            classifier.fit(train_set["X"], train_set["y_class"])
            classifier_pred_y = classifier.predict(test_set["X"])

        regressor = learner["regressor"]
        regressor.fit(train_set["X"], train_set["y_reg"])
        regressor_pred_y = regressor.predict(test_set["X"])
        # Where the classifier predicts 0 the final prediction is 0;
        # otherwise the regressor's prediction is kept.
        pred_y = np.where(classifier_pred_y == 0, 0, regressor_pred_y)

        actual_y = test_set["y_reg"]
        test_err_list.append(
            pd.DataFrame(
                {
                    "Mean Squared Error": mean_squared_error(actual_y, pred_y),
                    "FoldID": fold_id,
                    "Dataset": data_set_name,
                    "Index of Predicted Column": index_of_pred_col,
                    "Predicted Column Name": pred_col_name,
                    "Algorithm": learner_name
                },
                index=[0],
            )
        )
# Stack the single-row frames into one results table.
test_err_df = pd.concat(test_err_list)
# Save dataframe as a csv to output directory
# out_file = f"results/{param_row}.csv"
# test_err_df.to_csv(out_file, encoding="utf-8", index=False)
# print("Done!!")

In [8]:
# Display the cross-validation results table.
test_err_df

Unnamed: 0,Mean Squared Error,FoldID,Dataset,Index of Predicted Column,Predicted Column Name,Algorithm
0,1.238438,0,Dec22_all_power,0,Bacillus,Featureless
0,1.022148,0,Dec22_all_power,0,Bacillus,LogisticRegLassoCV
0,0.964928,1,Dec22_all_power,0,Bacillus,Featureless
0,0.948843,1,Dec22_all_power,0,Bacillus,LogisticRegLassoCV
0,0.826767,2,Dec22_all_power,0,Bacillus,Featureless
0,0.643524,2,Dec22_all_power,0,Bacillus,LogisticRegLassoCV


In [9]:
# Display the loaded dataset.
df

Unnamed: 0,Bacillus,Bradyrhizobium,Burkholderia,Cellvibrio,Chitinophaga,Flavobacterium,Gp16,Gp6,Kaistia,Labrys,...,Mortierella,Mucor,Ovicillium,Penicillium,Phialocephala,Russula,Tomentella,Trichoderma,Umbelopsis,Wilcoxina
0,-1.051310,1.303894,-0.424502,-0.662478,-1.105309,-0.965418,1.026884,1.416732,-0.755748,-1.218715,...,0.311410,-0.387194,-0.546835,1.342957,-0.155797,0.862388,0.497154,-0.207413,0.961926,-0.387296
1,-1.051310,1.268659,-1.388676,-0.662478,-1.105309,-0.212046,1.440550,1.504009,-0.755748,-0.489037,...,-0.992687,-0.387194,-0.546835,1.583523,0.383227,1.368293,-0.752548,-1.424537,1.369599,-0.387296
2,-0.239649,1.200597,-1.388676,-0.662478,-1.105309,-0.965418,1.380000,1.474802,-0.755748,-0.489037,...,-1.208840,-0.387194,-0.546835,1.073953,-0.887091,1.253182,-1.283232,-1.424537,-0.872391,-0.387296
3,1.408456,1.249316,0.384082,-0.662478,-1.105309,-0.965418,1.203273,1.463288,-0.755748,-1.218715,...,0.179118,-0.387194,-0.546835,1.352706,-0.887091,1.688586,0.712848,-0.748688,1.028229,-0.387296
4,-0.340355,1.373472,-0.156500,-0.662478,-1.105309,0.137532,1.468894,1.514489,-0.755748,-0.489037,...,-1.208840,-0.387194,-0.546835,0.813602,0.836798,1.596822,0.590149,-1.424537,0.770741,-0.387296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,-0.152223,-1.093004,1.087809,-0.662478,0.524933,-0.965418,-0.724456,-0.705626,-0.755748,-0.118212,...,-0.223561,-0.387194,-0.546835,-0.922557,-0.887091,-0.893952,0.781099,1.268884,-0.872391,-0.387296
65,-1.493826,-0.570572,1.125368,-0.662478,0.394902,-0.965418,-0.724456,-0.705626,-0.755748,-1.218715,...,-1.569465,-0.387194,-0.546835,0.179539,1.413702,-0.893952,-1.283232,1.175973,-0.872391,2.567993
66,-1.493826,-0.113313,1.679608,-0.662478,-0.106708,-0.965418,-0.724456,-0.705626,-0.755748,0.713844,...,1.228625,-0.387194,-0.546835,-0.169313,1.435482,0.607643,1.415446,0.839282,-0.872391,-0.387296
67,-0.459163,-1.093004,-1.388676,1.041395,-0.106708,-0.965418,-0.724456,-0.705626,1.683835,1.160502,...,0.154034,2.431741,-0.546835,-0.922557,1.651623,0.524723,0.318935,0.617299,0.622225,-0.387296
