# Testing notebook

In [1]:
import os 

path = os.getcwd()
# find the string 'project' in the path, return index
index_project = path.find('project')
# slice the path from the index of 'project' to the end
project_path = path[:index_project+7]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')


Project path set to: c:\Github\ode-biomarker-project


# Toolkit Tests

## Feature Transformer

In [2]:
import pandas as pd

# loading cell line proteomic expression data

cancercell2022 = pd.read_csv('data\preprocessed\SY-Processed\CancerCell2022_PRISM.csv')

cancercell2022_dropnan = cancercell2022.dropna(subset=['AUC'])

import DataFunctions as dfunc 

feature_data, label_data = dfunc.create_feature_and_label(cancercell2022_dropnan, label_name='AUC')

feature_data_no_row = feature_data.drop(['Row'], axis=1)

In [7]:
from toolkit import FeatureTransformer
from toolkit import impute_by_zero, impute_by_first_quantile, get_network_stat_features, get_random_features

F = FeatureTransformer()

F.add_transform_function('impute_by_zero', impute_by_zero)
F.add_selection_function('random_select', get_random_features, {"selection_size": 10})

In [6]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, random_state=42)

# Print the shapes of the new X objects

print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}, X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

# Run Feature Transformer 

selected_features, sel_train, sel_test = F.run(X_train, y_train, X_test)

print(selected_features, sel_train.shape, sel_test.shape)  

NameError: name 'feature_data' is not defined

In [None]:
# clear variables in juptyer notebook

%reset -f

## Feature Selection Methods

In [2]:
from sklearn.datasets import make_regression
import pandas as pd

# turn X and Y into dataframes
X, y = make_regression(n_samples=500, n_features=1000, n_informative=10, random_state=1, shuffle=False)

X = pd.DataFrame(X)
y = pd.Series(y)

# turn columns into strings

X.columns = [str(i) for i in range(X.shape[1])]

print(f'Original informative columns: {X.columns[:10]}')

# shuffle columns around for X

X = X.sample(frac=1, axis=1, random_state=0)

print(f'Newly shuffled columns: {X.columns[:10]}')


Original informative columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='object')
Newly shuffled columns: Index(['993', '859', '298', '553', '672', '971', '27', '231', '306', '706'], dtype='object')


In [3]:
from toolkit import mrmr_select_fcq

features, scores = mrmr_select_fcq(X, y, K=10,verbose=True)

  from .autonotebook import tqdm as notebook_tqdm


Iteration 1 selected 6 score 20349023.18055924 remaining 999 features
Iteration 2 selected 913 score 9236.404740325002 remaining 998 features
Iteration 3 selected 762 score 3338.49141856536 remaining 997 features
Iteration 4 selected 5 score 2401.434983185637 remaining 996 features
Iteration 5 selected 7 score 1958.8192067425084 remaining 995 features
Iteration 6 selected 1 score 1322.7038843133394 remaining 994 features
Iteration 7 selected 8 score 952.2757379286545 remaining 993 features
Iteration 8 selected 4 score 903.1707225760754 remaining 992 features
Iteration 9 selected 3 score 481.3610921002683 remaining 991 features
Iteration 10 selected 215 score 204.25794018077414 remaining 990 features


In [4]:
from toolkit import enet_select

features, scores = enet_select(X, y, 10, max_iter=10000, alpha=0.1, l1_ratio=0.7)

print(features)
print(scores)

[465 193 580  29 629 137 524 551 247   7]
[53.5348648  46.46452212 37.03393571 35.39034433 23.38224216 22.14676406
 15.05663697 12.11902955 11.56421631  9.09762054]


In [5]:
from toolkit import rf_select

features, scores = rf_select(X, y, k=10, n_estimators=100, max_depth=5, n_jobs=-1)

print(features)
print(scores)

[465 193 580  29 137 629 524 342  35 790]
[0.29646599 0.2238035  0.1167996  0.06645544 0.01195476 0.00679736
 0.00391873 0.00287384 0.00282981 0.00201532]


In [6]:
from toolkit import f_regression_select

features, scores = f_regression_select(X, y, k=10)
print(features)
print(scores)

Index(['1', '456', '8', '5', '6', '781', '3', '7', '4', '621'], dtype='object')
[ 62.83937464  11.86299028  28.68197294 157.83757371 203.49023181
   9.34440972  16.75903932 100.30555487  40.46596522   8.47049546]


In [8]:
from toolkit import relieff_select

features, scores = relieff_select(X, y, k=10, n_jobs=4)
print(features)
print(scores)


[675  99 765 635   2 307 301 770 328  75]
[0.00149231 0.00147896 0.00141029 0.00137384 0.00129534 0.00123079
 0.00122917 0.00122489 0.00121487 0.00120116]


In [None]:
# clear variables in juptyer notebook

%reset -f

## Selection Functions

## Transforming Functions


# TorchApp Tests