# Create data sets for testing flowml

In [91]:
import numpy as np
import pandas as pd
import os
import json
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split


## Create data directory

In [92]:
# Output locations, both relative to the current working directory:
# <cwd>/data and <cwd>/data/results. Both paths are printed and the
# directories are created if they are missing.
path_to_data = os.path.join(os.getcwd(), "data")
print(path_to_data)
path_to_results = os.path.join(path_to_data, "results")
print(path_to_results)

# exist_ok=True keeps the cell safe to re-run and removes the race between a
# separate isdir() check and the directory creation.
os.makedirs(path_to_data, exist_ok=True)
os.makedirs(path_to_results, exist_ok=True)

/home/malkusch/code/flowml/data


## Define simulation parameters

In [93]:
# Settings shared by the simulation and export cells below.
n_samples = 150
n_features = 25
n_informative = 10
shuffle_indicator = True
coef_indicator = True
test_size = 0.33
random_state = 42

# Feature labels are numbered from 1 ("gene_01", ...), while sample labels
# are numbered from 0 ("sample_000", ...).
gene_names = ["gene_%02i" % number for number in range(1, n_features + 1)]
sample_ids = ["sample_%03i" % index for index in range(n_samples)]

## Simulate regression data

In [101]:
# Simulate the regression problem from the settings defined above. Three
# values are unpacked, which relies on coef_indicator being True so that the
# coefficients are returned alongside X and y.
simulation_settings = {
    "n_samples": n_samples,
    "n_features": n_features,
    "n_informative": n_informative,
    "shuffle": shuffle_indicator,
    "coef": coef_indicator,
    "random_state": random_state,
}
X, y, coefs = make_regression(**simulation_settings)

# One row per sample: the feature columns first, then the response and the
# sample identifier.
reg_data_df = pd.DataFrame(X, columns=gene_names).assign(
    response=y,
    sample_id=sample_ids,
)

# One row per feature, pairing each feature name with its coefficient.
reg_coef_df = pd.DataFrame({"features": gene_names}).assign(coef=coefs)

## Save results

In [102]:
# Write the simulated data table and the coefficient table as CSV files into
# the data directory, using the default to_csv options.
path_to_reg_data = f"{path_to_data}/reg_data.csv"
path_to_reg_coef = f"{path_to_data}/reg_coef.csv"

reg_data_df.to_csv(path_to_reg_data)
reg_coef_df.to_csv(path_to_reg_coef)

In [96]:
# Split the feature names by coefficient size and write each group as a
# plain list (one name per line, no header, no index).
coef_above_cutoff = reg_coef_df["coef"] > 20
coef_at_or_below_cutoff = reg_coef_df["coef"] <= 20

# Features whose coefficient is greater than 20.
path_to_reg_features = f"{path_to_data}/reg_features.txt"
features_above = reg_coef_df.loc[coef_above_cutoff, ["features"]]
features_above.to_csv(path_to_reg_features, header=False, index=False)

# Features whose coefficient is 20 or less.
path_to_reg_features_ext = f"{path_to_data}/reg_features_extended.txt"
features_remaining = reg_coef_df.loc[coef_at_or_below_cutoff, ["features"]]
features_remaining.to_csv(path_to_reg_features_ext, header=False, index=False)

In [97]:
# Split the sample positions 0..n_samples-1 into a training and a test part,
# then write the matching sample ids to one text file per part
# (one id per line, no header, no index).
all_positions = np.arange(n_samples)
train_idx, test_idx = train_test_split(
    all_positions,
    test_size=test_size,
    random_state=random_state,
)

sample_id_array = np.array(sample_ids)

path_to_reg_train = f"{path_to_data}/reg_samples_train.txt"
train_samples = pd.DataFrame(sample_id_array[train_idx])
train_samples.to_csv(path_to_reg_train, header=False, index=False)

path_to_reg_test = f"{path_to_data}/reg_samples_test.txt"
test_samples = pd.DataFrame(sample_id_array[test_idx])
test_samples.to_csv(path_to_reg_test, header=False, index=False)


In [104]:
# Configuration for the regression test fit, written as JSON next to the
# data files. All settings are stored as strings, including the numeric
# ones; key order is kept so the serialized text does not change.
cross_validation_settings = {
    "method": "repeatedcv",
    "fold": "5",
    "repeats": "10",
    "grid.library": "NULL",
    "tune.grid": "none",
    "tune.length": "10",
}

bootstrap_settings = {
    "n.resamples": "10",
    "n.permutations": "100",
    "strata.var": "NULL",
}

interpret_settings = {
    "n.repeats": "100",
}

config_dict = {
    "fit.id": "testFitRegression",
    "ml.sampleID": "sample_id",
    "ml.type": "regression",
    "ml.response": "response",
    "ml.method": "pls",
    "ml.preprocess": ["center", "scale"],
    "ml.seed": "42",
    "ml.cv": cross_validation_settings,
    "ml.bootstrap": bootstrap_settings,
    "ml.interpret": interpret_settings,
    "note": "I like machine learning",
}

path_to_reg_config = f"{path_to_data}/reg_config.json"
with open(path_to_reg_config, 'w') as config_file:
    config_file.write(json.dumps(config_dict))