In [None]:
##### loading and centering data

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the full gene-expression and TF-expression tables.
# Both files are read as tab-separated text, with the first line taken as the column header.
gene_expression_path = '~/Zhang-Lab/Zhang Lab Data/Full data files/Geneexpression (full).tsv'
tf_expression_path = '~/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv'

tsv_options = dict(sep='\t', header=0)
gene_expression = pd.read_csv(gene_expression_path, **tsv_options)
tf_expression = pd.read_csv(tf_expression_path, **tsv_options)

In [None]:
# Train / validation / test partitioning (70% / 10% / 20% of the rows).
# x is the TF-expression table, y is the gene-expression table.
x, y = tf_expression, gene_expression

# Stage 1: hold out 30% of the rows; the remaining 70% becomes the training set.
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)

# Stage 2: divide the held-out rows. test_size=1/3 sends one third of them
# to validation (10% overall); the other two thirds form the test set (20% overall).
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=1/3, random_state=42)

In [None]:
# Convert every split to a NumPy array for mean centering.
# np.asarray is used instead of DataFrame.to_numpy() because each name is
# overwritten with its array form: on a second run of this cell the inputs
# are already ndarrays, which have no .to_numpy() method. np.asarray accepts
# both DataFrames and ndarrays, so the cell can be re-run safely.
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

x_val = np.asarray(x_val)
y_val = np.asarray(y_val)

x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

In [None]:
# Column-wise mean centering (author's note: each gene is a column, each row an instance).
# The column means are computed from the training split only and the same
# offsets are subtracted from the validation and test splits.
# Author's stated motivation:
#   x centering -> better numerical conditioning of the model
#   y centering -> predictions are expressed relative to mean gene expression,
#                  removing absolute-scale effects

# per-column means of the training split
x_train_col_means = np.mean(x_train, axis=0)
y_train_col_means = np.mean(y_train, axis=0)

# training split
x_train_centered = x_train - x_train_col_means
y_train_centered = y_train - y_train_col_means

# validation split (shifted by the training means)
x_val_centered = x_val - x_train_col_means
y_val_centered = y_val - y_train_col_means

# test split (shifted by the training means)
x_test_centered = x_test - x_train_col_means
y_test_centered = y_test - y_train_col_means

In [None]:
# Save the training column means and the centered splits as tab-separated text.
# Every array is written with full double precision ('%.18e').
# NOTE: np.savetxt does not create missing folders, so the sub-directories
# named below must already exist under CENTERED_DATA_DIR.
CENTERED_DATA_DIR = '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Centered Data for RNN training'

# (path relative to CENTERED_DATA_DIR, array to write), in write order
centered_outputs = [
    # column means
    ('Column means/x_train_col_means.tsv', x_train_col_means),
    # the original path lacked the '/' after 'Column means', so this file
    # was written next to the folder instead of inside it
    ('Column means/y_train_col_means.tsv', y_train_col_means),
    # centered training data
    ('Train/x_train_centered.tsv', x_train_centered),
    ('Train/y_train_centered.tsv', y_train_centered),
    # centered test data
    ('Test/x_test_centered.tsv', x_test_centered),
    ('Test/y_test_centered.tsv', y_test_centered),
    # centered validation data
    ('Validation/x_val_centered.tsv', x_val_centered),
    ('Validation/y_val_centered.tsv', y_val_centered),
]

for relative_path, array in centered_outputs:
    np.savetxt(f'{CENTERED_DATA_DIR}/{relative_path}', array, delimiter='\t', fmt='%.18e')
