# FantomData

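### Processes the FANTOM data matrix by removing likely false positives (enhancers with no active cell type, or with a length outside the allowed range) and splitting the remaining enhancers into train, validation, and test sets.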

## Imports

In [17]:
import pandas as pd
import numpy as np
import random

## Parameters

In [18]:
# Seed Python's global random generator with 11 so that the shuffle used for
# the train/valid/test split is reproducible on a top-to-bottom run.
random.seed(a=11)

In [19]:
# Fractions of the filtered enhancers to keep for each set
train_ratio = 0.7
valid_ratio = 0.15
test_ratio = 0.15

# Fail fast on inconsistent ratios. The split takes the test set as whatever
# remains after the train and valid slices, so test_ratio is never read there;
# without this check a wrong value would go unnoticed.
if min(train_ratio, valid_ratio, test_ratio) < 0:
    raise ValueError("train_ratio, valid_ratio and test_ratio must be non-negative")
if abs(train_ratio + valid_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError(
        "train_ratio + valid_ratio + test_ratio must equal 1, got "
        f"{train_ratio + valid_ratio + test_ratio}"
    )

In [None]:
# Inclusive bounds on enhancer size (end - start of its coordinates);
# enhancers outside this range are dropped during cleaning.
min_len, max_len = 100, 600

## Files

In [28]:
# Input files
# No trailing slash on data_dir: every path derived from it is written as
# f'{data_dir}/<name>', so a trailing slash would yield '//' in those paths.
data_dir = '../Data/FANTOM'  # Directory where input files are
fantom_file = f'{data_dir}/F5.hg38.enhancers.expression.usage.matrix'  # fantom matrix file
fasta_dir = '../Data/HG38/'  # directory to HG38 chromosomes (plain string: nothing to interpolate)

In [None]:
# Output files: one usage matrix per split, written next to the input matrix.
train_fantom, valid_fantom, test_fantom = (
    f'{data_dir}/{split_name}.usage.matrix'
    for split_name in ('train', 'valid', 'test')
)

# Data

In [29]:
# Load the tab-separated FANTOM matrix, then print its number of rows
# followed by its number of columns (one value per line).
df = pd.read_csv(fantom_file, sep="\t")
print(*df.shape, sep="\n")

63285
1829


# Cleaning the dataframe by removing enhancers that are not active in any cell type

In [30]:
# Per-row total across all columns; a row is kept only if its total is > 0.
row_sum_list = df.sum(axis=1)

# Filter and down-cast in one step.
# NOTE(review): np.ubyte holds 0..255 -- assumes every entry fits; confirm.
df_clean = df.loc[row_sum_list > 0].astype(np.ubyte)

# Rows, then columns, of the cleaned matrix (one value per line).
print(*df_clean.shape, sep="\n")

60215
1829


# Cleaning the dataframe by removing enhancers shorter than 100 or longer than 600

In [31]:
# Note that enhancers in the fantom matrix are marked as... chr{i}:{start}-{end}

def _enhancer_size(label):
    """Return ``end - start`` for a label of the form ``chr{i}:{start}-{end}``.

    The label is split on its last ':' and the coordinate part on its first
    '-', so a '-' inside the chromosome part cannot be mistaken for the
    start/end separator.

    Raises:
        ValueError: if a separator is missing or a coordinate is not an
            integer (instead of silently slicing with a -1 index).
    """
    _, colon, coords = label.rpartition(":")
    start, dash, end = coords.partition("-")
    if not (colon and dash):
        raise ValueError(f"Malformed enhancer label: {label!r}")
    return int(end) - int(start)


# Labels of the enhancers whose size lies within [min_len, max_len], both
# bounds inclusive, in the same order as they appear in df_clean.
keep_list = [
    label
    for label in df_clean.index
    if min_len <= _enhancer_size(label) <= max_len
]

In [32]:
# Number of enhancers that passed the length filter (displayed as the cell result)
len(keep_list)

52789

## Splitting dataset into train, valid, and test

In [33]:
# Shuffle the kept labels in place; the resulting order depends on the state
# of the global `random` generator at this point.
random.shuffle(keep_list)

# Slice boundaries: [0, train_cutoff) -> train, [train_cutoff, valid_cutoff)
# -> valid, and everything from valid_cutoff onwards -> test.
n_kept = len(keep_list)
train_cutoff = int(n_kept * train_ratio)
valid_cutoff = int(n_kept * (train_ratio + valid_ratio))

train_labels = keep_list[:train_cutoff]
valid_labels = keep_list[train_cutoff:valid_cutoff]
test_labels = keep_list[valid_cutoff:]

# Select the rows of each split by label.
df_train = df_clean.loc[train_labels]
df_valid = df_clean.loc[valid_labels]
df_test = df_clean.loc[test_labels]

In [34]:
# Report (rows, columns) for the train, valid and test splits, in that order.
for split_df in (df_train, df_valid, df_test):
    print(split_df.shape)

(36952, 1829)
(7918, 1829)
(7919, 1829)


## Saving

In [35]:
# Write each split to its tab-separated output file.
for split_df, out_path in (
    (df_train, train_fantom),
    (df_valid, valid_fantom),
    (df_test, test_fantom),
):
    split_df.to_csv(out_path, sep="\t")