In [1]:
import pandas as pd

In [2]:
# FIMO motif-hit table: tab-separated, header on the first row, '#' lines are skipped.
treatment_fimo = pd.read_csv(
    './TFBS Regression Modeling/trmt_fimo/fimo.tsv',
    sep='\t',
    header=0,
    comment='#',
)
# narrowPeak file: tab-separated with no header row, so columns are addressed by position.
narrowpeak = pd.read_csv(
    './TFBS Regression Modeling/dmel_s2_bampe_q01_peaks.narrowPeak',
    sep='\t',
    header=None,
    comment='#',
)

### Encoding based on Motif Appearances

#### Exploring the data:

How many entries and of what types?

In [3]:
# Summarise the FIMO table: number of rows, column names, non-null counts and dtypes.
treatment_fimo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62205 entries, 0 to 62204
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   motif_id          62205 non-null  object 
 1   motif_alt_id      62205 non-null  object 
 2   sequence_name     62205 non-null  object 
 3   start             62205 non-null  int64  
 4   stop              62205 non-null  int64  
 5   strand            62205 non-null  object 
 6   score             62205 non-null  float64
 7   p-value           62205 non-null  float64
 8   q-value           62205 non-null  float64
 9   matched_sequence  62205 non-null  object 
dtypes: float64(3), int64(2), object(5)
memory usage: 4.7+ MB


How many unique sequences?

In [4]:
# Distinct sequence names, kept in order of first appearance in the FIMO table.
sequences = treatment_fimo['sequence_name'].unique().tolist()
len(sequences)

2902

Out of the 28,896 total, we have only 2,902 sequences.

How many motifs?

In [5]:
# Distinct motif names, ordered by the number after the last '-' (e.g. 'STREME-12' -> 12).
# The ordering is purely cosmetic: it keeps the motif columns in a readable order.
unique_motifs = sorted(
    treatment_fimo['motif_alt_id'].unique(),
    key=lambda motif_name: int(motif_name.rsplit('-', 1)[-1]),
)
len(unique_motifs)

36

Only 36! We must aggregate this data by both the sequence and then the motifs. Let us create a simple dictionary to aggregate our data:

In [6]:
# Map every sequence to its own fresh {motif: 0} dictionary. Each inner dictionary
# is built from the same motif list, so all of them share one key order.
motif_aggregation = {
    sequence_name: dict.fromkeys(unique_motifs, 0)
    for sequence_name in sequences
}

# The outer dictionary was filled in the order of `sequences`, so its first key
# is simply the first sequence.
first_key = sequences[0]
print(f'{first_key} has a dictionary of: {motif_aggregation[first_key]}')

X:5794883-5797178 has a dictionary of: {'STREME-1': 0, 'STREME-2': 0, 'STREME-3': 0, 'STREME-4': 0, 'STREME-5': 0, 'STREME-6': 0, 'STREME-7': 0, 'STREME-9': 0, 'STREME-10': 0, 'STREME-11': 0, 'STREME-12': 0, 'STREME-13': 0, 'STREME-14': 0, 'STREME-15': 0, 'STREME-16': 0, 'STREME-17': 0, 'STREME-18': 0, 'STREME-19': 0, 'STREME-20': 0, 'STREME-21': 0, 'STREME-22': 0, 'STREME-23': 0, 'STREME-25': 0, 'STREME-29': 0, 'STREME-31': 0, 'STREME-35': 0, 'STREME-36': 0, 'STREME-37': 0, 'STREME-38': 0, 'STREME-41': 0, 'STREME-42': 0, 'STREME-43': 0, 'STREME-44': 0, 'STREME-45': 0, 'STREME-46': 0, 'STREME-47': 0}


Let us fill this dictionary now.

In [7]:
# Count how many times each motif appears in each sequence in one vectorised
# pass, instead of walking the frame row by row with `iterrows()`.
pair_counts = treatment_fimo.groupby(['sequence_name', 'motif_alt_id']).size()

# Assign (rather than increment) the counts so that re-running this cell gives
# the same result instead of double-counting every hit. Pairs that never occur
# keep the 0 they were initialised with.
for (sequence_name, motif_name), appearances in pair_counts.items():
    # Cast to a plain Python int so the printed dictionary shows bare integers.
    motif_aggregation[sequence_name][motif_name] = int(appearances)

print(f'{first_key} has a dictionary of: {motif_aggregation[first_key]}')

X:5794883-5797178 has a dictionary of: {'STREME-1': 0, 'STREME-2': 0, 'STREME-3': 0, 'STREME-4': 0, 'STREME-5': 0, 'STREME-6': 0, 'STREME-7': 0, 'STREME-9': 1, 'STREME-10': 1, 'STREME-11': 0, 'STREME-12': 30, 'STREME-13': 0, 'STREME-14': 1, 'STREME-15': 0, 'STREME-16': 0, 'STREME-17': 0, 'STREME-18': 0, 'STREME-19': 0, 'STREME-20': 0, 'STREME-21': 0, 'STREME-22': 1, 'STREME-23': 0, 'STREME-25': 0, 'STREME-29': 0, 'STREME-31': 2, 'STREME-35': 0, 'STREME-36': 0, 'STREME-37': 3, 'STREME-38': 0, 'STREME-41': 0, 'STREME-42': 0, 'STREME-43': 1, 'STREME-44': 30, 'STREME-45': 2, 'STREME-46': 0, 'STREME-47': 0}


Now let us create a pandas DataFrame from these "rows" of counts we made. Remember! Since Python 3.7 dictionaries are guaranteed to preserve insertion order (in CPython 3.6 this was only an implementation detail), and as such the key order for all of our dictionaries is the same.

In [8]:
# Every inner dictionary was created from the same motif list, so the keys of
# any one of them give the column order for all rows.
column_names = list(motif_aggregation[first_key])

# One row per sequence: the sequence name followed by its per-motif counts.
all_rows = [
    [sequence_name, *motif_counts.values()]
    for sequence_name, motif_counts in motif_aggregation.items()
]

count_dataframe = pd.DataFrame(all_rows, columns=['sequence_name'] + column_names)
print(f'Final shape of dataframe: {count_dataframe.shape}')
count_dataframe.head(2)

Final shape of dataframe: (2902, 37)


Unnamed: 0,sequence_name,STREME-1,STREME-2,STREME-3,STREME-4,STREME-5,STREME-6,STREME-7,STREME-9,STREME-10,...,STREME-36,STREME-37,STREME-38,STREME-41,STREME-42,STREME-43,STREME-44,STREME-45,STREME-46,STREME-47
0,X:5794883-5797178,0,0,0,0,0,0,0,1,1,...,0,3,0,0,0,1,30,2,0,0
1,3L:17576814-17578210,0,0,0,16,1,2,0,2,1,...,0,1,0,0,1,1,6,1,2,0


Engineering the narrowpeak file in order to add it to the above dataframe

In [9]:
# Build "<chromosome>:<start>-<end>" identifiers from the first three narrowPeak
# columns so they can be matched against the FIMO `sequence_name` values.
sequence_name_generated = [
    f'{chromosome.strip()}:{peak_start}-{peak_end}'
    for chromosome, peak_start, peak_end in zip(
        narrowpeak.iloc[:, 0],
        narrowpeak.iloc[:, 1],
        narrowpeak.iloc[:, 2],
    )
]

In [10]:
# Columns follow this format: https://genome.ucsc.edu/FAQ/FAQformat.html#format12
# Column index 4 (0-indexed) is the score, which we use as the peak height.
narrowpeak_only_height_name = pd.DataFrame({
    'sequence_name': sequence_name_generated,
    'peak_height': narrowpeak.iloc[:, 4],
})
narrowpeak_only_height_name.head(3)

Unnamed: 0,sequence_name,peak_height
0,211000022278091:232-1074,36
1,211000022278785:515-1338,350
2,211000022278925:77-768,123


Joining these dataframes based on sequence_name

In [11]:
# Index the peak heights by sequence name so they can be looked up from the
# `sequence_name` column of the count table.
peak_height_by_sequence = narrowpeak_only_height_name.set_index('sequence_name')

# Default (left) join: every row of the count table is kept and gains a
# `peak_height` column.
joined_dataframe = count_dataframe.join(peak_height_by_sequence, on='sequence_name')
joined_dataframe.head(3)

Unnamed: 0,sequence_name,STREME-1,STREME-2,STREME-3,STREME-4,STREME-5,STREME-6,STREME-7,STREME-9,STREME-10,...,STREME-37,STREME-38,STREME-41,STREME-42,STREME-43,STREME-44,STREME-45,STREME-46,STREME-47,peak_height
0,X:5794883-5797178,0,0,0,0,0,0,0,1,1,...,3,0,0,0,1,30,2,0,0,48
1,3L:17576814-17578210,0,0,0,16,1,2,0,2,1,...,1,0,0,1,1,6,1,2,0,654
2,2R:15744192-15745896,1,0,2,0,0,0,0,0,1,...,1,2,0,0,1,10,0,0,0,443


In [12]:
# NOTE(review): `functions` is a project-local module whose source is not shown
# in this notebook; consider moving this import to the top import cell.
from functions import get_linear_regression_matrix

# Presumably rebuilds the same motif-count + peak_height table as the manual
# steps above (the displayed head matches `joined_dataframe`) — TODO confirm.
joined_df_function = get_linear_regression_matrix()
joined_df_function.head(3)

Unnamed: 0,sequence_name,STREME-1,STREME-2,STREME-3,STREME-4,STREME-5,STREME-6,STREME-7,STREME-9,STREME-10,...,STREME-37,STREME-38,STREME-41,STREME-42,STREME-43,STREME-44,STREME-45,STREME-46,STREME-47,peak_height
0,X:5794883-5797178,0,0,0,0,0,0,0,1,1,...,3,0,0,0,1,30,2,0,0,48
1,3L:17576814-17578210,0,0,0,16,1,2,0,2,1,...,1,0,0,1,1,6,1,2,0,654
2,2R:15744192-15745896,1,0,2,0,0,0,0,0,1,...,1,2,0,0,1,10,0,0,0,443
