**Author:** Zhanyuan Zhang  
**Purpose:** Based on *count_motifs.ipynb*, this notebook generates the motif frequency data and writes it to a *.csv* file.  
**Usage:** 
* Modify *data_path*, *words_path*, and *output_file* accordingly.
* Update *motifs* to make sure it contains all the motifs that we need to consider.

In [1]:
import os, re, pickle, pandas

In [2]:
# Directory holding the FASTA files (one file per region).
data_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/3.24_species_only/"

# Directory holding the pickled word lists generated from each motif's PWM.
words_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/jaspar_pwm/words/"

# Destination of the generated CSV.
output_file = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/motif_freq.csv"

# os.listdir returns names in arbitrary order, so sort them to make the row
# order of the output reproducible. The "results" entry is not an input file.
# NOTE(review): every other entry in data_path is treated as a FASTA file --
# keep the directory free of anything else.
all_files = sorted(name for name in os.listdir(data_path) if name != "results")

# Each motif's word list is pickled in a file named
# "words_generated_by_<motif name>" inside words_path.
words_filename_prefix = "words_generated_by_" 
motifs = ["bcd", "cad", "gt", "hb"]

In [3]:
# Define a bunch of helper functions

def get_region_id(filename):
    """
    Return the first run of capital letters and/or digits found in `filename`.

    Raises IndexError when the name contains no such run.

    >>> get_region_id('outlier_rm_with_length_VT55793.fa')
    'VT55793'
    """
    upper_alnum_runs = re.findall("[A-Z0-9]+", filename)
    return upper_alnum_runs[0]

def detect_title(region_id, line):
    """
    Tell whether `line` contains at least one match for `region_id`.

    `region_id` may be a pattern string or a compiled regular expression.

    >>> region_id = re.compile("VT42806")
    >>> detect_title(region_id, '>VT42806|1|dkik|-|2561')
    True
    """
    first_hit = re.search(region_id, line)
    return first_hit is not None

def get_indicator(title):
    """
    Return the second "|"-separated field of a title line.

    Raises IndexError when the title has fewer than two fields.

    >>> get_indicator(">VT42806|1|dkik|-|2561")
    '1'
    """
    # A plain split on "|" avoids the invalid "\|" escape a regex needs here.
    return title.split("|")[1]

def get_species_id(title):
    """
    Return the third "|"-separated field of a title line.

    Raises IndexError when the title has fewer than three fields.

    >>> get_species_id(">VT42806|1|dkik|-|2561")
    'dkik'
    """
    # A plain split on "|" avoids the invalid "\|" escape a regex needs here.
    return title.split("|")[2]

In [4]:
# Output schema: the row label, one count column per motif, then the expression column.
column_names = ["region_id.special_id", *motifs, "expression"]
output_dataframe = pandas.DataFrame(columns=column_names)
output_dataframe

Unnamed: 0,region_id.special_id,bcd,cad,gt,hb,expression


In [5]:
# Every FASTA file is expected to hold this many species; it is also the number
# of index slots reserved for each file in the merged frame.
SPECIES_PER_FILE = 24

# Matches a title such as ">VT42806|1|dkik|-|2561"; splitting a file on it
# leaves one chunk of sequence text per title.
title_pattern = re.compile(r">[A-Za-z0-9]*\|[A-Za-z0-9]\|[A-Za-z0-9]*\|.\|[0-9]*")

# The word lists do not depend on the FASTA file, so load and compile them once
# instead of once per file.
motif_patterns = {}
for motif in motifs:
    with open(words_path + words_filename_prefix + motif, "rb") as fo:
        # NOTE(review): pickle.load can run arbitrary code -- only point
        # words_path at word files you generated yourself.
        words = pickle.load(fo, encoding = "bytes")
    motif_patterns[motif] = [re.compile(w) for w in words]

per_file_results = []
for file_count, file in enumerate(all_files, start=1):
    region_id = get_region_id(file) # Get the region id of the current file

    # Read the whole file and close the handle right away.
    with open(data_path + file, "r") as fasta:
        data = fasta.read()

    # Title lines are the lines that mention the region id; each one carries
    # the species id and the expression indicator.
    all_species = []
    function_indicators = []
    for line in re.split("\n+", data):
        if detect_title(region_id, line):
            all_species.append(get_species_id(line))
            function_indicators.append(get_indicator(line))

    assert len(all_species) == SPECIES_PER_FILE, \
        "%s: expected %d species, found %d" % (file, SPECIES_PER_FILE, len(all_species))

    # Split by title to extract the sequence of each species, then capitalize
    # the letters and drop the newlines; empty chunks are discarded.
    all_sequences = [re.sub("\n", "", seq.upper())
                     for seq in title_pattern.split(data) if seq != ""]

    # Counts are paired with species by position, so a title the pattern fails
    # to split on would silently shift every count; fail loudly instead.
    assert len(all_sequences) == len(all_species), \
        "%s: %d sequences for %d species" % (file, len(all_sequences), len(all_species))

    row_name = [region_id + "." + species_id for species_id in all_species]
    first_row = SPECIES_PER_FILE * (file_count - 1)
    result = pandas.DataFrame(row_name, columns = ["region_id.special_id"],
                              index=list(range(first_row, first_row + SPECIES_PER_FILE))) # Index is for later merging

    for motif in motifs:
        # One column per motif: total number of matches of all its words in each sequence.
        result[motif] = [sum(len(pattern.findall(sequence)) for pattern in motif_patterns[motif])
                         for sequence in all_sequences]
    result["expression"] = function_indicators # Indicator
    per_file_results.append(result)

# Merge once at the end rather than growing the frame inside the loop; this also
# keeps the cell idempotent when it is re-run. With no input files the existing
# output_dataframe is left untouched.
if per_file_results:
    output_dataframe = pandas.concat(per_file_results)

In [6]:
# Write the merged motif counts (index included) to the configured CSV path.
output_dataframe.to_csv(path_or_buf=output_file, encoding='utf-8')