**Author:** Zhanyuan Zhang  
**Purpose:** This notebook counts the frequency of each type of motif in each region for 24 species.  
**Note:**  
   * For demonstration and debugging purposes, this notebook only shows the result of 20 regions. 
   * This notebook works on 20 .fa files in a folder I created locally called *test*, which is separate from *3.24_species_only*, the original folder. One may need to take this extra step before using this notebook to avoid touching all the 3,543 .fa files in *3.24_species_only*.

In [1]:
import os, re, pickle, pandas

In [2]:
# path of the FASTA files
data_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/test/"

# path of words generated by motifs' pwm
words_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/jaspar_pwm/words/"

# Keep only the FASTA (.fa) inputs. Filtering on the extension also drops the
# "results" entry and any other stray entry in the folder that the per-file
# processing could not handle.
all_files = [name for name in os.listdir(data_path) if name.endswith(".fa")]

# Each motif's word list is stored in a file named
# "words_generated_by_[replace with motif name]"; this is the shared prefix.
words_filename_prefix = "words_generated_by_"
motifs = ["bcd", "cad", "gt", "hb"]



In [3]:
# Define a bunch of helper functions

def get_region_id(filename):
    """
    Return the first run of upper-case letters and/or digits in *filename*.

    Raises IndexError when *filename* contains no such run.

    >>> get_region_id('outlier_rm_with_length_VT55793.fa')
    'VT55793'
    """
    uppercase_or_digit_runs = re.findall("[A-Z0-9]+", filename)
    first_run = uppercase_or_digit_runs[0]
    return first_run

def detect_title(region_id, line):
    """
    Tell whether the pattern *region_id* matches anywhere in *line*.

    *region_id* may be a pattern string or a compiled pattern.

    >>> region_id = re.compile("VT42806")
    >>> detect_title(region_id, '>VT42806|1|dkik|-|2561')
    True
    """
    first_match = re.search(region_id, line)
    if first_match is None:
        return False
    return True

def get_indicator(title):
    """
    Return the second "|"-separated field of a FASTA title line.

    Raises IndexError when *title* has fewer than two fields.

    >>> get_indicator(">VT42806|1|dkik|-|2561")
    '1'
    """
    # A plain str.split avoids the invalid "\|" escape of a non-raw regex
    # literal and gives the same fields for a literal "|" separator.
    return title.split("|")[1]

def get_species_id(title):
    """
    Return the third "|"-separated field of a FASTA title line.

    Raises IndexError when *title* has fewer than three fields.

    >>> get_species_id(">VT42806|1|dkik|-|2561")
    'dkik'
    """
    # A plain str.split avoids the invalid "\|" escape of a non-raw regex
    # literal and gives the same fields for a literal "|" separator.
    return title.split("|")[2]

In [4]:
# Cache of motif name -> word list, so each pickled word list is read from
# disk once instead of once per (file, motif) pair.
words_by_motif = {}

for file in all_files:
    region_id = get_region_id(file)  # Get the region id of the current file

    file_path = data_path + file
    # Read through a context manager so the file handle is always closed.
    with open(file_path, "r") as fasta_file:
        data = fasta_file.read()

    data_for_species = re.split("\n+", data)  # Split by lines to find the list of species

    # Collect the species id and indicator from every title line of this region.
    all_species = []
    function_indicators = []
    for line in data_for_species:
        if detect_title(region_id, line):
            all_species.append(get_species_id(line))
            function_indicators.append(get_indicator(line))

    assert len(all_species) == 24

    # Split by title to extract the sequence of each species.
    # Raw string: "\|" is not a valid escape in a normal string literal.
    data_for_sequence = re.split(r">[A-Za-z0-9]*\|[A-Za-z0-9]\|[A-Za-z0-9]*\|.\|[0-9]*", data)
    # Capitalize all the letters and remove "\n" in sequences; pieces that are
    # exactly the empty string are dropped.
    all_sequences = [re.sub("\n", "", seq.upper()) for seq in data_for_sequence if seq != ""]

    row_name = [region_id + "." + species_id for species_id in all_species]
    result = pandas.DataFrame(row_name, columns=["region_id.special_id"])
    result["expression"] = function_indicators
    for motif in motifs:
        if motif not in words_by_motif:
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # word files that come from a trusted source.
            with open(words_path + words_filename_prefix + motif, "rb") as fo:
                words_by_motif[motif] = pickle.load(fo, encoding="bytes")
        words = words_by_motif[motif]

        # For each species, total the non-overlapping matches of every word
        # (each word is used as a regex pattern) in that species' sequence.
        # Sequences are looked up by species position so that a missing
        # sequence raises IndexError instead of being silently skipped.
        result[motif] = [
            sum(len(re.findall(w, all_sequences[j])) for w in words)
            for j in range(len(all_species))
        ]

    print("Result for " + file)
    print(result)
    print("\n")

Result for outlier_rm_with_length_VT1592.fa
   region_id.special_id expression  bcd  cad  gt  hb
0       VT1592.MEMB002A          1    0    0   0   0
1       VT1592.MEMB002C          1    0    0   0   0
2       VT1592.MEMB002F          1    0    0   0   0
3       VT1592.MEMB004A          1    0    0   0   0
4       VT1592.MEMB005B          1    1    1   0   0
5       VT1592.MEMB005D          1    1    0   0   0
6       VT1592.MEMB007B          1    0    0   0   0
7           VT1592.dkik          1    0    0   0   0
8       VT1592.MEMB002B          1    0    0   0   0
9       VT1592.MEMB002D          1    0    0   0   0
10      VT1592.MEMB002E          1    0    0   0   1
11      VT1592.MEMB003A          1    0    0   0   1
12      VT1592.MEMB003B          1    1    0   0   0
13      VT1592.MEMB003C          1    0    0   0   0
14      VT1592.MEMB003D          1    0    0   0   0
15      VT1592.MEMB003F          1    0    0   0   0
16      VT1592.MEMB004B          1    1    0   0   0
17

Result for outlier_rm_with_length_VT1404.fa
   region_id.special_id expression  bcd  cad  gt  hb
0       VT1404.MEMB002A          1    0    0   0   0
1       VT1404.MEMB002C          1    0    0   0   0
2       VT1404.MEMB002D          1    0    0   0   0
3       VT1404.MEMB002F          1    0    0   0   0
4       VT1404.MEMB003B          1    0    0   0   0
5       VT1404.MEMB003F          1    0    0   0   0
6       VT1404.MEMB004A          1    0    0   0   0
7       VT1404.MEMB004E          1    0    0   0   0
8       VT1404.MEMB006C          1    0    0   0   0
9       VT1404.MEMB007B          1    0    0   0   0
10      VT1404.MEMB007C          1    0    0   0   0
11      VT1404.MEMB007D          1    0    0   0   0
12          VT1404.dkik          1    0    0   0   0
13      VT1404.MEMB002B          1    0    0   0   0
14      VT1404.MEMB002E          1    0    0   0   0
15      VT1404.MEMB003A          1    0    0   0   0
16      VT1404.MEMB003C          1    0    0   0   0
17

Result for outlier_rm_with_length_VT0850.fa
   region_id.special_id expression  bcd  cad  gt  hb
0           VT0850.dkik          1    1    0   0   0
1       VT0850.MEMB002A          1    1    0   0   0
2       VT0850.MEMB002B          1    1    0   0   0
3       VT0850.MEMB002C          1    1    0   0   0
4       VT0850.MEMB003A          1    0    0   0   0
5       VT0850.MEMB003C          1    1    0   0   0
6       VT0850.MEMB003D          1    0    0   0   0
7       VT0850.MEMB004A          1    1    0   0   0
8       VT0850.MEMB004B          1    1    0   0   0
9       VT0850.MEMB005B          1    0    0   0   0
10      VT0850.MEMB007C          1    0    0   0   0
11      VT0850.MEMB002D          1    0    0   0   0
12      VT0850.MEMB002E          1    0    0   0   0
13      VT0850.MEMB002F          1    1    0   0   0
14      VT0850.MEMB003B          1    0    0   0   0
15      VT0850.MEMB003F          1    0    0   0   0
16      VT0850.MEMB004E          1    0    0   0   0
17