**Authors:** Zhanyuan Zhang and Thomas
**Purpose:** This notebook counts the frequency of each type of motif in each region for 24 species.  
**Note:**  
   * For demonstration and debugging purposes, this notebook shows the results for only 20 regions. 
   * This notebook works on 20 .fa files in a folder I created locally called *test*, which is separate from the original folder, *3.24_species_only*. One may need to take this extra step (creating such a folder) before using this notebook, to avoid processing all 3,543 .fa files in *3.24_species_only*.

In [24]:
import os, re, pickle, pandas

In [25]:
# Directory that holds the FASTA (.fa) files to process.
data_path = "C:/Users/Lanes/ResearchLab/team_neural_network/data/input/test/"

# Directory that holds the pickled word lists generated from the motifs' PWMs.
words_path = "C:/Users/Lanes/Downloads/"

# Keep only the .fa files. Filtering by extension (instead of excluding the
# single name "results") also skips any other stray entry in the directory
# that the per-file loop could not parse.
all_files = [name for name in os.listdir(data_path) if name.lower().endswith(".fa")]

# Each motif's word list is stored in a file named
# "words_generated_by_<motif name>" inside words_path.
words_filename_prefix = "words_generated_by_"
motifs = ["motif_1_fasta.txt"]
print(all_files)


['align_outlier_rm_with_length_VT0809.fa', 'align_outlier_rm_with_length_VT0845.fa', 'align_outlier_rm_with_length_VT0849.fa', 'align_outlier_rm_with_length_VT0850.fa', 'align_outlier_rm_with_length_VT0851.fa', 'align_outlier_rm_with_length_VT0868.fa', 'align_outlier_rm_with_length_VT0870.fa', 'align_outlier_rm_with_length_VT0871.fa', 'align_outlier_rm_with_length_VT0875.fa', 'align_outlier_rm_with_length_VT0877.fa', 'align_outlier_rm_with_length_VT0985.fa', 'align_outlier_rm_with_length_VT1219.fa', 'align_outlier_rm_with_length_VT1298.fa', 'align_outlier_rm_with_length_VT1404.fa', 'align_outlier_rm_with_length_VT1418.fa', 'align_outlier_rm_with_length_VT1419.fa', 'align_outlier_rm_with_length_VT1483.fa', 'align_outlier_rm_with_length_VT1488.fa', 'align_outlier_rm_with_length_VT1592.fa', 'align_outlier_rm_with_length_VT1594.fa', 'align_outlier_rm_with_length_VT1595.fa', 'align_outlier_rm_with_length_VT1600.fa', 'align_outlier_rm_with_length_VT1608.fa', 'align_outlier_rm_with_length_VT1

In [26]:
# Define a bunch of helper functions

def get_region_id(filename):
    """
    Return the region id embedded in a FASTA file name.

    The id is the first run of upper-case letters and/or digits found in
    `filename`. An IndexError is raised if the name contains no such run.

    >>> get_region_id('outlier_rm_with_length_VT55793.fa')
    'VT55793'
    """
    uppercase_or_digit_runs = re.findall("[A-Z0-9]+", filename)
    return uppercase_or_digit_runs[0]

def detect_title(region_id, line):
    """
    Tell whether `line` contains the given region id.

    `region_id` is used as a regular expression; it may be a pattern string
    or an already compiled pattern. Returns True when the pattern matches
    anywhere in `line`, False otherwise.

    >>> region_id = re.compile("VT42806")
    >>> detect_title(region_id, '>VT42806|1|dkik|-|2561')
    True
    """
    return re.search(region_id, line) is not None

def get_indicator(title):
    """
    Return the second '|'-separated field of a FASTA title line.

    An IndexError is raised if `title` has fewer than two fields.

    >>> get_indicator(">VT42806|1|dkik|-|2561")
    '1'
    """
    # A plain str.split avoids the invalid "\|" escape sequence that the
    # regex-based split needed in a non-raw string literal.
    return title.split("|")[1]

def get_species_id(title):
    """
    Return the third '|'-separated field of a FASTA title line.

    An IndexError is raised if `title` has fewer than three fields.

    >>> get_species_id(">VT42806|1|dkik|-|2561")
    'dkik'
    """
    # A plain str.split avoids the invalid "\|" escape sequence that the
    # regex-based split needed in a non-raw string literal.
    return title.split("|")[2]

In [27]:
# Load every motif's word list once, before touching the FASTA files: the
# lists do not depend on the file being processed, so re-reading them inside
# the per-file loop is wasted I/O.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load word files that come from a trusted source.
words_by_motif = {}
for motif in motifs:
    words_filename = words_filename_prefix + motif
    with open(words_path + words_filename, "rb") as fo:
        words_by_motif[motif] = pickle.load(fo, encoding = "bytes")

# Matches a whole title line such as ">VT42806|1|dkik|-|2561". The second
# field is matched as exactly one alphanumeric character and the fourth as
# exactly one arbitrary character.
title_pattern = re.compile(r">[A-Za-z0-9]*\|[A-Za-z0-9]\|[A-Za-z0-9]*\|.\|[0-9]*")

for file in all_files:
    region_id = get_region_id(file) # Get the region id of the current file

    file_path = data_path + file
    # Read the whole file and close the handle right away.
    with open(file_path, "r") as fasta:
        data = fasta.read()

    # Split by lines to find the title line of each species.
    data_for_species = re.split("\n+", data)

    all_species = []
    function_indicators = []
    for line in data_for_species:
        if detect_title(region_id, line):
            all_species.append(get_species_id(line))
            function_indicators.append(get_indicator(line))

    # Every region file is expected to hold one record per species.
    assert len(all_species) == 24, (
        "expected 24 species in " + file + ", found " + str(len(all_species))
    )

    # Split by title to extract the sequence of each species.
    data_for_sequence = title_pattern.split(data)
    # Drop empty pieces (e.g. the text before the first title), then
    # capitalize all the letters and remove "\n" inside each sequence.
    all_sequences = [seq.upper().replace("\n", "") for seq in data_for_sequence if seq != ""]

    row_name = [region_id + "." + species_id for species_id in all_species]
    result = pandas.DataFrame(row_name, columns = ["region_id.special_id"])
    result["expression"] = function_indicators
    for motif in motifs:
        words = words_by_motif[motif]

        # For each species, add up the matches of every word in its sequence.
        # Each word is used as a regular expression, and re.findall counts
        # non-overlapping matches only.
        counts_for_each_species = []
        for j in range(len(all_species)):
            sequence = all_sequences[j]
            counts_for_each_species.append(
                sum(len(re.findall(w, sequence)) for w in words)
            )
        result[motif] = counts_for_each_species

    print("Result for " + file)
    print(result)
    print("\n")

Result for align_outlier_rm_with_length_VT0809.fa
   region_id.special_id expression  motif_1_fasta.txt
0       VT0809.MEMB005D          1                  0
1       VT0809.MEMB006A          1                  0
2       VT0809.MEMB007D          1                  0
3       VT0809.MEMB002F          1                  0
4       VT0809.MEMB007B          1                  0
5       VT0809.MEMB003C          1                  0
6       VT0809.MEMB002C          1                  0
7       VT0809.MEMB002A          1                  0
8       VT0809.MEMB002B          1                  0
9       VT0809.MEMB004A          1                  0
10      VT0809.MEMB002E          1                  0
11      VT0809.MEMB006C          1                  0
12      VT0809.MEMB004E          1                  0
13      VT0809.MEMB004B          1                  0
14      VT0809.MEMB008C          1                  0
15      VT0809.MEMB003F          1                  0
16      VT0809.MEMB005B         

Result for align_outlier_rm_with_length_VT0985.fa
   region_id.special_id expression  motif_1_fasta.txt
0       VT0985.MEMB002F          1                  0
1       VT0985.MEMB002B          1                  0
2       VT0985.MEMB004A          1                  0
3       VT0985.MEMB007B          1                  0
4       VT0985.MEMB003C          1                  0
5       VT0985.MEMB002C          1                  0
6       VT0985.MEMB002A          1                  0
7       VT0985.MEMB007D          1                  0
8       VT0985.MEMB002E          1                  0
9       VT0985.MEMB002D          1                  0
10      VT0985.MEMB005D          1                  0
11      VT0985.MEMB006A          1                  0
12      VT0985.MEMB006B          1                  0
13      VT0985.MEMB003B          1                  0
14      VT0985.MEMB007C          1                  0
15      VT0985.MEMB003A          1                  0
16          VT0985.dkik         

Result for align_outlier_rm_with_length_VT1594.fa
   region_id.special_id expression  motif_1_fasta.txt
0       VT1594.MEMB002D          1                  0
1       VT1594.MEMB007D          1                  0
2       VT1594.MEMB002E          1                  0
3       VT1594.MEMB006C          1                  0
4       VT1594.MEMB004B          1                  0
5       VT1594.MEMB005B          1                  0
6       VT1594.MEMB004E          1                  0
7       VT1594.MEMB008C          1                  0
8       VT1594.MEMB005D          1                  0
9       VT1594.MEMB006A          1                  0
10      VT1594.MEMB003D          1                  0
11          VT1594.dkik          1                  0
12      VT1594.MEMB003A          1                  0
13      VT1594.MEMB003F          1                  0
14      VT1594.MEMB006B          1                  0
15      VT1594.MEMB003B          1                  0
16      VT1594.MEMB007C         

Result for align_outlier_rm_with_length_VT1839.fa
   region_id.special_id expression  motif_1_fasta.txt
0       VT1839.MEMB002F          1                  0
1       VT1839.MEMB002B          1                  0
2       VT1839.MEMB004A          1                  0
3       VT1839.MEMB007B          1                  0
4       VT1839.MEMB002A          1                  0
5       VT1839.MEMB002C          1                  0
6       VT1839.MEMB003C          1                  0
7       VT1839.MEMB007D          1                  0
8       VT1839.MEMB003D          1                  0
9           VT1839.dkik          1                  0
10      VT1839.MEMB003A          1                  0
11      VT1839.MEMB002D          1                  0
12      VT1839.MEMB002E          1                  0
13      VT1839.MEMB007C          1                  0
14      VT1839.MEMB003B          1                  0
15      VT1839.MEMB006A          1                  0
16      VT1839.MEMB005D         