**Author:** Zhanyuan Zhang  
**Purpose:** This notebook counts the frequency of each motif in each region for all 24 species.  
**Note:** For demonstration and debugging purposes, this notebook only shows the results for 20 regions.  

In [1]:
import os, re, pickle, pandas

In [2]:
# Folders holding the region files, the pickled motif word lists, and the results.
data_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/test/"
words_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/jaspar_pwm/words/"
output_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/test/results/"

# Every entry of the data folder except the "results" entry is an input file.
all_files = [entry for entry in os.listdir(data_path) if entry != "results"]
words_files = os.listdir(words_path)

# A word-list file is named <words_filename_prefix><motif>.
words_filename_prefix = "words_generated_by_"
motifs = ["bcd", "cad", "gt", "hb"]


In [3]:
# Define a bunch of helper functions

def get_region_id(filename):
	"""
	Return the first run of uppercase letters and/or digits found in `filename`.

	Raises IndexError when `filename` contains no such run.

	>>> get_region_id('outlier_rm_with_length_VT55793.fa')
	'VT55793'
	"""
	uppercase_digit_runs = re.findall("[A-Z0-9]+", filename)
	return uppercase_digit_runs[0]

def detect_title(region_id, line):
	"""
	Tell whether the pattern `region_id` occurs anywhere in `line`.

	`region_id` may be a pattern string or a compiled regular expression.

	>>> region_id = re.compile("VT42806")
	>>> detect_title(region_id, '>VT42806|1|dkik|-|2561')
	True
	"""
	first_hit = re.search(region_id, line)
	return first_hit is not None

def get_species_id(title):
	"""
	Return the third "|"-separated field of a title line (the species id).

	Raises IndexError when `title` has fewer than three fields.

	>>> get_species_id(">VT42806|1|dkik|-|2561")
	'dkik'
	"""
	# Plain str.split is used instead of a regex: the separator is a literal
	# character, and this avoids an invalid escape sequence in a string literal.
	return title.split("|")[2]

class Sequence():
	"""
	Pairs a species id with its sequence for a given region.

	name: species id
	seq: sequence of the species for a given region
	"""
	def __init__(self, name, seq):
		self.name = name
		self.seq = seq

	def get_name(self):
		"""Return the species id."""
		# `self` was missing from the signature, so instance calls raised TypeError.
		return self.name

	def get_seq(self):
		"""Return the sequence."""
		return self.seq
    

In [4]:
# Load every motif's word list once, before touching the region files: the
# lists do not depend on the file being processed, so re-reading the pickles
# inside the file loop only repeats the same disk I/O.
# NOTE(review): pickle.load can execute arbitrary code; only point words_path
# at files from a trusted source.
words_by_motif = {}
for motif in motifs:
    words_filename = words_filename_prefix + motif
    with open(words_path + words_filename, "rb") as fo:
        words_by_motif[motif] = pickle.load(fo, encoding = "bytes")

# Pattern of a title line such as ">VT42806|1|dkik|-|2561". A raw string is
# used so the escaped pipes reach the regex engine without invalid-escape warnings.
# NOTE(review): the second field is matched as exactly one character; confirm
# that this holds for every input file.
title_pattern = re.compile(r">[A-Za-z0-9]*\|[A-Za-z0-9]\|[A-Za-z0-9]*\|.\|[0-9]*")

for file in all_files:
    region_id = get_region_id(file) # Get the region id of the current file

    file_path = data_path + file
    # The context manager closes the handle even if reading fails.
    with open(file_path, "r") as handle:
        data = handle.read()

    data_for_species = re.split("\n+", data) # Split by lines to find the list of species

    # One species id per title line, in file order.
    all_species = [
        get_species_id(line)
        for line in data_for_species
        if detect_title(region_id, line)
    ]

    assert len(all_species) == 24

    # Split by title to extract the sequence of each species; empty pieces
    # (e.g. the text before the first title) are dropped, newlines removed.
    data_for_sequence = title_pattern.split(data)
    data_for_sequence = [re.sub("\n", "", seq.upper()) for seq in data_for_sequence if seq != ""]

    # Species and sequences are paired by position, so the two lists must line
    # up; fail loudly instead of pairing a species with the wrong sequence.
    assert len(data_for_sequence) == len(all_species), (
        "Found " + str(len(all_species)) + " title lines but "
        + str(len(data_for_sequence)) + " sequences in " + file
    )

    Seq_list = [ # A list of Sequence objects
        Sequence(name, seq) for name, seq in zip(all_species, data_for_sequence)
    ]

    # NOTE(review): the column name "special_id" looks like a typo for
    # "species_id"; kept as-is because it appears in the printed output.
    result = pandas.DataFrame(all_species, columns = ["special_id"])
    for motif in motifs:
        words = words_by_motif[motif]
        # For each species: total number of matches, summed over all words.
        # re.findall counts non-overlapping matches of each word.
        result[motif] = [
            sum(len(re.findall(w, sequence.seq)) for w in words)
            for sequence in Seq_list
        ]

    print("Result for " + file)
    print(result)
    print("\n")

Result for outlier_rm_with_length_VT1592.fa
   special_id  bcd  cad  gt  hb
0    MEMB002A    0    0   0   0
1    MEMB002C    0    0   0   0
2    MEMB002F    0    0   0   0
3    MEMB004A    0    0   0   0
4    MEMB005B    1    1   0   0
5    MEMB005D    1    0   0   0
6    MEMB007B    0    0   0   0
7        dkik    0    0   0   0
8    MEMB002B    0    0   0   0
9    MEMB002D    0    0   0   0
10   MEMB002E    0    0   0   1
11   MEMB003A    0    0   0   1
12   MEMB003B    1    0   0   0
13   MEMB003C    0    0   0   0
14   MEMB003D    0    0   0   0
15   MEMB003F    0    0   0   0
16   MEMB004B    1    0   0   0
17   MEMB004E    0    0   0   0
18   MEMB006A    0    0   0   0
19   MEMB006B    0    0   0   0
20   MEMB006C    1    0   0   0
21   MEMB007C    1    0   0   0
22   MEMB007D    1    0   0   0
23   MEMB008C    1    0   0   0


Result for outlier_rm_with_length_VT0870.fa
   special_id  bcd  cad  gt  hb
0    MEMB002E    0    0   0   0
1    MEMB003C    2    0   0   0
2    MEMB003D 

Result for outlier_rm_with_length_VT1404.fa
   special_id  bcd  cad  gt  hb
0    MEMB002A    0    0   0   0
1    MEMB002C    0    0   0   0
2    MEMB002D    0    0   0   0
3    MEMB002F    0    0   0   0
4    MEMB003B    0    0   0   0
5    MEMB003F    0    0   0   0
6    MEMB004A    0    0   0   0
7    MEMB004E    0    0   0   0
8    MEMB006C    0    0   0   0
9    MEMB007B    0    0   0   0
10   MEMB007C    0    0   0   0
11   MEMB007D    0    0   0   0
12       dkik    0    0   0   0
13   MEMB002B    0    0   0   0
14   MEMB002E    0    0   0   0
15   MEMB003A    0    0   0   0
16   MEMB003C    0    0   0   0
17   MEMB003D    0    0   0   0
18   MEMB004B    0    0   0   0
19   MEMB005B    0    0   0   0
20   MEMB005D    0    0   0   0
21   MEMB006A    0    0   0   0
22   MEMB006B    0    0   0   0
23   MEMB008C    0    0   0   0


Result for outlier_rm_with_length_VT1483.fa
   special_id  bcd  cad  gt  hb
0    MEMB002A    0    0   0   1
1    MEMB002C    0    0   0   1
2    MEMB003A 