# A project in bioinformatics

## Part 1

In [26]:
from collections import Counter
from collections import defaultdict

import os
# Current working directory of the running process; both input files are looked up here.
WORKINNG_DIR = os.getcwd()

# Paths of the plasmid and bacteria word files, built by joining the file names onto WORKINNG_DIR.
WORDS_PLASMID_TXT = os.path.join(WORKINNG_DIR, r"cog_words_plasmid.txt")
WORDS_BAC_TXT = os.path.join(WORKINNG_DIR, r"cog_words_bac.txt")


### 1.1. Find cog problem: Finding the genomes in which the cog appears
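The function goes over every genome row in the data and checks whether the cog appears in it. For each row that contains the cog, the words around the cog are collected.

- Input: data (a list of genome rows), HashMap of words, cog.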

In [27]:
# go over every row of the DB and, where the cog appears, collect its words
def find_cog(data, cog_map, cog):
    """Collect the words around ``cog`` for every row of ``data`` that contains it.

    Args:
        data: iterable of rows; cells from index 5 onwards are compared to ``cog``.
        cog_map: mapping that ``find_words`` updates in place.
        cog: the value to look for in each row.

    Returns:
        None. Results are accumulated in ``cog_map``.
    """
    for row in data:
        # index of the first cell (from column 5 on) equal to the cog, or None
        hit = next((idx for idx in range(5, len(row)) if cog == row[idx]), None)
        if hit is not None:
            # one call per row is enough, even if the cog occurs more than once
            find_words(cog_map, row, hit)

### 1.2. Find Words: Finding all of cog's neighbors
The function divides the genome into segments of length l (for every l from 2 to 10) and checks whether the cog appears in each segment.
If so, it puts the segment (a word) in the HashMap.

- Input: HashMap of words, genome, index of cog.

In [28]:
# find all words (cog + neighbours) in a specific row
def find_words(cog_map, row, i, min_len=2, max_len=10):
    """Record every window of ``row`` that contains the value stored at ``row[i]``.

    Every contiguous window of ``min_len`` to ``max_len`` cells, starting at
    column 5 (the index of the first cog in the row), is turned into a tuple.
    A window is recorded when one of its cells equals ``row[i]``.

    Args:
        cog_map: dict mapping word (tuple) -> ``[number_of_appearances, set_of_organisms]``;
            updated in place.
        row: one parsed row; ``row[3]`` is the organism's name.
        i: index in ``row`` of the cog being searched for.
        min_len: shortest word length to consider (default 2).
        max_len: longest word length to consider, inclusive (default 10).

    Returns:
        None.
    """
    first_cog_column = 5
    row_len = len(row)
    for word_len in range(min_len, max_len + 1):
        # the range below is empty when the row has fewer than word_len cog cells,
        # so no separate length check is needed
        for start in range(first_cog_column, row_len - word_len + 1):
            word = tuple(row[start:start + word_len])  # tuple is hashable
            if row[i] not in word:
                continue
            # the stored list is mutated in place; a new word starts from [0, set()]
            entry = cog_map.setdefault(word, [0, set()])
            entry[0] += 1
            entry[1].add(row[3])

### 1.3. bigger_than_q: Find words that are conserved in ≥ 𝑞 of the genomes
The function receives a map of all the words and, for each word, checks whether the size of its group of organisms is at least q.
If so, it puts the word in a counter together with that size.

- Input: HashMap of words, q, counter that saves the words and the number of genomes in which each word appears.

In [29]:
# keep only the words that were seen in at least q different organisms
def bigger_than_q(cog_map, q, counter):
    """Copy into ``counter`` every word whose organism set has at least ``q`` members.

    Args:
        cog_map: mapping word -> value whose element at index 1 is the set of organisms.
        q: minimal number of distinct organisms required.
        counter: mapping updated in place; ``counter[word]`` is set to the
            size of the word's organism set.

    Returns:
        None.
    """
    for word, entry in cog_map.items():
        n_organisms = len(entry[1])
        if n_organisms >= q:
            counter[word] = n_organisms


### 1.4. sort_output: Sort the words in the counter by length and q
The function receives a counter, groups its words into sub-lists according to the length of each word, and orders each sub-list by decreasing number of genomes in which the word was found.

- Input: counter
- Output: a dictionary that maps each word length to the list of words of that length, ordered by decreasing q

In [30]:
def sort_output(counter):
    """Group the words of ``counter`` by their length.

    Args:
        counter: a ``Counter`` mapping word (tuple) -> number of genomes.

    Returns:
        A ``defaultdict(list)`` mapping word length -> list of words of that
        length, each list in the order produced by ``counter.most_common()``
        (highest count first).
    """
    by_length = defaultdict(list)
    # most_common() yields the words from the highest count to the lowest,
    # so every per-length list ends up in descending order as well
    for word, _count in counter.most_common():
        by_length[len(word)].append(word)
    return by_length

### main

In [40]:
# read and parse both DB files, then print the words that contain the cog and appear in at least q organisms
def _read_rows(path):
    """Read one DB file and return its lines as lists of byte-string cells."""
    with open(path, "rb") as handle:
        raw_lines = handle.readlines()  # read content as lines

    rows = []
    for line in raw_lines:
        # split by tab (\t) and drop the last cell
        # NOTE(review): this assumes every line ends with a tab followed by the
        # new line, so the dropped cell holds only the new line - confirm against the data
        cells = line.split(b"\t")[:-1]
        # split the first cell by hashtag (#) and put its parts in front of the remaining cells
        rows.append(cells[0].split(b"#") + cells[1:])
    return rows


def main(cog, q):
    """Print the words that contain ``cog`` and appear in at least ``q`` organisms.

    The plasmid file is processed first and the bacteria file second; both
    feed the same map, so the counts cover the two DBs together.

    Args:
        cog: the cog to look for, as bytes (the files are read in binary mode).
        q: minimal number of different organisms a word has to appear in.

    Returns:
        None. The grouped words are printed.
    """
    # word (a tuple) -> [number_of_appearances, set_of_organisms]
    cog_map = {}
    # word -> size of the set of organisms the word appears in
    counter = Counter()

    for db_path in (WORDS_PLASMID_TXT, WORDS_BAC_TXT):
        find_cog(_read_rows(db_path), cog_map, cog)  # add the words found in this DB

    bigger_than_q(cog_map, q, counter)  # keep the words that appear in at least q organisms
    print(sort_output(counter))  # words grouped by length

In [41]:
cog = b"0121"  # the cog to search for; bytes, because the DB files are opened in binary mode
main(cog, 20)  # print the words that contain the cog and appear in at least 20 organisms

defaultdict(<class 'list'>, {2: [(b'0279', b'0121'), (b'0121', b'4301'), (b'1262', b'0121'), (b'X', b'0121'), (b'0121', b'X')], 3: [(b'1262', b'0121', b'4301'), (b'3572', b'1262', b'0121'), (b'0121', b'4301', b'0520')], 4: [(b'3572', b'1262', b'0121', b'4301'), (b'1262', b'0121', b'4301', b'0520')], 5: [(b'3572', b'1262', b'0121', b'4301', b'0520')]})
