In [6]:
# Goal of BioCode: create a phylogenetic tree from GenBank data and use the Excel sheet as the overlay

import csv
import pandas as pd
import numpy as np
import sys
import os
import ssdeep as ss
import hashlib,binascii
import tensorflow as tf
import xlsxwriter
from sklearn import preprocessing
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from Bio import SeqIO
from Bio import Phylo
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

class organism:
    """Pairs an organism's name with the hash value computed for it."""

    def __init__(self, name, hashVal):
        # Both values are stored exactly as given; nothing is validated or converted.
        self.name, self.hashVal = name, hashVal

# This class variable is for the list later to contain the 4 letter sequences (Not necessary here)
#class nucleotideSequence:
    
#    def __init__(self, seq):
#        self.seq = seq
    

# This function reads an Excel file into a dataframe, drops columns that are entirely NaN and then drops any row containing NaN (Not necessary here)
# input: file=name of the file, dropColumns=which columns to drop (default is none), debug=print the dataframe when 1 (default is 0)
# return: df=dataframe created from the Excel file
def convertExcelToDataFrame(file, dropColumns=None, debug=0):
    """Load an Excel sheet into a DataFrame and strip out missing data.

    Parameters
    ----------
    file : anything accepted by ``pandas.read_excel`` (typically a path).
    dropColumns : column label or list-like of labels to remove, or None
        (the default) to keep every non-empty column.
    debug : when 1, print the cleaned frame before returning it.

    Returns
    -------
    pandas.DataFrame
        The sheet without all-NaN columns, without ``dropColumns``, and
        without any row that still contains a NaN.
    """
    sheet = pd.read_excel(file)

    # Empty columns go first; otherwise the row-wise dropna below would
    # discard every row because of them.
    cleaned = sheet.dropna(axis="columns", how="all")

    # Identity check, not `!= None`: comparing a pandas Index or NumPy array
    # with `!=` is element-wise and makes the `if` raise
    # "truth value ... is ambiguous".
    if dropColumns is not None:
        cleaned = cleaned.drop(dropColumns, axis=1)

    cleaned = cleaned.dropna(axis=0)

    if debug == 1:
        print(cleaned)

    return cleaned

# This function will take a dataframe and convert the string data into numeric data (Not necessary here)
# Unused function
def oneHotEncodeData(df=None):
    """One-hot encode the "Strain" and "Genome size" columns and append the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame containing at least the "Strain" and "Genome size" columns.
        When omitted, the notebook-level variable ``df`` is used, which is
        what the original body relied on implicitly.

    Returns
    -------
    numpy.ndarray
        The one-hot columns followed by the values of every non-object
        column, joined along axis 1. (The original computed this array but
        discarded it.)

    Raises
    ------
    NameError
        If no frame is passed and no notebook-level ``df`` exists.
    """
    if df is None:
        # Backward compatibility with the zero-argument call.
        if "df" not in globals():
            raise NameError("name 'df' is not defined")
        df = globals()["df"]

    # NOTE(review): if "Genome size" is stored as a number it is also picked
    # up by select_dtypes below and so appears twice - confirm that is intended.
    df_categorical = df[["Strain", "Genome size"]]

    df_numeric = df.select_dtypes(exclude=[object])

    encode = preprocessing.OneHotEncoder(handle_unknown="ignore")
    encode.fit(df_categorical)
    one_hot_labels = encode.transform(df_categorical).toarray()

    return np.concatenate((one_hot_labels, df_numeric.values), axis=1)
    
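# This function reads the given fasta files and computes a hash value for every sequence record in them
# input: files=list of fasta files given, debug=print values (default is 0)
# return: list of organism objects (record name + hash value of its sequence)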
def readFastaFiles(files, debug=0):
    """Hash every sequence record found in the given FASTA files.

    Parameters
    ----------
    files : list
        Paths handed one by one to ``SeqIO.parse(..., "fasta")``.
    debug : when 1, print the list of files before reading.

    Returns
    -------
    list of organism
        One entry per record, holding the record id and the ``ss.hash`` of
        its sequence. If a ValueError or IOError interrupts reading, the
        error is printed and the entries collected so far are returned.

    Raises
    ------
    SystemExit
        If ``files`` is not a list (an error message is printed first).
    """
    if debug == 1:
        print(files)
        print("")

    if not isinstance(files, list):
        print("Error, Expected a list but was given %s" % (type(files) ) )
        sys.exit()

    organisms = []

    try:
        for file in files:
            for record in SeqIO.parse(file, "fasta"):
                # str() yields the whole sequence at once; record.id is
                # already a string, so no character-by-character copy is needed.
                hashVal = ss.hash(str(record.seq))
                organisms.append(organism(record.id, hashVal))

    except ValueError as e:
        # ValueError has no errno/strerror (those belong to OSError), so the
        # original handler itself raised AttributeError here.
        print("Value error: {0}".format(e))
    except IOError as e:
        print("I/O error ({0}): {1}".format(e.errno, e.strerror))

    return organisms


# This function will create a list out of all files within the directory given
def retrieveFastaFiles(folder):
    """Return the paths of the files directly inside ``folder``.

    Parameters
    ----------
    folder : str
        Directory to list, with or without a trailing path separator.

    Returns
    -------
    list of str
        ``folder`` joined with each file name, in sorted name order so the
        result does not depend on the filesystem's listing order.
        Sub-directories (for example ``.ipynb_checkpoints``) are skipped.
    """
    # os.path.join inserts the separator only when `folder` lacks one;
    # plain `folder + name` glued the two together in that case.
    candidates = (os.path.join(folder, entry) for entry in sorted(os.listdir(folder)))
    return [path for path in candidates if os.path.isfile(path)]

# Create an Excel (.xlsx) workbook from each organism's name and the hash value of its genetic data (despite the function name, the output is not XML)
def createXMLFile(organisms, fileName):
    """Write one worksheet row per organism: name in column 0, hash in column 1.

    The file is produced with xlsxwriter, so ``fileName`` names a workbook
    rather than an XML document. No header row is written.

    Parameters
    ----------
    organisms : iterable of objects exposing ``name`` and ``hashVal``.
    fileName : path passed to ``xlsxwriter.Workbook``.
    """
    workbook = xlsxwriter.Workbook(fileName)

    # close() is what finalises the workbook; the finally block makes sure it
    # runs (and the handle is released) even if a write fails part-way.
    try:
        worksheet = workbook.add_worksheet()

        for row, entry in enumerate(organisms):
            worksheet.write(row, 0, entry.name)
            worksheet.write(row, 1, entry.hashVal)
    finally:
        workbook.close()

# Task for building the phylogenetic tree:
# 1) Create a matrix using comparisons of the different hashes DONE
# 2) Numbers will be 0-100, determine which organisms will be connected based off similarity
# 3) Use Scipy from the information gathered to create the tree
    
def identityMatrixOfHashes(dataFrame, debug=0):
    """Compare every hash in the 'Hash Values' column against every other.

    Parameters
    ----------
    dataFrame : frame with a 'Hash Values' column, one hash per row.
    debug : when 1, print the collected hashes and then each matrix row.

    Returns
    -------
    list of list
        Square nested list where entry [i][j] is
        ``ss.compare(hash_i, hash_j)`` for rows i and j of the frame.
    """
    rowCount = dataFrame.shape[0]

    # Pull the hashes out row by row, in frame order.
    hashes = [dataFrame.iloc[position]['Hash Values'] for position in range(rowCount)]

    if debug == 1:
        print(hashes)

    # Every ordered pair is compared, including each hash with itself.
    scores = [[ss.compare(left, right) for right in hashes] for left in hashes]

    if debug == 1:
        for scoreRow in scores:
            print(scoreRow)

    return scores
        
def scipyTree(identityMatrix):
    """Draw two dendrograms of the matrix: Ward linkage, then single linkage.

    Each dendrogram gets its own 25x10 figure; both are displayed by the
    single ``plt.show()`` at the end. Nothing is returned.

    NOTE(review): the square matrix is handed to ``linkage`` as-is, so SciPy
    treats each row as an observation vector rather than as precomputed
    pairwise distances - confirm that this is the intended clustering input.
    """
    for method in ('ward', 'single'):
        clustering = linkage(identityMatrix, method)
        plt.figure(figsize=(25, 10) )
        dendrogram(clustering)

    plt.show()
    

In [9]:
# Input folder and output workbook for this run.
FASTA_DIR = "./allFixedFastaFiles/"
# NOTE(review): placeholder output name - the original call passed no file name at all; adjust as needed.
HASH_WORKBOOK = "./organismHashes.xlsx"

fileList = retrieveFastaFiles(FASTA_DIR)

organisms = readFastaFiles(fileList)

# createXMLFile takes the output file name as a required second argument;
# calling it with only `organisms` raises TypeError.
createXMLFile(organisms, HASH_WORKBOOK)
