# Sequence Extraction from HoloBee Database v2016.1

Import python packages

In [None]:
import os
import re
import Bio.SeqIO as sio
import pandas as pd

In the call to *os.listdir()*, set `path` to the folder containing the **.fasta** files *(HB_Mop_v2016.1.fasta, HB_Bar_v2016.1.fasta)*. The later cells open these files by name only, so run the notebook from that same folder.

In [None]:
# List every entry in the data folder. os.listdir returns bare file names
# (no directory prefix), so later cells resolve them against the working directory.
input_filenames = os.listdir(path = 'C:/Users/vishwakarmas/Downloads/HB_v2016.1/HB_v2016.1')
# Keep only names ending in ".fasta" (case-insensitive). The dot is escaped so it
# matches a literal "." rather than any character (e.g. "notes_fasta").
input_name = [input_file for input_file in input_filenames if re.match(r"^.*\.fasta$", input_file, flags = re.IGNORECASE)]
print(input_name)

A *for* loop iterates over the files that end with **.fasta**, reads each record with *Bio.SeqIO.parse()*, and extracts the sequence ID, description, and sequencing information from each record's description line.

In [None]:
def split_seq(sequence):
    """Split ``sequence`` on every "|" and return the resulting list of fields."""
    fields = sequence.split("|")
    return fields

def split_comma(sequence):
    """Split the last element of ``sequence`` on commas and return the pieces.

    ``sequence`` must be non-empty; an empty one raises IndexError.
    """
    last_field = sequence[-1]
    return last_field.split(",")

In [None]:
for input_file in input_name:
    # Open the file ourselves so the handle is always closed, even if parsing
    # fails, instead of relying on a close() method of the parser object.
    with open(input_file) as fasta_handle:
        seq_de = [record.description for record in sio.parse(fasta_handle, "fasta")]

    # Take the text after the last "|" of each description and split it on commas:
    # the first piece becomes the Description column, the second the Sequencing column.
    info_type = [split_comma(split_seq(description)) for description in seq_de]
    seq_info = [fields[0].strip() for fields in info_type]

    # str.split always returns at least one element, so a missing second piece
    # is the only special case; it is recorded as "-".
    seq_seq = [fields[1].strip() if len(fields) > 1 else "-" for fields in info_type]

    # The sequence ID is the part of the description before the first space.
    seq_id = [description.split(" ")[0] for description in seq_de]

    df = pd.DataFrame(data = {"Sequence_ID": seq_id, "Description": seq_info, "Sequencing": seq_seq})

    # Output name = input name up to its first "." plus a fixed suffix,
    # e.g. "HB_Bar_v2016.1.fasta" -> "HB_Bar_v2016_Sequence_Info_Jupyter.csv".
    output_name = input_file.split(".")[0] + "_Sequence_Info_Jupyter.csv"
    print(output_name)

    df.to_csv(output_name, sep = ',', index = False)

# Using plotly

Import packages

In [None]:
import os
import re
import pandas as pd
import plotly.express as px

Find the files that were created from the above data manipulation

In [None]:
# List every entry in the data folder (bare file names, no directory prefix).
file_path = os.listdir(path = 'C:/Users/vishwakarmas/Downloads/HB_v2016.1/HB_v2016.1/')
# Keep the CSV summaries. The optional "_Jupyter" part lets the pattern match the
# "*_Sequence_Info_Jupyter.csv" files written by the extraction loop in this notebook
# as well as plain "*_Sequence_Info.csv" files; the dot before "csv" is escaped so it
# only matches a literal ".".
files = [input_file for input_file in file_path if re.match(r"^.*_Sequence_Info(_Jupyter)?\.csv$", input_file, flags = re.IGNORECASE)]
print(files)

Select a csv file from the output above to read.

In [None]:
# Load the CSV to visualise; replace the name with one of the file names printed above.
# NOTE(review): the extraction loop earlier in this notebook writes files ending in
# "_Sequence_Info_Jupyter.csv", while this name ends in "_Sequence_Info.csv" — confirm
# that this file exists in the working directory.
vis_info = pd.read_csv("HB_Bar_v2016_Sequence_Info.csv")

Split the *Description* column into 3 elements and concatenate the first two elements. This creates a brief overview of the species in the .csv file. Add the output to the *vis_info* dataframe.

In [None]:
# Group label = first two space-separated words of the description.
# Joining a slice copes with one-word descriptions (indexing [1] would raise
# IndexError), and missing descriptions — which are not lists after the split —
# are passed through unchanged instead of raising TypeError.
vis_info.insert(
    2,
    "Group",
    value = [
        " ".join(words[:2]) if isinstance(words, list) else words
        for words in vis_info['Description'].str.split(" ", n = 2).to_list()
    ],
)

Create a Series with the grouped counts

In [None]:
# Count how many rows carry each "Group" label; the result is a Series indexed by label.
group_counts = vis_info['Group'].value_counts()

Create a plotly figure

In [None]:
# Build an explicit two-column frame so the axis titles come straight from the
# column names. Relying on auto-derived names such as "y" or "index" is fragile:
# from pandas 2.0 on, value_counts names its result and its index differently.
plot_df = pd.DataFrame({
    "Gene Description": group_counts.index,
    "Frequency": group_counts.to_numpy(),
})
fig = px.bar(plot_df, x = "Gene Description", y = "Frequency",
             title="Frequency of Sequences Present",
             height=500)

***The plot may have a large number of x values. To see all of them, use 'pan'.***

In [None]:
# Render the bar chart held in `fig`.
fig.show()

# View table

Import packages

In [None]:
import plotly
import plotly.figure_factory as ff

Create a new dataframe, *group_df*, by moving the index of the *group_counts* **Series** into a new column.

In [None]:
# Turn the counts Series into a frame, then move the group labels out of the
# index into an ordinary column so they appear in the table.
counts_frame = group_counts.to_frame()
group_df = counts_frame.reset_index()
# Build a plotly table figure from the two-column frame.
table = ff.create_table(group_df)

**View the table.**

In [None]:
# Display the table figure in the notebook output.
plotly.offline.iplot(table, filename='jupyter-table1')

To view in alphabetical order, run the following:

In [None]:
# Sort on the first column, which holds the group labels that reset_index() moved
# out of the index. It is looked up by position because its name depends on the
# pandas version ("index" on older releases, "Group" from pandas 2.0 on).
table_sorted = ff.create_table(group_df.sort_values(by = [group_df.columns[0]]))

In [None]:
# Display the alphabetically sorted table figure in the notebook output.
plotly.offline.iplot(table_sorted, filename='jupyter-table1')