# External Programs Integrated with Pub_worm

* Look at external programs and use pub_worm to solve problems
* Look at BioPython's usage and API

# BioPython

In [None]:
import os
from Bio import Entrez
from bs4 import BeautifulSoup

# Configure Entrez. The API key is optional and is read from the environment
# so that it never has to be stored in the notebook.
Entrez.api_key = api_key = os.environ.get('NCBI_API_KEY', None)
Entrez.email = "daniel.higgins@yahoo.com"

# Call Biopython to get the list of databases that are managed by NCBI Entrez
stream = Entrez.einfo()
try:
    result = stream.read()
finally:
    # Release the handle even if reading the response fails.
    stream.close()

# The response is parsed as XML; each database name is the text of a <DbName> tag.
soup = BeautifulSoup(result, "xml")
db_name_tags = soup.find_all('DbName')
db_names = [db_name_tag.get_text(strip=True) for db_name_tag in db_name_tags]
print(db_names)


In [None]:
# Use Biopython's Entrez module to get additional details on each NCBI database.
# NOTE: This is slow because it issues one einfo request per database, one
# after another. (Biopython may also throttle Entrez requests to respect NCBI
# rate limits -- TODO confirm.)
for db_name in db_names:
    stream = Entrez.einfo(db=db_name)
    try:
        record = Entrez.read(stream)
    finally:
        # Always release the handle, even when parsing the response fails.
        stream.close()
    print(record)
    print("="*40)


# LangChain

# Appendix

In [None]:
# Build a networkx graph 

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

def build_legend_map(edge_labels):
    """Assign a small integer id to every distinct edge label.

    Parameters
    ----------
    edge_labels : dict
        Mapping whose values are the (hashable) edge labels; the keys are
        not inspected.

    Returns
    -------
    dict
        ``{label: number}`` where numbers start at 1 and follow the order in
        which each distinct label first appears in ``edge_labels.values()``.
        An empty input yields an empty dict.
    """
    legend_map = {}
    for value in edge_labels.values():
        if value not in legend_map:
            # Ids are contiguous from 1, so the next id is always size + 1.
            # This avoids a placeholder entry that could collide with a real label.
            legend_map[value] = len(legend_map) + 1
    return legend_map

def plot_network_graph(df, seed=None):
    """Draw a labelled, undirected network diagram from an edge-list DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list with 'source', 'target' and 'edge' columns. The 'edge'
        value is stored as an edge attribute and drawn as the edge label.
    seed : int, optional
        Forwarded to ``nx.spring_layout`` so the node placement can be made
        reproducible. The default (None) keeps the layout random.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Create an undirected graph (nx.Graph) from the edge list.
    # NOTE(review): nx.Graph holds one edge per node pair, so if df has several
    # rows for the same pair only one 'edge' label is kept -- confirm whether a
    # MultiGraph is wanted instead.
    G = nx.from_pandas_edgelist(df, 'source', 'target', edge_attr='edge', create_using=nx.Graph())

    # Draw the network diagram on a large canvas (40x20 inches)
    plt.figure(figsize=(40, 20))
    pos = nx.spring_layout(G, seed=seed)  # positions for all nodes
    nx.draw(G, pos, with_labels=True, node_size=4000, node_color='skyblue', font_size=9, font_color='black', edge_color='gray', linewidths=0.5, arrows=False)

    # Add edge labels
    edge_labels = nx.get_edge_attributes(G, 'edge')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

    # Disabled alternative: label edges with short numbers and print a legend.
    # legend_map = build_legend_map(edge_labels)
    # edge_labels_mapped = {}
    # for edge_label in edge_labels:
    #     edge_label_value = edge_labels[edge_label]
    #     map_value = legend_map[edge_label_value]
    #     edge_labels_mapped[edge_label]=map_value
    # print(legend_map)
    # y_pos = 0.0
    # for key, value in legend_map.items():
    #     plt.text(0.0, y_pos, f"{value} = {key}", fontsize=12)
    #     y_pos -= 0.1  # Adjust the y position for the next text

    plt.show()


In [None]:
# Plot only the edges whose label is one of a hand-picked set of classes.
# NOTE: 'output/test.csv' is written by the graph-building cell further down
# in this notebook, so that cell must have been run at least once before this one.
edge_names = [
    'Phenols',
    'Lactones',
    'Organic carbonic acids and derivatives',
    '6-oxopurines',
]

# Load the full edge list, keep the matching rows, save them and draw them.
df = pd.read_csv('output/test.csv')
selected_rows = df.loc[df['edge'].isin(edge_names)]
selected_rows.to_csv("output/test1.csv")
plot_network_graph(selected_rows)

In [None]:
import pandas as pd

# Read the slim metabolite table, transpose it (columns become rows) and
# save the transposed copy; the frame is left as the cell's displayed output.
slim_metabolite_df = pd.read_csv("output/slim_metabolite.csv")
slim_metabolite_t_df = slim_metabolite_df.transpose()
slim_metabolite_t_df.to_csv(
    "output/slim_motabolite_t.csv",
    index_label='motabolite',
)
slim_metabolite_t_df

In [None]:
import pandas as pd


# Turn every row of the transposed frame into a single-entry dictionary:
# {row label as str: list of that row's non-null values}
list_of_dicts = [
    {str(row_label): row_values.dropna().tolist()}
    for row_label, row_values in slim_metabolite_t_df.iterrows()
]

# Print the list of dictionaries
print(list_of_dicts)


In [None]:
# Edge labels that are skipped when building the graph.
edges_to_ignore = [
    'Chemical entities',
    'Hydrocarbon derivatives',
    'Organic compounds',
    'Organic oxygen compounds',
    'Organooxygen compounds',
    'Organic oxides',
    'Organic acids and derivatives',
    'Organonitrogen compounds',
    'Organopnictogen compounds',
    'Organic nitrogen compounds',
]
def shares_edge(source_edge, target):
    """Tell whether ``source_edge`` is among the edges of ``target``.

    ``target`` is a dict; only its first key is used, and the value stored
    under that key is the collection searched for ``source_edge``.
    Returns True when it is found, False otherwise.
    """
    first_key = list(target)[0]
    return source_edge in target[first_key]


# Build the edge list: one record per pair of entries that share an edge label
# which is not listed in edges_to_ignore.
graph_list = []
for index, list_of_dict in enumerate(list_of_dicts):
    source_nm = list(list_of_dict.keys())[0]
    source_edges = list_of_dict[source_nm]
    # Compare only against later entries so each pair is visited once.
    targets = list_of_dicts[index+1:]
    for source_edge in source_edges:
        if source_edge in edges_to_ignore:
            continue
        for target in targets:
            if shares_edge(source_edge, target):
                target_nm = list(target.keys())[0]
                graph_list.append({'source': source_nm, 'target': target_nm, 'edge': source_edge})

# Naming the columns explicitly keeps the 'source'/'target'/'edge' header in
# the CSV even when no pair shares an edge (an empty list would otherwise
# produce a frame, and a file, with no columns at all).
graph_df = pd.DataFrame(graph_list, columns=['source', 'target', 'edge'])
graph_df.to_csv("output/test.csv", index=False)

# Last expression of the cell, so the preview is actually displayed.
graph_df.head()