In [22]:
import pandas as pd
import json
from IPython.display import HTML

# Function to read the TSV file
def read_tsv(file_path):
    """Load a tab-separated file into a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the TSV file.

    Returns:
        pandas.DataFrame: The parsed table.

    Raises:
        Exception: Whatever ``pd.read_csv`` raises (for example
            ``FileNotFoundError`` for a missing file) is propagated unchanged.
    """
    # Errors are deliberately not caught here: handing back the error message
    # as a string in place of a DataFrame only postpones the failure to the
    # first column/attribute access on the result and hides the real cause.
    return pd.read_csv(file_path, sep='\t')

# Function to get a row based on article DOI and return specified columns
def get_row_by_doi(df, doi, columns):
    """Return the rows whose 'article doi' equals ``doi``, limited to ``columns``.

    Args:
        df: DataFrame that contains an 'article doi' column.
        doi: DOI value to match exactly.
        columns: A list of column labels, or a single label given as a string.

    Returns:
        For a list of labels: a DataFrame (possibly empty) holding the matching
        rows and only those requested columns that exist in ``df``. Labels that
        are absent from ``df`` are left out instead of raising ``KeyError``, so
        a caller can test ``col in result`` and fall back to a default.
        For a single string label: the matching rows of that column as a Series.

    Raises:
        KeyError: If ``df`` has no 'article doi' column, or if a single string
            label is requested that does not exist.
    """
    matches = df['article doi'] == doi

    if isinstance(columns, str):
        # A single label keeps plain pandas selection semantics.
        return df.loc[matches, columns]

    # Keep the requested order but drop labels the table does not have.
    available = [col for col in columns if col in df.columns]
    return df.loc[matches, available]

# Phenotype columns copied from the database onto every article record
columns = [
    'cell shape',
    'mean length (microns)',
    'mean width (microns)',
    'cell aggregation',
    'gram status',
]

# Location of the phenotype TSV file
file_path = 'IJSEM_pheno_db_v1.0_additional_cleaned.tsv'

# Load the table and list the column labels it provides
df = read_tsv(file_path)
print(df.columns)

# Articles to look up (species name + DOI)
articles = [
    # {"name": "Streptomyces speibonae", "doi": "10.1099/ijs.0.02341-0"},
    # {"name": "Dialister invisus", "doi": "10.1099/ijs.0.02640-0"},
    # {"name": "Fulvimarina gen.", "doi": "10.1099/ijs.0.02644-0"},
    # {"name": "Salinibacterium amurskyense", "doi": "10.1099/ijs.0.02627-0"},

    {"name": "Belliella baltica", "doi": "10.1099/ijs.0.02752-0"},
    {"name": "Hongiella halophila", "doi": "10.1099/ijs.0.02861-0"},
    {"name": "Nitratireductor aquibiodomus", "doi": "10.1099/ijs.0.02793-0"},
    {"name": "Paenibacillus favisporus", "doi": "10.1099/ijs.0.02709-0"},
    {"name": "Nocardia asiatica", "doi": "10.1099/ijs.0.02676-0"},
    {"name": "Sulfitobacter delicatus", "doi": "10.1099/ijs.0.02654-0"},
    {"name": "Cellulomonas terrae", "doi": "10.1099/ijs.0.63696-0"},
    {"name": "Mycobacterium paraseoulense", "doi": "10.1099/ijs.0.012054-0"},
    {"name": "Providencia sneebia", "doi": "10.1099/ijs.0.000117-0"},
    {"name": "Psychrobacter lutiphocae", "doi": "10.1099/ijs.0.008706-0"}
]

# Enrich each article in place from the first row matching its DOI;
# "N/A" is stored when nothing matched or the column is not in the result.
for article in articles:
    doi_data = get_row_by_doi(df, article['doi'], columns)
    for col in columns:
        article[col] = (
            doi_data.iloc[0][col]
            if (not doi_data.empty and col in doi_data)
            else "N/A"
        )

# Copy of the records without the 'doi' key, used for the JSON export
articles_json = [
    {field: article[field] for field in article if field != 'doi'}
    for article in articles
]

# Write the DOI-free records to disk
json_file_path = 'targets.json'
with open(json_file_path, 'w') as file:
    json.dump(articles_json, file, indent=4)


# Function to convert JSON to HTML table
def json_to_html_table(json_data):
    """Render a list of flat dicts as an HTML table string.

    The header row is the ordered union of the keys of all records, so for
    records that share the same keys it is simply the first record's keys.
    Each record becomes one row; a key that a record lacks yields an empty
    cell, which keeps every value under its own header.

    Args:
        json_data: List of dicts (one per table row).

    Returns:
        str: ``<table border='1'>...</table>`` markup. An empty list gives an
        empty table instead of raising ``IndexError``.
    """
    # Imported locally (and by name) because the cell does not import the
    # ``html`` module at file level.
    from html import escape

    if not json_data:
        return "<table border='1'></table>"

    # dict.fromkeys de-duplicates while preserving first-seen order.
    headers = list(dict.fromkeys(key for item in json_data for key in item))

    parts = ["<table border='1'>", "<tr>"]
    # Header and cell text is escaped so characters such as '<' or '&' in the
    # data are shown literally and cannot break the table markup.
    parts.extend("<th>{}</th>".format(escape(str(key))) for key in headers)
    parts.append("</tr>")

    for item in json_data:
        parts.append("<tr>")
        parts.extend(
            "<td>{}</td>".format(escape(str(item.get(key, ""))))
            for key in headers
        )
        parts.append("</tr>")

    parts.append("</table>")
    return "".join(parts)

# Render the enriched article records as an HTML table. This passes
# `articles` (which still carries the 'doi' key), not the DOI-free
# `articles_json` list written to disk above.
html_output = json_to_html_table(articles)

# Trailing expression: the notebook shows the HTML object as the cell output.
HTML(html_output)


Index(['Habitat', 'IJSEM year', 'article doi', '16S rDNA accession number',
       'DNA GC content (mol%)', 'oxygen preference', 'mean length (microns)',
       'mean width (microns)', 'motility', 'spore production',
       'Metabolism assays', 'Genus name', 'species name', 'strain name',
       'pH optimum for growth', 'pH range at which growth occurred',
       'temperature optimum for growth (degC)',
       'temperature range at which growth occurred (degC)',
       'optimal NaCl concentration for growth (%)',
       'NaCl concentration range at which growth occurred (%)',
       'pigment production', 'cell shape', 'cell aggregation',
       'article first page', 'culture collection codes',
       'Sole carbon substrate use', 'genome accession number', 'gram status',
       'If 'other' was chosen above, please enter a habitat below',
       'The paper included Biolog results'],
      dtype='object')


name,doi,cell shape,mean length (microns),mean width (microns),cell aggregation,gram status
Belliella baltica,10.1099/ijs.0.02752-0,rod,0.9-3.0,0.3-0.5,clump,negative
Hongiella halophila,10.1099/ijs.0.02861-0,rod,1.1-1.7,0.4-0.5,not indicated,negative
Nitratireductor aquibiodomus,10.1099/ijs.0.02793-0,rod,2.0-3.0,1,not indicated,negative
Paenibacillus favisporus,10.1099/ijs.0.02709-0,rod,02-Mar,0.5-0.7,clump,variable
Nocardia asiatica,10.1099/ijs.0.02676-0,not indicated,,,not indicated,positive
Sulfitobacter delicatus,10.1099/ijs.0.02654-0,rod,1.35,0.7,none,negative
Cellulomonas terrae,10.1099/ijs.0.63696-0,rod,,,not indicated,positive
Mycobacterium paraseoulense,10.1099/ijs.0.012054-0,rod,Not indicated,Not indicated,not indicated,
Providencia sneebia,10.1099/ijs.0.000117-0,rod,,,clump,negative
Psychrobacter lutiphocae,10.1099/ijs.0.008706-0,ovoid/coccobacillus,,,,negative


In [23]:
import pandas as pd
import json

def extract_json_from_csv(file_name, output_column):
    """Extract one JSON object per row from a text column of a CSV file.

    For every cell of ``output_column`` the text from the first ``{`` to the
    last ``}`` is parsed with ``json.loads``. A cell that is not a string
    (for example an empty cell read as NaN), has no such span, or does not
    parse contributes an empty dict, so the result stays aligned with the
    rows of the CSV.

    Args:
        file_name: Path of the CSV file to read.
        output_column: Name of the column that holds the text.

    Returns:
        list[dict]: One dict per row. ``[]`` when the file cannot be read or
        the column is missing; a warning describing the reason is issued in
        both cases so the empty result is not silent.
    """
    import warnings

    try:
        df = pd.read_csv(file_name)
    except Exception as exc:
        # Still best-effort (callers get []), but the cause is reported.
        warnings.warn(f"Could not read {file_name!r}: {exc}")
        return []

    if output_column not in df.columns:
        warnings.warn(f"Column {output_column!r} not found in {file_name!r}")
        return []

    json_objects = []
    for row in df[output_column]:
        parsed = {}
        if isinstance(row, str):
            start = row.find('{')
            end = row.rfind('}')
            # find/rfind return -1 when the character is absent; the closing
            # brace must also come after the opening one.
            if start != -1 and end > start:
                try:
                    parsed = json.loads(row[start:end + 1])
                except (ValueError, RecursionError):
                    # json.JSONDecodeError is a ValueError: malformed JSON
                    # keeps the placeholder.
                    parsed = {}
        json_objects.append(parsed)
    return json_objects

# Models to process and the CSV file holding each model's raw outputs
data_json = [
    {"model": "gpt-3.5-turbo", "filename": "output_file_gpt-3.5-turbo.csv"},
    {"model": "gpt-4-0314", "filename": "output_file_gpt-4-0314.csv"},
    {"model": "gpt-4-0613", "filename": "output_file_gpt-4-0613.csv"},
    {"model": "gpt-4-1106-preview", "filename": "output_file_gpt-4-1106-preview.csv"},
    {"model": "llama2-7b-chat", "filename": "output_file_llama2-7b-chat.csv"},
    {"model": "llama2-7b", "filename": "output_file_llama2-7b.csv"},
    {"model": "phi-2", "filename": "output_file_phi-2.csv"},
]

# Name of the CSV column that contains the model responses
output_column = 'Output'

# Attach the extracted JSON objects to each model entry (files are looked up
# relative to the working directory)
for entry in data_json:
    file_name = entry["filename"]
    extracted_json = extract_json_from_csv(file_name, output_column)
    entry.update(output=extracted_json)

# Persist the combined structure
new_file_path = 'processed_model_outputs.json'
with open(new_file_path, 'w') as file:
    json.dump(data_json, file, indent=4)

# Trailing expression: echoes the output path as the cell result
new_file_path



'processed_model_outputs.json'

In [24]:
import json
import pandas as pd
from IPython.display import HTML

# Load the per-model results; the code below reads a 'correctness' entry from
# the attribute dicts inside each model's 'output' list.
# NOTE(review): the previous cell writes 'processed_model_outputs.json', while
# this cell reads 'updated_models_output.json'. Presumably the correctness
# annotations are added in a step that is not part of this notebook — confirm
# where this file comes from.
with open('updated_models_output.json', 'r') as file:
    updated_models_data = json.load(file)

# Function to calculate accuracy
def calculate_accuracy(model_data):
    """Compute the per-attribute accuracy of one model as percentage strings.

    Args:
        model_data: Dict with an 'output' list of records. Within a record,
            every value that is itself a dict counts as one graded attribute;
            it is counted as correct when its 'correctness' entry equals 1
            (a missing entry counts as incorrect). Other values are ignored.

    Returns:
        dict: Attribute name -> accuracy formatted like "80.0%", i.e. the
        share of correct answers rounded to two decimals.
    """
    tallies = {}
    for record in model_data['output']:
        for attribute, details in record.items():
            # Only dict values carry a correctness flag
            if not isinstance(details, dict):
                continue
            counts = tallies.setdefault(attribute, {'correct': 0, 'total': 0})
            counts['total'] += 1
            if details.get('correctness', 0) == 1:
                counts['correct'] += 1

    # Turn the raw counts into percentage strings
    return {
        attribute: f"{round(counts['correct'] / counts['total'] * 100, 2)}%"
        for attribute, counts in tallies.items()
    }

# One record per model: its per-attribute accuracies plus the model name
data_for_df = []

for model in updated_models_data:
    model_name = model['model']
    accuracy_data = {**calculate_accuracy(model), 'Model Name': model_name}
    data_for_df.append(accuracy_data)

# Tabulate the records, indexed by model name
df = pd.DataFrame(data_for_df).set_index('Model Name')

# Render the table as HTML markup
html_table = df.to_html()

# Trailing expression: the notebook shows the table as the cell output
HTML(html_table)


Unnamed: 0_level_0,name,cell_shape,cell_arrangement,gram_staining,mean length (microns),mean width (microns)
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gpt-3.5-turbo,100.0%,80.0%,60.0%,70.0%,80.0%,80.0%
gpt-4-0314,100.0%,70.0%,40.0%,70.0%,80.0%,80.0%
gpt-4-0613,100.0%,70.0%,30.0%,70.0%,80.0%,80.0%
gpt-4-1106-preview,100.0%,70.0%,30.0%,70.0%,80.0%,80.0%
llama2-7b-chat,0.0%,30.0%,40.0%,50.0%,50.0%,40.0%
llama2-7b,0.0%,50.0%,30.0%,70.0%,70.0%,60.0%
phi-2,60.0%,30.0%,30.0%,30.0%,40.0%,30.0%


In [30]:
import json
import pandas as pd
from IPython.display import HTML

# Load the per-model results again (same file as in cell In [24]), so this
# cell binds `updated_models_data` itself.
with open('updated_models_output.json', 'r') as file:
    updated_models_data = json.load(file)

def calculate_average_correctness_for_models(data):
    """Average the 'correctness' scores of each model over all its outputs.

    Args:
        data: List of dicts, each with a 'model' name and an 'output' list of
            records. Within a record, only values that are dicts containing a
            'correctness' key contribute to the average.

    Returns:
        list[dict]: One ``{'model': ..., 'average_correctness': ...}`` entry
        per model, in input order. The average is 0 when a model has no
        scored field.
    """
    summaries = []

    for entry in data:
        label = entry['model']
        score_sum, scored_fields = 0, 0

        for record in entry['output']:
            for field in record.values():
                # Skip anything that is not a graded attribute
                if not (isinstance(field, dict) and 'correctness' in field):
                    continue
                score_sum += field['correctness']
                scored_fields += 1

        if scored_fields > 0:
            mean_score = score_sum / scored_fields
        else:
            mean_score = 0
        summaries.append({'model': label, 'average_correctness': mean_score})

    return summaries

# Calculate the average correctness per model. Despite the "by_index" in the
# variable name, this holds one entry per model, not per output index.
correctness_statistics_by_index = calculate_average_correctness_for_models(updated_models_data)


# Prepare HTML table for display with corrected key
def generate_html_table_by_index(data):
    """Build a two-column HTML table: model name and average correctness.

    Args:
        data: Iterable of dicts with a 'model' entry and a numeric
            'average_correctness' entry.

    Returns:
        str: ``<table>`` markup with a header row followed by one row per
        entry; the average is formatted with two decimals.
    """
    # Imported locally because the notebook does not import ``html`` at file
    # level.
    from html import escape

    # The model text is escaped so characters such as '<' or '&' are shown
    # literally instead of being interpreted as markup.
    rows = "".join(
        f"<tr><td>{escape(str(stat['model']))}</td>"
        f"<td>{stat['average_correctness']:.2f}</td></tr>"
        for stat in data
    )
    return f"<table><tr><th>Model</th><th>Average Correctness</th></tr>{rows}</table>"

# Render the per-model averages and show them in the notebook.
# NOTE(review): `display` is not imported in this cell (only `HTML` is);
# presumably it is the built-in that IPython/Jupyter provides — confirm if
# this code is ever run outside a notebook.
html_table_by_index = generate_html_table_by_index(correctness_statistics_by_index)
display(HTML(html_table_by_index))

Model,Average Correctness
gpt-3.5-turbo,0.78
gpt-4-0314,0.73
gpt-4-0613,0.72
gpt-4-1106-preview,0.72
llama2-7b-chat,0.35
llama2-7b,0.47
phi-2,0.37


In [31]:
# Modified function to calculate average correctness by index within the output dictionary
# and use the value of the 'name' field from the first element in the JSON for the left column in the table.

def calculate_average_correctness_by_index(data):
    """Average correctness per output position, pooled across all models.

    Position ``i`` collects every graded field (a dict value containing a
    'correctness' key) found in the i-th output record of every model.

    Args:
        data: List of dicts, each with an 'output' list of records. Models
            may have output lists of different lengths, and individual
            records may be empty.

    Returns:
        list[dict]: One dict per output position with the keys
        'total_correctness', 'total_fields', 'name' and 'average_correctness'.
        'name' is ``record['name']['value']`` taken from the first model (in
        input order) whose record at that position provides it, or "" when no
        model does. 'average_correctness' is 0 when nothing was graded at that
        position. An empty ``data`` list yields ``[]``.
    """
    if not data:
        return []

    # Size by the longest output list so a model with more records than the
    # first one cannot index past the end of the statistics list.
    num_outputs = max(len(model_data['output']) for model_data in data)
    correctness_statistics_by_index = [
        {"total_correctness": 0, "total_fields": 0, "name": ""}
        for _ in range(num_outputs)
    ]
    # Tracks which positions already have a name, so the earliest model wins
    # even when the name it supplies is an empty string.
    named = [False] * num_outputs

    for model_data in data:
        for i, item in enumerate(model_data['output']):
            stat = correctness_statistics_by_index[i]

            # A record may lack 'name' (e.g. an empty dict left by a failed
            # JSON extraction); in that case a later model can supply it.
            name_field = item.get('name')
            if not named[i] and isinstance(name_field, dict) and 'value' in name_field:
                stat["name"] = name_field['value']
                named[i] = True

            for value in item.values():
                if isinstance(value, dict) and 'correctness' in value:
                    stat["total_correctness"] += value['correctness']
                    stat["total_fields"] += 1

    # Calculate average correctness for each position
    for stat in correctness_statistics_by_index:
        if stat["total_fields"] > 0:
            stat["average_correctness"] = stat["total_correctness"] / stat["total_fields"]
        else:
            stat["average_correctness"] = 0

    return correctness_statistics_by_index

# Calculate correctness statistics per output index, pooled over all models.
# This rebinds `correctness_statistics_by_index`, which cell In [30] used for
# its per-model results, and relies on `updated_models_data` loaded earlier.
correctness_statistics_by_index = calculate_average_correctness_by_index(updated_models_data)

# Generate HTML table with names from the first model's output
def generate_html_table_by_index(data):
    """Build a two-column HTML table: output name and average correctness.

    This redefines the function of the same name from cell In [30]; this
    version reads the 'name' key instead of 'model'.

    Args:
        data: Iterable of dicts with a 'name' entry and a numeric
            'average_correctness' entry.

    Returns:
        str: ``<table>`` markup with a header row followed by one row per
        entry; the average is formatted with two decimals.
    """
    # Imported locally because the notebook does not import ``html`` at file
    # level.
    from html import escape

    # Names originate from model output, so they are escaped before being
    # placed into the markup.
    rows = "".join(
        f"<tr><td>{escape(str(stat['name']))}</td>"
        f"<td>{stat['average_correctness']:.2f}</td></tr>"
        for stat in data
    )
    return f"<table><tr><th>Output Name</th><th>Average Correctness</th></tr>{rows}</table>"

# Render the per-index averages and show them in the notebook.
# NOTE(review): this cell has no imports of its own; `HTML` was imported in
# earlier cells and `display` is presumably the IPython built-in — the cell
# therefore depends on those earlier cells having been run.
html_table_by_index = generate_html_table_by_index(correctness_statistics_by_index)
display(HTML(html_table_by_index))


Output Name,Average Correctness
Belliella baltica,0.76
Hongiella halophila,0.43
Nitratireductor aquibiodomus,0.31
Paenibacillus favisporus,0.57
Nocardia asiatica,0.36
Sulfitobacter delicatus,0.57
Cellulomonas terrae,0.83
Mycobacterium paraseoulense,0.67
Providencia sneebia,0.57
Psychrobacter lutiphocae,0.83
