<a href="https://colab.research.google.com/github/Angsumi/GPT_VISION_FOR_Animal_Classification/blob/main/GPT_Vision_Biological_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from wordcloud import WordCloud
import seaborn as sns


In [None]:
# Load Excel sheets into dataframes
#
# The workbook is expected to contain two sheets, 'Input' and 'Output'.
# Downstream cells compare them column by column, treating 'Input' as the
# reference labels and 'Output' as the predictions.

# Using ExcelFile as a context manager closes the underlying file handle as
# soon as both sheets have been parsed instead of leaving it open.
with pd.ExcelFile('/mnt/data/Species List.xlsx') as xls:
    input_df = xls.parse('Input')
    output_df = xls.parse('Output')


In [None]:
# Data Cleaning Function

def clean_dataframe(df):
    """Normalise the headers and text cells of a taxonomy sheet.

    The frame is modified in place and also returned, so callers that want
    to keep the raw data should pass a copy.

    Headers are stripped of surrounding whitespace and capitalised
    (first letter upper-case, the rest lower-case). Every text column is
    then reduced to a single capitalised word:

    * ``Species``: the second whitespace-separated word of each cell, or
      the first word when the cell holds only one word.
    * any other text column: the first word of each cell.

    Missing cells stay missing; non-text columns are left untouched.

    Args:
        df: DataFrame to clean.

    Returns:
        The same DataFrame object, cleaned.
    """
    # Strip *before* capitalising: str.capitalize() only upper-cases the very
    # first character, so a header such as " species" would otherwise stay
    # lower-case and never match the 'Species' branch below.
    df.columns = [str(col).strip().capitalize() for col in df.columns]

    for col in df.columns:
        dtype = df[col].dtype
        # Check the dtype (not the values) so columns containing NaN still
        # qualify, and so dedicated string dtypes are handled like object.
        is_text = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_text:
            continue

        words = df[col].str.split()
        first_word = words.str[0].str.capitalize()
        if col == 'Species':
            # Prefer the second word; fall back to the first one when the
            # cell has no second word.
            df[col] = words.str[1].str.capitalize().fillna(first_word)
        else:
            df[col] = first_word
    return df

# Clean copies of both sheets so the raw frames stay untouched
# (clean_dataframe modifies the frame it is given).
input_df_cleaned, output_df_cleaned = (
    clean_dataframe(sheet.copy()) for sheet in (input_df, output_df)
)


In [None]:
# Pie Diagrams for Empty Predictions
#
# One pie per column of the cleaned output sheet (the first column is
# skipped), showing how many rows hold a value versus how many are empty.

pie_columns = output_df_cleaned.columns[1:]
counts = ["Predicted", "Unable to predict"]

# squeeze=False makes subplots always return a 2-D array of Axes; without it
# a single column yields a bare Axes object and `axes[idx]` fails.
fig, axes = plt.subplots(1, len(pie_columns), figsize=(20, 4), squeeze=False)
axes = axes[0]  # the only row, as a 1-D array of Axes

for idx, col in enumerate(pie_columns):
    values = [output_df_cleaned[col].notna().sum(), output_df_cleaned[col].isna().sum()]
    axes[idx].pie(values, labels=counts, autopct='%1.1f%%', startangle=90, colors=['#4CAF50', '#FFC107'])
    axes[idx].set_title(col)
plt.tight_layout()
plt.show()


In [None]:
# Accuracy Calculation
#
# For every column after the first, the share of rows whose cleaned input
# value equals the cleaned output value. The denominator is the full row
# count, so rows with a missing value on either side count as incorrect.

total_entries = input_df_cleaned.shape[0]
accuracies = {
    col: (input_df_cleaned[col] == output_df_cleaned[col]).sum() / total_entries
    for col in input_df_cleaned.columns[1:]
}
accuracies


In [None]:
# Word Cloud for Misclassifications

def get_misclassified_terms(input_df, output_df, rank):
    """Count the reference terms of ``rank`` that were not reproduced.

    A row counts as misclassified when its ``rank`` value in ``input_df``
    differs from the one in ``output_df``.

    Args:
        input_df: DataFrame holding the reference values.
        output_df: DataFrame holding the predicted values, with the same
            index as ``input_df``.
        rank: Name of the column to compare.

    Returns:
        dict mapping each misclassified reference term to the number of
        rows in which it was misclassified. Missing reference values are
        not counted.
    """
    differs = input_df[rank] != output_df[rank]
    wrong_terms = input_df.loc[differs, rank]
    frequencies = wrong_terms.value_counts()
    return frequencies.to_dict()

# One word cloud per rank; each term is sized by how often it was
# misclassified.
ranks = ['Phylum', 'Class', 'Order', 'Family']
fig, axes = plt.subplots(1, len(ranks), figsize=(20, 5))
for idx, rank in enumerate(ranks):
    misclassified_terms = get_misclassified_terms(input_df_cleaned, output_df_cleaned, rank)
    if misclassified_terms:
        wc = WordCloud(background_color='white', width=400, height=400).generate_from_frequencies(misclassified_terms)
        axes[idx].imshow(wc, interpolation='bilinear')
    else:
        # WordCloud raises ValueError when given no words, which would abort
        # the whole figure for a rank with no mismatches. Show a placeholder
        # instead.
        axes[idx].text(0.5, 0.5, 'No misclassifications', ha='center', va='center',
                       transform=axes[idx].transAxes)
    axes[idx].axis('off')
    axes[idx].set_title(f'Misclassified {rank}')
plt.tight_layout()
plt.show()


In [None]:
# Precision, Recall, and F1-Score Calculation
#
# For every column after the first, score the cleaned output against the
# cleaned input using only the rows where both sides hold a value. Precision,
# recall and F1 use scikit-learn's average='weighted' setting.

precision_list = []
recall_list = []
f1_list = []
accuracy_list = []

for col in input_df_cleaned.columns[1:]:
    mask = input_df_cleaned[col].notna() & output_df_cleaned[col].notna()
    y_true = input_df_cleaned.loc[mask, col]
    y_pred = output_df_cleaned.loc[mask, col]

    if y_true.empty:
        # No row has both a reference and a predicted value for this level,
        # so there is nothing to score. Record NaN for every metric rather
        # than calling the scorers on empty input.
        for scores in (precision_list, recall_list, f1_list, accuracy_list):
            scores.append(float('nan'))
        continue

    # zero_division=0 keeps the default value (0) for classes whose score is
    # undefined, without emitting a warning for each of them.
    precision_list.append(precision_score(y_true, y_pred, average='weighted', zero_division=0))
    recall_list.append(recall_score(y_true, y_pred, average='weighted', zero_division=0))
    f1_list.append(f1_score(y_true, y_pred, average='weighted', zero_division=0))
    accuracy_list.append((y_true == y_pred).mean())

metrics_df = pd.DataFrame({
    'Classification Level': input_df_cleaned.columns[1:],
    'Accuracy': accuracy_list,
    'Precision': precision_list,
    'Recall': recall_list,
    'F1-Score': f1_list
})
metrics_df


In [None]:
# Accuracy per taxonomic level, computed two ways:
#   * with nulls    - correct rows divided by all rows of the output sheet
#   * without nulls - correct rows divided by the rows where both the input
#                     and the output hold a value (0 when there are none)
columns = output_df_cleaned.columns[1:]
total_values = output_df_cleaned.shape[0]

accuracies_with_null = []
accuracies_without_null = []

for col in columns:
    reference = input_df_cleaned[col]
    predicted = output_df_cleaned[col]

    # A row is correct only when both sides hold the same non-null value
    # (a null never compares equal to anything).
    correct_predictions = (reference == predicted).sum()

    # Rows that can be compared at all: both sides non-null.
    not_null_rows = (reference.notna() & predicted.notna()).sum()

    accuracies_with_null.append(correct_predictions / total_values)
    if not_null_rows != 0:
        accuracies_without_null.append(correct_predictions / not_null_rows)
    else:
        accuracies_without_null.append(0)

# Grouped bar chart: for each taxonomic level, the accuracy with nulls (blue)
# next to the accuracy without nulls (red).
fig, ax = plt.subplots(figsize=(12, 6))

# Horizontal layout: the second bar of each pair is shifted right by one bar
# width, and the tick label sits midway between the two bar centres.
bar_width = 0.35
index = range(len(columns))
second_bar_positions = [i + bar_width for i in index]
tick_positions = [i + bar_width / 2 for i in index]

bar1 = ax.bar(index, accuracies_with_null, width=bar_width,
              label='With Null Values', color='b')
bar2 = ax.bar(second_bar_positions, accuracies_without_null, width=bar_width,
              label='Without Null Values', color='r')

# Axis labels, title, ticks and legend
ax.set(
    xlabel='Taxonomic Level',
    ylabel='Accuracy',
    title='Accuracy at Each Taxonomic Level',
)
ax.set_xticks(tick_positions)
ax.set_xticklabels(columns, rotation=45)
ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Create a confusion matrix for each classification level (Phylum, Class, Order).
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

# Function to plot confusion matrix for a given rank
def plot_confusion_matrix(rank, input_df, output_df):
    """Draw a confusion-matrix heatmap for one taxonomic rank.

    Only rows where both frames hold a value for ``rank`` are used. Rows of
    the matrix are the values from ``input_df`` (true labels); columns are
    the values from ``output_df`` (predicted labels). Labels are the sorted
    union of the values seen on either side.

    Args:
        rank: Name of the column to compare.
        input_df: DataFrame holding the true labels.
        output_df: DataFrame holding the predicted labels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Keep only the rows that are labelled on both sides
    both_present = input_df[rank].notna() & output_df[rank].notna()
    actual = input_df.loc[both_present, rank]
    predicted = output_df.loc[both_present, rank]

    # Every label seen on either side, in a stable sorted order
    labels = sorted(set(actual) | set(predicted))
    cm = confusion_matrix(actual, predicted, labels=labels)

    # Annotated heatmap with integer cell counts
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set(
        ylabel='True Label',
        xlabel='Predicted Label',
        title=f'Confusion Matrix for {rank}',
    )
    plt.show()

# Draw one confusion matrix each for 'Phylum', 'Class', and 'Order'
confusion_ranks = ('Phylum', 'Class', 'Order')
for rank in confusion_ranks:
    plot_confusion_matrix(rank, input_df_cleaned, output_df_cleaned)
