In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import homogeneity_score, fowlkes_mallows_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from Bio import SeqIO, Entrez
import gzip
from sklearn.cluster import DBSCAN

In [None]:
# Load the CSV file into a DataFrame
pfam_df_split = pd.read_csv(r'C:\Users\m1lfslay3r6000\Music\ESS_569_Project\data\pfam_df_split.csv')
# Load the LCA file, assuming it's tab-delimited
lca_file = r"C:\Users\m1lfslay3r6000\Music\ESS_569_Project\data\raw\NPac.G3PA_diel.MarFERReT_v1.1_MMDB.lca.tab.gz"
# Extract the unique target_name values from pfam_df_split
target_names_pfam = pfam_df_split['target name'].unique()
# Load the lca_df file, but only keep rows where the 'target_name' matches those in pfam_df_split
lca_df_filtered = pd.read_csv(lca_file, delimiter="\t")

# Rename the first column of lca_df to 'target_name' to match pfam_df_split
lca_df_filtered.columns = ['target_name'] + list(lca_df_filtered.columns[1:])
lca_df_filtered = lca_df_filtered[lca_df_filtered['target_name'].isin(target_names_pfam)]

tax_id = lca_df_filtered.iloc[:, 1]

In [15]:
# Always set your email when using Entrez
Entrez.email = "andcha1118@gmail.com"
# Fetch taxonomy names using Entrez
def fetch_taxonomy_name(tax_id):
    try:
        # Make sure tax_id is a string
        tax_id = str(tax_id)
        handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode="xml")
        records = Entrez.read(handle)
        handle.close()
        # Return the name of the taxa
        return records[0]['ScientificName']
    except Exception as e:
        print(f"Error fetching taxonomy for ID {tax_id}: {e}")
        return None

# Fetch taxonomy names and avoid duplicates
taxonomy_names = {}
for idx, row in lca_df_filtered.iterrows():  # Use lca_df instead of taxonomy_df
    tax_id = row[1]  # The second column is tax_id, based on the previous steps
    if tax_id != 0 and tax_id not in taxonomy_names:  # Skip tax_id 0 and duplicates
        taxonomy_names[tax_id] = fetch_taxonomy_name(tax_id)
# Create a new column 'taxonomy_name' in lca_df by matching tax_id with taxonomy_names
lca_df_filtered['taxonomy_name'] = lca_df_filtered['0'].map(taxonomy_names)
# Now, merge pfam_df_split with lca_df based on 'target_name' (from pfam_df_split) and the 'target_name' in lca_df
merged_df = pd.merge(pfam_df_split, lca_df_filtered, left_on='target name', right_on='target_name', how='left')
merged_df_clean = merged_df.dropna()

In [51]:
# Load the selected_organism_labels.csv
organism_labels_file = r"C:\Users\m1lfslay3r6000\Music\ESS_569_Project\data\ai_ready\organism_trophic_labels.csv"  # Replace with actual path
organism_labels_df = pd.read_csv(organism_labels_file)

# Rename 'organism name' to 'taxonomy_name'
organism_labels_df.rename(columns={'Organism Name': 'taxonomy_name'}, inplace=True)

In [52]:
# Match the first word or first two words of taxonomy_name
def match_taxonomy_name(taxonomy_name, organism_labels_df):
    # Extract the first word or first two words
    taxonomy_parts = taxonomy_name.split()
    first_two_words = ' '.join(taxonomy_parts[:2])  # First two words
    
    # Find matching trophism mode
    matched_rows = organism_labels_df[organism_labels_df['taxonomy_name'].str.contains(first_two_words, na=False)]
    
    return matched_rows['trophic_mode'].iloc[0] if not matched_rows.empty else None
merged_df_clean.loc[:, 'taxonomy_name'] = merged_df_clean['taxonomy_name'].astype(str)
merged_df_clean.loc['taxonomy_name_first_two'] = merged_df_clean['taxonomy_name'].apply(lambda x: ' '.join(x.split()[:2]))

# Safely create 'taxonomy_name_first_two' column, handling NaN values
merged_df_clean['taxonomy_name_first_two'] = merged_df_clean['taxonomy_name'].apply(
    lambda x: ' '.join(x.split()[:2]) if isinstance(x, str) else None
)

In [56]:
# Iterate through merged_df_clean and fill 'Trophic Mode' based on matching 'taxonomy_name_first_two'
for index, row in merged_df_clean.iterrows():
    # Get the first two words from the 'taxonomy_name_first_two' column
    tax_name_first_two = row['taxonomy_name_first_two']
    
    # Ensure tax_name_first_two is a string
    if isinstance(tax_name_first_two, str) and tax_name_first_two:
        # Search for the corresponding trophic_mode in organism_labels_df
        matched_row = organism_labels_df[organism_labels_df['taxonomy_name'].str.contains(tax_name_first_two, na=False, case=False)]
        
        if not matched_row.empty:
            # If there's a match, get the 'Trophic Mode' value
            trophic_mode = matched_row['Trophic Mode'].iloc[0]
            # Use loc to set the trophic_mode for the current row
            merged_df_clean.loc[index, 'Trophic Mode'] = trophic_mode
        else:
            # If no match is found, set trophic_mode to None or a placeholder value
            merged_df_clean.loc[index, 'Trophic Mode'] = None
    else:
        # If tax_name_first_two is not a valid string, handle the case
        merged_df_clean.loc[index, 'Trophic Mode'] = None
        


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_clean.loc[index, 'Trophic Mode'] = None


# The problem area 

In [None]:
# Drop unnecessary columns and keep only 'query_name', 'query_accession', and 'trophic_mode'
merged_df_clean = merged_df_clean[['target_name', 'accession', 'Trophic Mode']]

# Step 1: Preprocess the data
# Select relevant numeric columns (modify based on your actual data)
numeric_columns = ['target_name', 'accession', 'Trophic Mode']  # Replace with actual numeric columns
numeric_data = merged_df_clean[numeric_columns]

# Normalize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

# Step 2: Run DBSCAN
# Set DBSCAN parameters
eps = 0.5  # Adjust as needed
min_samples = 5  # Adjust as needed

# Fit DBSCAN to the data
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
merged_df_clean['DBSCAN_cluster'] = dbscan.fit_predict(scaled_data)

In [None]:
# Standardize the features before performing PCA
scaler = StandardScaler()
df_scaled = scaler.fit_transform(merged_df_clean)

# Apply PCA to reduce the data to 2D for visualization
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_scaled)

# Add PCA components to the dataframe for visualization
merged_df_clean["pca1"] = pca_result[:, 0]
merged_df_clean["pca2"] = pca_result[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(merged_df_clean["pca1"], merged_df_clean["pca2"], c=merged_df_clean["cluster_id"], cmap="viridis", alpha=0.5)
plt.title("PCA Visualization of Clusters")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Cluster ID")
plt.show()

# Calculate homogeneity and FMI if true labels are available
homogeneity = homogeneity_score(merged_df_clean["true_label"], merged_df_clean["cluster_id"])
print(f"Homogeneity Score: {homogeneity}")

fmi = fowlkes_mallows_score(merged_df_clean["true_label"], merged_df_clean["cluster_id"])
print(f"Fowlkes-Mallows Index: {fmi}")