# For clustering of protein sequences and the following analysis

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import homogeneity_score, fowlkes_mallows_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from Bio import SeqIO, Entrez
import gzip
from sklearn.cluster import DBSCAN

In [2]:
def read_fasta(file_path):
    """Reads a FASTA file and returns protein sequences and labels."""
    sequences = []
    labels = []
    
    # Replace this with your logic to assign labels based on your dataset
    for record in SeqIO.parse(file_path, "fasta"):
        sequences.append(str(record.seq))
        
        # Example: Assigning labels based on some logic (modify as needed)
        label = get_tropism_label(record.id)  # This should return the correct label
        labels.append(label)
    
    return sequences, labels

def get_tropism_label(record_id):
    """Assign tropism labels based on sequence ID or other logic."""
    # Example of how you might determine a label from the sequence ID
    # Modify according to your specific dataset and labeling method
    if "TrophismTypeA" in record_id:
        return 0
    elif "TrophismTypeB" in record_id:
        return 1
    else:
        return 2  # Default case, adjust as needed

In [None]:
# Load the domblout.tab file (Pfam data)
pfam_file = r"C:\Users\m1lfslay3r6000\Music\ESS_569_Project\data\raw\G3PA_diel.Pfam35.domtblout.tab.gz"
pfam_df = pd.read_csv(pfam_file, sep='\t', compression='gzip', header=None)
# Set the first row as the header
pfam_df.columns = pfam_df.iloc[0]  # Set first row as column names
pfam_df = pfam_df.drop(0)  # Drop the first row as it's now the header
print(pfam_df.head(3))

0 #                                                                                                      --- full sequence --- -------------- this domain -------------   hmm coord   ali coord   env coord
1  # target name                                 ...                                                                                                                                                       
2  #                          -------------------...                                                                                                                                                       
3  G3PA.diel.S4C7.B_TRINITY_DN22282_c0_g5_i1_1   ...                                                                                                                                                       


In [None]:
# Now you should have a proper DataFrame with columns separated
# Extract the 'target_name' and 'query_accession' columns by their names
target_name_column = pfam_df["target name"]
query_accession_column = pfam_df["query accession"]

# Display the extracted columns
print(target_name_column.head())
print(query_accession_column.head())

In [None]:
pfam_df = pfam_df.iloc[:, [0, 1]]  # Select only the first two columns (target_name and query-accession)
pfam_df.columns = ['target_name', 'query_accession']  # Rename for clarity

In [None]:
# Load the taxonomy file (Assuming the file is in a similar format)
taxonomy_file = r"C:\Users\m1lfslay3r6000\Music\ESS_569_Project\data\raw\NPac.G3PA_diel.bf100.id99.aa.fasta.gz"
taxonomy_df = pd.read_csv(taxonomy_file, sep='\t')  # Adjust separator if needed
taxonomy_df = taxonomy_df[['query_name', 'tax_id']]  # Only keep query_name and tax_id columns

# Fetch taxonomy names using Entrez
def fetch_taxonomy_name(tax_id):
    try:
        # Make sure tax_id is a string
        tax_id = str(tax_id)
        handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode="xml")
        records = Entrez.read(handle)
        handle.close()
        # Return the name of the taxa
        return records[0]['ScientificName']
    except Exception as e:
        print(f"Error fetching taxonomy for ID {tax_id}: {e}")
        return None

# Fetch taxonomy names and avoid duplicates
taxonomy_names = {}
for idx, row in taxonomy_df.iterrows():
    tax_id = row['tax_id']
    if tax_id != 0 and tax_id not in taxonomy_names:  # Skip tax_id 0 and duplicates
        taxonomy_names[tax_id] = fetch_taxonomy_name(tax_id)

# Add taxonomy names to the taxonomy_df
taxonomy_df['taxonomy_name'] = taxonomy_df['tax_id'].map(taxonomy_names)

# Merge pfam_df with taxonomy_df based on query_name
merged_df = pd.merge(pfam_df, taxonomy_df[['query_name', 'taxonomy_name']], on='target_name', how='left')

# Load the selected_organism_labels.csv
organism_labels_file = "path_to_selected_organism_labels.csv"  # Replace with actual path
organism_labels_df = pd.read_csv(organism_labels_file)

# Rename 'organism name' to 'taxonomy_name'
organism_labels_df.rename(columns={'organism name': 'taxonomy_name'}, inplace=True)

# Match the first word or first two words of taxonomy_name
def match_taxonomy_name(taxonomy_name, trophism_df):
    # Extract the first word or first two words
    taxonomy_parts = taxonomy_name.split()
    first_two_words = ' '.join(taxonomy_parts[:2])
    
    # Find matching trophism mode
    matched_rows = trophism_df[trophism_df['taxonomy_name'].str.contains(first_two_words, na=False)]
    
    return matched_rows['trophic_mode'].iloc[0] if not matched_rows.empty else None

# Assign trophism modes based on the matching taxonomy_name
merged_df['trophic_mode'] = merged_df['taxonomy_name'].apply(lambda x: match_taxonomy_name(x, organism_labels_df))

# Drop unnecessary columns and keep only 'query_name', 'query_accession', and 'trophic_mode'
merged_df = merged_df[['target_name', 'query_accession', 'trophic_mode']]

# Optionally, sample the data to reduce memory load while maintaining the same distribution of trophic modes
sampled_df = merged_df.groupby('trophic_mode', group_keys=False).apply(lambda x: x.sample(frac=0.1))  # Adjust fraction as needed

# Run DBSCAN (Ensure you have the necessary features for clustering)

# Standardize features for clustering
scaler = StandardScaler()
scaled_df = scaler.fit_transform(sampled_df[['query_accession']])  # Adjust columns for clustering if necessary

# Perform DBSCAN clustering
db = DBSCAN(eps=0.5, min_samples=5).fit(scaled_df)

# Add cluster labels to the dataframe
sampled_df['cluster'] = db.labels_

# Analyze the clusters (for example, top 10 most abundant pfam ids per cluster)
cluster_summary = sampled_df.groupby('cluster')['trophic_mode'].value_counts().nlargest(10)

# Display or save the results
sampled_df.to_csv('clustered_output.csv', index=False)
print("Clustering completed and saved.")


## Step 5: Visualize the clusters using PCA

In [None]:
# Standardize the features before performing PCA
scaler = StandardScaler()
df_scaled = scaler.fit_transform(merged_df)

# Apply PCA to reduce the data to 2D for visualization
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_scaled)

# Add PCA components to the dataframe for visualization
merged_df["pca1"] = pca_result[:, 0]
merged_df["pca2"] = pca_result[:, 1]

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(merged_df["pca1"], merged_df["pca2"], c=merged_df["cluster_id"], cmap="viridis", alpha=0.5)
plt.title("PCA Visualization of Clusters")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Cluster ID")
plt.show()

## Step 6: Evaluate Clustering (using homogeneity and Fowlkes-Mallows Index)

In [None]:
# Calculate homogeneity and FMI if true labels are available
homogeneity = homogeneity_score(merged_df["true_label"], merged_df["cluster_id"])
print(f"Homogeneity Score: {homogeneity}")

fmi = fowlkes_mallows_score(merged_df["true_label"], merged_df["cluster_id"])
print(f"Fowlkes-Mallows Index: {fmi}")