# Clustering of protein sequences and downstream analysis

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import homogeneity_score, fowlkes_mallows_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from Bio import SeqIO, Entrez
import gzip
from sklearn.cluster import DBSCAN

In [2]:
def read_fasta(file_path):
    """Parse a FASTA file into parallel lists of sequences and labels.

    Args:
        file_path: Location of the FASTA file handed to ``SeqIO.parse``.

    Returns:
        A ``(sequences, labels)`` tuple. ``sequences`` holds each record's
        sequence as a plain string; ``labels`` holds the value that
        ``get_tropism_label`` returns for the matching record id. Both lists
        follow the record order of the file.
    """
    # Labelling is delegated to get_tropism_label; adapt that function to
    # the dataset rather than changing the parsing here.
    parsed = [
        (str(entry.seq), get_tropism_label(entry.id))
        for entry in SeqIO.parse(file_path, "fasta")
    ]
    sequences = [seq for seq, _ in parsed]
    labels = [label for _, label in parsed]
    return sequences, labels

def get_tropism_label(record_id):
    """Derive an integer tropism label from a sequence identifier.

    The identifier is tested for each known marker substring in order and
    the label of the first marker found is returned.

    Args:
        record_id: Sequence identifier to inspect.

    Returns:
        ``0`` if the id contains ``"TrophismTypeA"``, otherwise ``1`` if it
        contains ``"TrophismTypeB"``, otherwise ``2``.
    """
    # Placeholder markers: replace with the rules that fit the dataset.
    # Order matters, since the first matching marker wins.
    marker_labels = (
        ("TrophismTypeA", 0),
        ("TrophismTypeB", 1),
    )
    for marker, label in marker_labels:
        if marker in record_id:
            return label
    return 2  # fallback when no marker is present

In [3]:
# Read the gzipped Pfam domtblout table as tab-separated text with no header
pfam_file = r"C:\Users\m1lfslay3r6000\Music\ESS_569_Project\data\raw\G3PA_diel.Pfam35.domtblout.tab.gz"
pfam_df = pd.read_csv(pfam_file, header=None, sep='\t', compression='gzip')
# Promote the first row to column labels, then discard that row from the data
pfam_df.columns = pfam_df.iloc[0]
pfam_df = pfam_df.drop(index=0)
# Keep only the first 1000 remaining rows for the steps that follow
pfam_df_subset = pfam_df.head(1000)
print(pfam_df_subset.head(5))  # quick look at the top of the subset

0 #                                                                                                      --- full sequence --- -------------- this domain -------------   hmm coord   ali coord   env coord
1  # target name                                 ...                                                                                                                                                       
2  #                          -------------------...                                                                                                                                                       
3  G3PA.diel.S4C7.B_TRINITY_DN22282_c0_g5_i1_1   ...                                                                                                                                                       
4  G3PA.diel.S4C7.B_TRINITY_DN22282_c0_g5_i1_1   ...                                                                                                                                    

In [4]:
# Purpose of this cell: turn the single-column text rows of pfam_df_subset
# into a table with one column per whitespace-separated field, label the
# columns with names derived from the file's own header line, and save the
# result as CSV.
#
# NOTE(review): every index below is hard-coded for one specific header
# layout. The printed preview truncates the header text, so which names end
# up merged or removed cannot be fully confirmed from here - verify against
# the actual header line before reusing this on another file.

# The first row of the subset holds the header text in a single cell (the
# preview shows it starting with "# target name"); split it on whitespace so
# that each word becomes one list entry.
column_names = pfam_df_subset.iloc[0, 0].split()  # Split the header by spaces
# Merge the entries at list indices 1 and 2 into one name ("target name"
# according to the preview); index 0 is left alone for now.
column_names[1] = column_names[1] + ' ' + column_names[2]  # Join indices 1 and 2 with a space
column_names.pop(2)  # Remove index 2, which is now part of index 1

# Merge the entries at list indices 11, 12 and 13 (the 12th-14th entries)
# into a single name stored at index 11.
column_names[11] = column_names[11] + ' ' + column_names[12] + ' ' + column_names[13]

column_names.pop(12)  # Remove the second part of the combination

column_names.pop(12)  # Remove the third part (it moved to index 12 after the previous pop)

# Merge the entries at list indices 22, 23 and 24 into a single name stored
# at index 22.
column_names[22] = column_names[22] + ' ' + column_names[23] + ' ' + column_names[24]


# Remove the two entries that were just folded into index 22
column_names.pop(23)  # Remove the second part of the combination
column_names.pop(23)  # Remove the third part of the combination (same index after the previous pop)

# Final clean-up of the name list.

# Drop the first entry (presumably the leading '#' comment marker seen in the
# preview - confirm).
column_names.pop(0)

# Drop the entry now at index 1, then merge the entries at indices 2 and 3
# into one name and remove the leftover at index 3.
# NOTE(review): on the data side (below) columns 1 and 2 are concatenated
# rather than one of them being discarded; confirm the resulting labels still
# line up with the intended data columns.
column_names.pop(1)
column_names[2] = column_names[2] + ' ' + column_names[3] 
column_names.pop(3)
# Drop the rows with index labels 1 and 2. In the preview these are the
# "# target name ..." header line and the "# ---" separator line, leaving
# only data rows.
pfam_df_subset = pfam_df_subset.drop([1, 2])
pfam_df_subset = pfam_df_subset.reset_index(drop=True)
# Split the first column of every row on whitespace, one output column per token
pfam_df_split = pfam_df_subset.iloc[:, 0].str.split(expand=True)
# Concatenate the columns at positions 1 and 2 (space-separated) into position 1
pfam_df_split.iloc[:, 1] = pfam_df_split.iloc[:, 1] + ' ' + pfam_df_split.iloc[:, 2]

# Drop the column at position 2 now that it has been merged into position 1
pfam_df_split = pfam_df_split.drop(pfam_df_split.columns[2], axis=1)
# DataFrame.pop removes a column by label, not by position: these discard the
# columns labelled 9 and 10 (the integer labels produced by the split above).
pfam_df_split.pop(9)
pfam_df_split.pop(10)
pfam_df_split = pfam_df_split.reset_index(drop=True)
# Apply the derived names; pandas raises ValueError here if the number of
# names differs from the number of remaining columns.
pfam_df_split.columns = column_names
# Persist the reshaped table without the row index
pfam_df_split.to_csv(r'C:\Users\m1lfslay3r6000\Music\ESS_569_Project\data\pfam_df_split.csv', index=False)

'name'