In [93]:
from Bio import SeqIO
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [94]:
# Set the working directory so that every relative path below resolves uniformly.
# The directory defaults to the original location but can be overridden with the
# FUNPATH_HOME environment variable, so the notebook can run on another machine
# without editing this cell.
os.chdir(os.environ.get('FUNPATH_HOME', '/Users/Andi'))
print(os.getcwd())

/Users/Andi


Tab-separated files mapping UniProt entries to OrthoMCL groups. The later steps expect the columns to be labeled 'ID' for the OrthoMCL group and 'ProteinID' for the UniProt entry.
 

In [95]:

def _read_orth_table(tsv_path):
    """Load one tab-separated mapping file into a DataFrame."""
    return pd.read_csv(tsv_path, sep='\t')


# One mapping table per species
orth_chae = _read_orth_table('Desktop/funpath/DB/orth_uniprot/chae_og7orth_uniprot.tsv')
orth_cauris = _read_orth_table('Desktop/funpath/DB/orth_uniprot/cauris_og7orth_uniprot.tsv')
orth_calbicans = _read_orth_table('Desktop/funpath/DB/orth_uniprot/calbicans_og7orth_uniprot.tsv')

In [96]:
# Display the column labels of orth_calbicans as loaded (shown as the cell output).
orth_calbicans.columns 

Index(['Group ID', 'Entry'], dtype='object')

In [97]:
# Normalise the column labels of all three mapping tables to 'ID' and 'ProteinID'.
# orth_calbicans has the columns 'Group ID' and 'Entry' (see the columns output
# above); renaming 'Uniprot ID' alone therefore left 'Entry' untouched, because
# rename() ignores labels that are not present. 'Uniprot ID' is still accepted
# in case a differently labelled export is loaded.
orth_column_names = {'Entry': 'ProteinID', 'Uniprot ID': 'ProteinID', 'Group ID': 'ID'}
orth_chae.rename(columns=orth_column_names, inplace=True)
orth_cauris.rename(columns=orth_column_names, inplace=True)
orth_calbicans.rename(columns=orth_column_names, inplace=True)

In [98]:
# Overwrite every NaN in the three mapping tables with 0 (modifies them in place).
# NOTE(review): rows without a value end up carrying a literal 0 - confirm that
# the downstream steps are meant to see that placeholder.
for _orth_table in (orth_chae, orth_cauris, orth_calbicans):
    _orth_table.fillna(0, inplace=True)

In [99]:

# Input FASTA files, one per species (relative to the working directory)
fasta_file_chae, fasta_file_cauris, fasta_file_calbicans = (
    "Desktop/funpath/DB/fasta_uniprot/chaemulonii_fasta_2025_05_09.fasta",
    "Desktop/funpath/DB/fasta_uniprot/cauris_fasta_2025_05_09.fasta",
    "Desktop/funpath/DB/fasta_uniprot/calbicans_fasta_2025_05_09.fasta",
)


In [100]:

# Empty accumulators for the record IDs and sequences of each FASTA file
chae_ids, chae_sequences = [], []
cauris_ids, cauris_sequences = [], []
calbicans_ids, calbicans_sequences = [], []

In [101]:
# Read the FASTA file once and rebuild both lists from scratch, so re-running
# this cell does not append the same records a second time.
chae_records = list(SeqIO.parse(fasta_file_chae, "fasta"))
chae_ids = [record.id for record in chae_records]
chae_sequences = [str(record.seq) for record in chae_records]

In [102]:
# Read the FASTA file once and rebuild both lists from scratch, so re-running
# this cell does not append the same records a second time.
cauris_records = list(SeqIO.parse(fasta_file_cauris, "fasta"))
cauris_ids = [record.id for record in cauris_records]
cauris_sequences = [str(record.seq) for record in cauris_records]

In [103]:
# Read the FASTA file once and rebuild both lists from scratch, so re-running
# this cell does not append the same records a second time.
calbicans_records = list(SeqIO.parse(fasta_file_calbicans, "fasta"))
calbicans_ids = [record.id for record in calbicans_records]
calbicans_sequences = [str(record.seq) for record in calbicans_records]

In [104]:
# One two-column table (record ID, sequence) per species, built from the parsed lists
chae_fastadf = pd.DataFrame(dict(ID=chae_ids, Sequence=chae_sequences))
cauris_fastadf = pd.DataFrame(dict(ID=cauris_ids, Sequence=cauris_sequences))
calbicans_fastadf = pd.DataFrame(dict(ID=calbicans_ids, Sequence=calbicans_sequences))


In [105]:
def extract_protein_id(entry):
    """Return the second '|'-separated field of a FASTA record identifier.

    For an identifier such as 'tr|A0A2V1AUX0|A0A2V1AUX0_9ASCO' this is
    'A0A2V1AUX0'.

    Parameters
    ----------
    entry : str
        Record identifier taken from a FASTA header.

    Returns
    -------
    str
        The field between the first and second '|'. If the identifier
        contains no '|', the identifier is returned unchanged instead of
        failing with an IndexError.
    """
    fields = entry.split('|')
    if len(fields) < 2:
        # No pipe-delimited fields: the identifier is already a plain ID.
        return entry
    return fields[1]



In [106]:
# Derive the 'ProteinID' column of each table from its 'ID' column
for _fasta_table in (chae_fastadf, cauris_fastadf, calbicans_fastadf):
    _fasta_table['ProteinID'] = _fasta_table['ID'].apply(extract_protein_id)

In [110]:
# Output FASTA path. The file name uses the same 'chae_' prefix as the other
# per-species outputs and as the concat_ortho_proteins.py command that reads
# this file (it was previously written as 'chaeumulonii_...', which that
# command does not look for).
haeoutput_fasta_file = "Desktop/funpath/DB/fasta_uniprot/chae_uniprotfasta_justID_20250509.fasta"

# Write the table back out in FASTA format with only the ProteinID in each header
with open(haeoutput_fasta_file, "w") as fasta_file:
    for protein_id, sequence in zip(chae_fastadf['ProteinID'], chae_fastadf['Sequence']):
        fasta_file.write(f">{protein_id}\n")
        fasta_file.write(f"{sequence}\n")


In [111]:
# Destination of the re-headed FASTA file
aurisoutput_fasta_file = "Desktop/funpath/DB/fasta_uniprot/cauris_uniprotfasta_justID_20250509.fasta"

# Emit one header line (">" + ProteinID) and one sequence line per table row
with open(aurisoutput_fasta_file, "w") as fasta_file:
    for protein_id, sequence in zip(cauris_fastadf['ProteinID'], cauris_fastadf['Sequence']):
        fasta_file.write(f">{protein_id}\n")
        fasta_file.write(f"{sequence}\n")


In [112]:
# Destination of the re-headed FASTA file
albicansoutput_fasta_file = "Desktop/funpath/DB/fasta_uniprot/calbicans_uniprotfasta_justID_20250509.fasta"

# Emit one header line (">" + ProteinID) and one sequence line per table row
with open(albicansoutput_fasta_file, "w") as fasta_file:
    for protein_id, sequence in zip(calbicans_fastadf['ProteinID'], calbicans_fastadf['Sequence']):
        fasta_file.write(f">{protein_id}\n")
        fasta_file.write(f"{sequence}\n")


In [113]:
# Remove the original 'ID' column now that 'ProteinID' has been derived from it.
# errors='ignore' keeps the cell re-runnable: a second run, when the column is
# already gone, is a no-op instead of raising KeyError.
chae_fastadf.drop(columns=['ID'], errors='ignore', inplace=True)
cauris_fastadf.drop(columns=['ID'], errors='ignore', inplace=True)
calbicans_fastadf.drop(columns=['ID'], errors='ignore', inplace=True)

In [114]:
# Save each mapping table as a tab-separated file without the index column
for _orth_table, _tsv_path in (
    (orth_chae, 'Desktop/funpath/DB/fasta_orth/chae_orth_fastaconcat.tsv'),
    (orth_cauris, 'Desktop/funpath/DB/fasta_orth/cauris_orth_fastaconcat.tsv'),
    (orth_calbicans, 'Desktop/funpath/DB/fasta_orth/calbicans_orth_fastaconcat.tsv'),
):
    _orth_table.to_csv(_tsv_path, sep='\t', index=False)

In [69]:
# Small test subset: the first five rows of chae_fastadf
testdf = chae_fastadf.iloc[:5]

In [74]:
# Destination of the small test FASTA file
output_fasta_file = "Desktop/funpath/DB/test_fasta.fasta"

# Emit one header line (">" + ProteinID) and one sequence line per test row
with open(output_fasta_file, "w") as fasta_file:
    for protein_id, sequence in zip(testdf['ProteinID'], testdf['Sequence']):
        fasta_file.write(f">{protein_id}\n")
        fasta_file.write(f"{sequence}\n")


In [116]:
# Test mapping: five protein IDs that all share a single group ID
data = dict(
    ProteinID=["A0A2V1AUX0", "A0A2V1AQQ6", "A0A2V1ALF0", "A0A2V1AMQ3", "A0A2V1AND4"],
    ID=["OG6_500299"] * 5,
)

# Turn the mapping into a two-column table
testref_df = pd.DataFrame(data)

In [117]:
# Save the test mapping as a tab-separated file without the index column.
testref_df.to_csv('Desktop/funpath/DB/testref_df.tsv', sep='\t', index=False)

In [None]:
# These are shell commands, not Python: run them from a terminal (or prefix them with "!").
# Each one calls concat_ortho_proteins.py to concatenate FASTA sequences based on the
# orthologous groups (OGs) assigned by OrthoMCL.
# The DB/... paths correspond to the Desktop/funpath/DB/... paths used in the cells above,
# i.e. they are relative to Desktop/funpath.
# Presumably -m is the mapping TSV, -f the input FASTA and -o the output FASTA - confirm
# against the script's own usage text.
# The first command uses the small test files; the other three use the per-species files.

python concat_ortho_proteins.py -m DB/testref_df.tsv -f DB/test_fasta.fasta -o DB/test_concatorthfasta.fasta

python concat_ortho_proteins.py -m DB/fasta_orth/chae_orth_fastaconcat.tsv -f DB/fasta_uniprot/chae_uniprotfasta_justID_20250509.fasta -o DB/fasta_orth/chae_fasta_orth_concat.fasta

python concat_ortho_proteins.py -m DB/fasta_orth/cauris_orth_fastaconcat.tsv -f DB/fasta_uniprot/cauris_uniprotfasta_justID_20250509.fasta -o DB/fasta_orth/cauris_fasta_orth_concat.fasta

python concat_ortho_proteins.py -m DB/fasta_orth/calbicans_orth_fastaconcat.tsv -f DB/fasta_uniprot/calbicans_uniprotfasta_justID_20250509.fasta -o DB/fasta_orth/calbicans_fasta_orth_concat.fasta
