# In [None]:  (notebook cell prompt left over from the export)
import biotite.structure.io.pdbx as pdbx
import os
import glob
import csv
import pandas as pd
import numpy as np
from biotite.sequence import ProteinSequence, NucleotideSequence, GeneralSequence

# One-letter codes of the amino acids that are tallied per secondary-structure type.
amino_acids = list("HRKIFLWAMPCNVGSQYDET")

# Name of the CSV file the extraction loop writes (relative to the working directory).
output_csv = "protein_frame.csv"

# Column order of the output CSV; every key of a written row must appear here.
headers = [
    # file identification
    "FileName", "FilePath", "SpeciesSciName",
    # sequence information
    "FullSequence", "SequenceByChain", "NumOfChains", "ChainNames", "UniqueAminoAcids",
    # per-amino-acid counts by secondary-structure type
    "Helix_Count", "Beta_Count", "Coil_Count",
    # per-chain secondary-structure run counts
    "HelixByChain", "BetaByChain", "CoilByChain",
    # per-chain sequence-type flags
    "ProteinChain", "NucleaotideChain", "GeneralChain",
    # helix sub-sequences and the concatenated secondary-structure string
    "HelixSeq", "SSEARRAY",
]


# Switch the working directory so that the relative CSV output path and the
# relative '**/*.cif' glob pattern used further down resolve inside this folder.
# NOTE(review): this is a machine-specific absolute Windows path; the call raises
# if the folder does not exist, so adjust it before running elsewhere.
os.chdir(r"C:\Users\Darsh\Downloads\store-sales-time-series-forecasting")

def count_elements_by_type(ss_array, element_code):
    """Count the contiguous runs of ``element_code`` in ``ss_array``.

    A run is one or more consecutive items equal to ``element_code``; each run
    contributes 1 to the result regardless of its length.

    Args:
        ss_array: Iterable of secondary-structure codes.
        element_code: The code whose runs are counted.

    Returns:
        The number of separate runs of ``element_code``.
    """
    runs = 0
    previous_matched = False
    for symbol in ss_array:
        matched = symbol == element_code
        # A run starts where a matching item follows a non-matching one
        # (or sits at the very beginning).
        if matched and not previous_matched:
            runs += 1
        previous_matched = matched
    return runs
def extract_species_name(cif_path):
    """Return the scientific name stored on the gene-source tag line of a CIF file.

    The file is scanned line by line for the first line that starts with
    ``_entity_src_gen.pdbx_gene_src_scientific_name``. The text after the tag is
    returned with surrounding whitespace and surrounding quote characters
    (single or double) removed.

    Args:
        cif_path: Path of the CIF file to scan.

    Returns:
        The name found on the tag line, or ``"N/A"`` when the file has no such
        line or the first such line carries no value.
    """
    keyword = "_entity_src_gen.pdbx_gene_src_scientific_name"
    with open(cif_path, "r") as f:
        # Iterate lazily: only the first matching line is needed, so there is
        # no reason to load the whole file into memory.
        for line in f:
            if not line.startswith(keyword):
                continue
            # Values containing spaces are quoted in CIF; accept both quote
            # styles so names are not returned with the quotes attached.
            value = line[len(keyword):].strip().strip("\"'")
            # A tag line without an inline value (presumably the loop_ column
            # header form) yields nothing usable; report it like a missing tag.
            return value or "N/A"
    return "N/A"
def filter_sse(sequence, ss_elements, exclude_types=('b', 'c')):
    """Split each chain into runs of equal secondary-structure code and keep some.

    For every chain the residues are paired position by position with their
    secondary-structure codes, consecutive residues sharing the same code are
    grouped, and the residue string of each group is kept unless the group's
    code is listed in ``exclude_types``.

    Args:
        sequence: Mapping of chain ID to a sequence; ``str()`` of the value is
            used as the residue string.
        ss_elements: Mapping of chain ID to an indexable collection of
            secondary-structure codes, one per residue.
        exclude_types: Codes whose runs are dropped. With the default
            ``('b', 'c')`` only runs with other codes are returned.

    Returns:
        dict mapping every chain ID in ``sequence`` to the list of kept
        sub-sequences, in sequence order. A chain without an entry in
        ``ss_elements`` maps to an empty list.
    """
    from itertools import groupby

    result = {}
    for chain_id in sequence:
        seq = str(sequence[chain_id])
        sse_array = ss_elements.get(chain_id)
        if sse_array is None:
            # No annotation for this chain: nothing can be selected.
            result[chain_id] = []
            continue
        # zip() stops at the shorter of the two, so residues without a code
        # (or codes without a residue) are ignored instead of raising.
        paired = zip(sse_array, seq)
        result[chain_id] = [
            "".join(residue for _, residue in run)
            for sse_type, run in groupby(paired, key=lambda pair: pair[0])
            if sse_type not in exclude_types
        ]
    return result
# Extract per-file sequence and secondary-structure features from CIF files
# below the working directory and write one CSV row per successfully parsed file.

# Secondary-structure code -> name of the matching per-amino-acid counter.
sse_names = {"a": "helix", "b": "beta", "c": "coil"}

with open(output_csv, mode="w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    file_count = 0
    # recursive=True is required for '**' to descend through any number of
    # directories; without it '**' behaves like a single '*'.
    # Only the first five matches are processed.
    for cif_file_path in glob.glob('**/*.cif', recursive=True)[:5]:
        try:
            with open(cif_file_path, "r") as file:
                cif_file = pdbx.CIFFile.read(file)
            # Parse the sequences once and reuse them for every derived column.
            sequence_by_chain = pdbx.get_sequence(cif_file)
            ss_elements = pdbx.get_sse(cif_file)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Failed to parse {cif_file_path}: {e}")
            continue

        # Number of contiguous helix / beta / coil runs in every chain.
        Helix_chain = {
            chain: count_elements_by_type(ss_elements[chain], "a")
            for chain in sequence_by_chain
        }
        Beta_chain = {
            chain: count_elements_by_type(ss_elements[chain], "b")
            for chain in sequence_by_chain
        }
        Coil_chain = {
            chain: count_elements_by_type(ss_elements[chain], "c")
            for chain in sequence_by_chain
        }

        # How often each amino acid occurs inside each secondary-structure type.
        aa_counts = {acid: {"helix": 0, "beta": 0, "coil": 0} for acid in amino_acids}
        for chain_id, seq in sequence_by_chain.items():
            ss_arr = ss_elements.get(chain_id)
            if ss_arr is None:
                continue
            # zip() pairs residues with codes and ignores any unmatched tail.
            for residue, ss_type in zip(str(seq), ss_arr):
                bucket = sse_names.get(str(ss_type))
                if bucket is not None and residue in aa_counts:
                    aa_counts[residue][bucket] += 1

        file_name = os.path.splitext(os.path.basename(cif_file_path))[0]
        full_seq = "".join(str(seq) for seq in sequence_by_chain.values())
        unique_aas = sorted(set(full_seq))
        seq_by_chain_str = "; ".join(f"{chain}:{seq}" for chain, seq in sequence_by_chain.items())
        Chain_Names = "; ".join(sequence_by_chain)

        # Counts are listed in the order of `amino_acids`.
        helix_values = [aa_counts[acid]["helix"] for acid in amino_acids]
        beta_values = [aa_counts[acid]["beta"] for acid in amino_acids]
        coil_values = [aa_counts[acid]["coil"] for acid in amino_acids]

        # Per-chain flags telling which sequence class each chain was parsed as.
        protein_types = {
            chain_id: isinstance(chain_seq, ProteinSequence)
            for chain_id, chain_seq in sequence_by_chain.items()
        }
        nucelotide_types = {
            chain_id: isinstance(chain_seq, NucleotideSequence)
            for chain_id, chain_seq in sequence_by_chain.items()
        }
        General_types = {
            chain_id: isinstance(chain_seq, GeneralSequence)
            for chain_id, chain_seq in sequence_by_chain.items()
        }

        # With its default exclusions filter_sse drops the 'b' and 'c' runs.
        Helix_seq = filter_sse(sequence_by_chain, ss_elements)
        # All chains' codes concatenated into one string (only a/b/c are kept).
        ssarray = ''.join(
            s
            for arr in ss_elements.values()
            for s in arr
            if s in {'c', 'a', 'b'}
        )

        row = {
            "FileName": file_name,
            "FilePath": os.path.abspath(cif_file_path),
            "FullSequence": full_seq,
            "SequenceByChain": seq_by_chain_str,
            "NumOfChains": len(sequence_by_chain),
            "ChainNames": Chain_Names,
            "UniqueAminoAcids": "".join(unique_aas),
            "Helix_Count": helix_values,
            "Beta_Count": beta_values,
            "Coil_Count": coil_values,
            "HelixByChain": Helix_chain,
            "BetaByChain": Beta_chain,
            "CoilByChain": Coil_chain,
            "ProteinChain": protein_types,
            "NucleaotideChain": nucelotide_types,
            "GeneralChain": General_types,
            "SpeciesSciName": extract_species_name(cif_file_path),
            "HelixSeq": Helix_seq,
            "SSEARRAY": ssarray,
        }
        writer.writerow(row)
        file_count += 1
        if file_count % 1000 == 0:
            print(f"{file_count} files done")
import pandas as pd

# Load the CSV written by the extraction loop above. Point csv_path at another
# file to export a different CSV instead; a literal placeholder string here
# makes read_csv fail with FileNotFoundError.
csv_path = output_csv
df = pd.read_csv(csv_path)
# Preview the first rows; a bare `df.head()` shows nothing in the middle of a script.
print(df.head())

# Export the same table as an Excel workbook (without the index column).
excel_file = 'ProteinDF.xlsx'
df.to_excel(excel_file, index=False, engine='openpyxl')
# you can now use this to upload your own data on the dashboard

# Recorded notebook output:
# FileNotFoundError: [Errno 2] No such file or directory: 'path'