In [2]:
import pandas as pd
import re
from multiprocessing.pool import ThreadPool
from Bio import SeqIO, Entrez
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import molecular_weight as mol_weight_general

# open file and read it
with open('data.txt', 'r') as f:
    data = f.read()

# seperate data by tab and \n
data = re.split('\t|\n', data)

# create a dataframe from the data with 8 columns
df = pd.DataFrame([data[i:i+8] for i in range(0, len(data), 8)])

# rename the columns
columns = {
    0:"Interactor 1 Gene symbol",
    1:"Interactor 1 HPRD id",
    2:"Interactor 1 RefSeq id",
    3:"Interactor 2 Gene symbol",
    4:"Interactor 2 HPRD id",
    5:"Interactor 2 RefSeq id",
    6:"Experiment type (in vivo, in vitro and yeast 2-hybrid)",
    7:"Pubmed id",
}
df.rename(columns=columns, inplace=True)


# drop the last column
df.drop(df.columns[7], axis=1, inplace=True)

# save the dataframe to a csv file
df.to_csv('data.csv', index=False)

In [4]:
# setting up the email address for Entrez
Entrez.email = 'omaratyqy@gmail.com'

# read the database into a pandas dataframe
df = pd.read_csv('data.csv')

# create a list of gene symbols
gene_symbols = list(set(df['Interactor 1 Gene symbol'].tolist() + df['Interactor 2 Gene symbol'].tolist()))
print(gene_symbols)

# create an empty dictionary to store the sequences
sequences = {}

# retrieve the protein sequences in batches of 500
for i in range(0, len(gene_symbols), 500):
    gene_symbols_batch = gene_symbols[i:i+500]
    
    # convert the list of gene symbols to a search term for Entrez
    search_term = ' OR '.join(str(gene_symbol) for gene_symbol in gene_symbols_batch)

    # search for the gene symbols in the NCBI protein database
    handle = Entrez.esearch(db='protein', term=search_term, retmax=500)
    record = Entrez.read(handle)

    # retrieve the protein sequences for the matching gene symbols
    if record['IdList']:
        protein_ids = record['IdList']
        handle = Entrez.efetch(db='protein', id=protein_ids, rettype='fasta', retmode='text')
        seq_records = list(SeqIO.parse(handle, 'fasta'))

        # store the sequences in the dictionary
        for seq_record in seq_records:
            gene_symbol = seq_record.id.split('|')[0]
            sequences[gene_symbol] = str(seq_record.seq)

    # print the current length of the dictionary divided by the total number of gene symbols as a progress indicator
    progress = round(len(sequences) / len(gene_symbols) * 100, 2)
    print(f"Current progress: {progress}%")


# print the result
print(f"Total number of sequences: {len(sequences)}")

['KIR3DS1', 'MYL2', 'FAM86A', 'PDCD6IP', 'TSFM', 'CCNC', 'SCARB1', 'CTCF', 'LY6H', 'DPF1', 'ACTN3', 'CDH6', 'BRD2', 'SNAP91', 'HIST1H2AI', 'RARRES3', 'FHIT', 'FBF1', 'DEDD', 'TXNDC11', 'DOT1L', 'COL9A1', 'RFWD2', 'IRF2', 'CHFR', 'SLIT2', 'LAMC1', 'NUP210', 'GTF2H1', 'HTRA2', 'ISL1', 'SFRS2B', 'AXIN1', 'TYROBP', 'RWDD1', 'PPP2R5C', 'KAT2B', 'TRIM23', 'TAF6', 'F8', 'UBR4', 'DDR1', 'ILK', 'CHM', 'SNF8', 'OTX2', 'ARHGAP19', 'PLXND1', 'AK2', 'SNRPB2', 'CDCA3', 'DVL1', 'BAT5', 'SHB', 'FLT3', 'SAE1', 'GLI1', 'FAF1', 'PAFAH1B2', 'EVC2', 'WIPF3', 'DCP1A', 'CHRDL2', 'ARNTL2', 'CTBP1', 'MED17', 'SUPT4H1', 'ZNF227', 'GTF2E2', 'HIST1H2AE', 'FAM82A2', 'NPR3', 'SULT2B1', 'FST', 'XRCC6BP1', 'TPR', 'NLGN4X', 'VDAC2', 'STBD1', 'KIDINS220', 'NR4A2', 'ENTPD6', 'ACTR3', 'GCNT1', 'DNAJB4', 'CHRNE', 'ITGA5', 'SSFA2', 'CALCA', 'KRT2', 'PPP2R3A', 'DNAJB1', 'VPREB3', 'RAG2', 'SMPD4', 'HHLA3', 'ABRA', 'CCL8', 'F11R', 'TBP', 'CCL22', 'CHMP4B', 'ERBB2IP', 'VCL', 'GTPBP1', 'ATPAF2', 'SHC2', 'SET', 'PRDX1', 'CD2', '

KeyboardInterrupt: 

In [None]:
# write the sequences to a json file for later use
import json
with open('sequences.json', 'w') as f:
    json.dump(sequences, f)

In [None]:
protein_sequence = "MFALFAGSFILACGPKNTDEKQAVDQRSGRMIEDYGSWISTISAEDVYGPSDRIGVLQSVDEALYFSLSDSENEGKVGIKRLALKGELTEAVSAQFDVKTRVHEYGGAPLLGIGNSLFAVKRDDQRLYRFAPNQEPVALTPNGTRHADCVSYPKGSRLVCVREDHRQPGTPTSSLVSLNLNFKNEGQTLFSGQDFYASPRVSPDNSQLAWISWQHPNMPWDITQLWVADLTPAGDVGEPKQLLAGHSGSITQPLYSPSGELYFIADFDNWWNIYRITAEGTLEKVLQESAEFAVPDWMMGNHNYAFESEDTIIASYKSKGKTELVRIDLDSGLVSTIAAEFADVRQVIKGSDGVYFVGNKATPEKGIYRVKGRGVELVYAPQLPVVDPNYIARAQTISFTSAKCDTPVHGYYYGPRNPNYLSPSDQRPPLLLMMHGGPTASASLSFRRDIQFWTSRGFAVLDLNYRGSSGFGRDYRRSLYGLWGQADVEDAVQAANYLVERGWVDGKKLAIRGSSAGGFTVLSALAFYDTFSAGVVYSGISDLDSLDKLTHKFEQGYLDQLVGDLKPGSSVYRDRSPLYHLEQLTEPLLLIRGLDDPIVPPDQSLSIFYTLKSRGVPSALLSYEDEGHGLQKPANQIAALEAELSFYGQVFGFKPAGNIQTLTLDNSEHLPVMDPQVR"


# Create a ProteinAnalysis object
protein_analyzer = ProteinAnalysis(protein_sequence)

# Calculate the molecular weight
mol_weight = mol_weight_general(protein_sequence, seq_type='protein')

# Calculate the isoelectric point
pI = protein_analyzer.isoelectric_point()

# Calculate amino acid composition
aa_composition = protein_analyzer.count_amino_acids()

# Calculate secondary structure fraction
sec_structure_fraction = protein_analyzer.secondary_structure_fraction()

print("Molecular weight:", mol_weight)
print("Isoelectric point:", pI)
print("Amino acid composition:", aa_composition)
print("Secondary structure fraction:", sec_structure_fraction)


Molecular weight: 74800.0228000004
Isoelectric point: 5.177372932434081
Amino acid composition: {'A': 51, 'C': 4, 'D': 47, 'E': 34, 'F': 29, 'G': 60, 'H': 12, 'I': 29, 'K': 27, 'L': 68, 'M': 8, 'N': 23, 'P': 40, 'Q': 31, 'R': 35, 'S': 61, 'T': 30, 'V': 46, 'W': 11, 'Y': 32}
Secondary structure fraction: (0.31710914454277284, 0.2713864306784661, 0.2374631268436578)
