In [5]:
import dask.dataframe as dd
import pandas as pd

# Read the InterPROscan output file into a Dask DataFrame
filename = '/data/dataprocessing/interproscan/all_bacilli.tsv'
df = dd.read_csv(filename, sep='\t', dtype=str, header=None, names=["0", "1", "2","3","4","5","6","7","8","9","10","11","12","13","14"])


In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,gi|29898682|gb|AAP11954.1|,92d1264e347e149248231cb9b649388c,547,TIGRFAM,TIGR03882,cyclo_dehyd_2: bacteriocin biosynthesis cyclod...,2,131,1.6e-21,T,25-04-2022,IPR022291,"Bacteriocin biosynthesis, cyclodehydratase domain",-,-
1,gi|29898682|gb|AAP11954.1|,92d1264e347e149248231cb9b649388c,547,TIGRFAM,TIGR03604,TOMM_cyclo_SagD: thiazole/oxazole-forming pept...,161,547,0.0,T,25-04-2022,IPR027624,"Thiazole/oxazole-forming peptide maturase, Sag...",-,-
2,gi|29898682|gb|AAP11954.1|,92d1264e347e149248231cb9b649388c,547,ProSiteProfiles,PS51664,YcaO domain profile.,159,547,75.396477,T,25-04-2022,IPR003776,YcaO-like domain,-,-
3,gi|29898682|gb|AAP11954.1|,92d1264e347e149248231cb9b649388c,547,Gene3D,G3DSA:3.30.160.660,-,311,452,9.099999999999999e-46,T,25-04-2022,-,-,,
4,gi|29898682|gb|AAP11954.1|,92d1264e347e149248231cb9b649388c,547,Gene3D,G3DSA:3.30.40.250,-,191,274,9.099999999999999e-46,T,25-04-2022,-,-,,


In [10]:

# 1. How many distinct protein annotations are found in the dataset?
distinct_annotations = df['11'].nunique().compute(num_worker=16)
print("Distinct protein annotations:", distinct_annotations)


Distinct protein annotations: 9704


In [12]:

# 2. How many annotations does a protein have on average?
average_annotations = df.groupby('1').size().mean().compute(num_worker=16)
print("Average annotations per protein:", average_annotations)



Average annotations per protein: 18.63260779886712


In [13]:
# 3. What is the most common GO Term found?
go_terms = df['0'].str.split('|').explode(num_worker=16)
most_common_go_term = go_terms.value_counts().nlargest(1).compute().index[0]
print("Most common GO Term:", most_common_go_term)


Most common GO Term: gi


In [15]:
# 4. What is the average size of an InterPRO feature found in the dataset?
df['FeatureSize'] = df['7'].astype(int) - df['6'].astype(int)
average_feature_size = df['FeatureSize'].mean().compute(num_worker=16)
print("Average size of InterPRO feature:", average_feature_size)


Average size of InterPRO feature: 131.73178916966685


5-if we choose column 11: I think cplumn 11 would be correct .

What are the top 10 most common InterPRO features?

top_10_interpro_features = df['11'].value_counts().nlargest(10).compute(num_worker=16)

print("Top 10 most common InterPRO features:")

print(top_10_interpro_features) the result would be:

Top 10 most common InterPRO features:

"-"                 885695

IPR027417     18306

IPR003439      6926

IPR036388      6293

IPR002347      6113

IPR036259      4678

IPR003593      4572

IPR000182      4425

IPR000515      4357

IPR036390      4338

Name: InterPRO, dtype: int64

In [16]:
# 5. What are the top 10 most common InterPRO features?
top_10_interpro_features = df['1'].value_counts().nlargest(10).compute(num_worker=16)
print("Top 10 most common InterPRO features:")
print(top_10_interpro_features)


Top 10 most common InterPRO features:
1
94ea27fb4472c582a7da936fe00ec0e6    754
1df9d779279633f438bfd41efb634b79    744
a5a05c1c3ba548d9d50936d7322c76a2    720
f041ced4ff981a17f11b868235c3ff07    676
193c4602f60c756b2c666286c2f57e29    667
cf1e8b5bebfc95161e321eb0aabaf5f3    646
f6c2ac3eac5236e1ed0b29843de44aea    620
e4218339be02b7550e1b08f34391ace1    600
61a3c4ca03785def2693a12e00d6f10e    595
4ed68ef068f96e5c52e13ffd4f6c4161    594
Name: count, dtype: int64


In [17]:
# 6. If you select InterPRO features that are almost the same size as the protein itself, what is the top 10 then?
protein_size = df['2'].astype(int)
similar_size_threshold = 0.9  # 90-100% similarity
similar_size_features = df[abs(df['FeatureSize'] - protein_size) / protein_size <= similar_size_threshold]
top_10_similar_size_features = similar_size_features['1'].value_counts().nlargest(10).compute(num_worker=16)
print("Top 10 most common InterPRO features with similar size:")
print(top_10_similar_size_features)

Top 10 most common InterPRO features with similar size:
1
cf1e8b5bebfc95161e321eb0aabaf5f3    544
1df9d779279633f438bfd41efb634b79    527
da80a6f8af08c7570750a95ef56c673e    510
f6c2ac3eac5236e1ed0b29843de44aea    496
122c52b97e2cada8071572a9d4cae8d9    480
b6963ad52c5360a9a725016ca58df5b7    476
ecf94cf431c6c2a45007e600117fa77d    448
f2c6294f06ff53f6066df6d707107931    442
78db5b49e42e410c45833186184aeff9    440
60c998a951d983add1bcb1e44e9f5f25    437
Name: count, dtype: int64


In [38]:
# 7. If you look at those features which also have textual annotation, what is the top 10 most common word found in that annotation?
from collections import Counter

text_annotations = df['3'] + ' ' + df['4'] + ' ' + df['5'] + df['11'] + df['12']
text_annotations = text_annotations.str.lower().str.replace(r'[^a-zA-Z0-9\s]', '').str.replace(r'\s+', ' ')

# Count the frequency of each word
word_counts = Counter(word for annotation in text_annotations for word in annotation.split())

# Get the top 10 most common words
top_10_words = word_counts.most_common(10)

# Print the results
for word, count in top_10_words:
    print(word, count)



the 1772071
protein 1615486
of 1392880
phobius 1323498
a 1303154
region 1293038
in 1140534
predicted 1135136
to 1134421
membrane-bound 1133013


In [35]:
# 8. What is the top 10 least common word found in that annotation?

top_10_least_common_words = word_counts.most_common()[:-11:-1]

for word, count in top_10_least_common_words:
    print(word, count)

rmtype1_s_sma198orf994p-trd2-cr2_like 1
cd17494 1
ipr019057 1
pf09553 1
anf00014 1
ipr040871 1
pf17914 1
(cmo5u34)-methyltransferase 1
tigr00740: 1
tigr00740 1


In [39]:
# 9. Combining answers for Q6 and Q7, what are the 10 most common words found for the largest InterPRO features?
from collections import Counter
# Select InterPRO features that are almost the same size as the protein itself
protein_size = df['2'].astype(int)
similar_size_threshold = 0.9  # 90-100% similarity
similar_size_features = df[abs(df['FeatureSize'] - protein_size) / protein_size <= similar_size_threshold]

# Get the textual annotation columns for the selected features
text_annotations = similar_size_features['3'] + ' ' + similar_size_features['4'] + ' ' + similar_size_features['5'] + similar_size_features['11'] + similar_size_features['12']
text_annotations = text_annotations.str.lower().str.replace(r'[^a-zA-Z0-9\s]', '').str.replace(r'\s+', ' ')

# Count the frequency of each word
word_counts = Counter(word for annotation in text_annotations for word in annotation.split())

# Get the top 10 most common words
top_10_words = word_counts.most_common(10)

# Print the results
for word, count in top_10_words:
    print(word, count)


domain 749332
superfamily 655420
protein 638130
panther 476527
pfam 443002
gene3d 423190
the 387517
of 350416
a 265586
phobius 264129


In [43]:
# What is the coefficient of correlation between the size of the protein and the number of features found?


# Calculate the coefficient of correlation between protein size and number of features
coefficient_of_correlation = df['2'].astype(int).corr(df['7'].astype(int) - df['6'].astype(int))

# Compute the result
coefficient_of_correlation_result = coefficient_of_correlation.compute(num_worker=16)

# Print the result
print("Coefficient of correlation:", coefficient_of_correlation_result)



We're assuming that the indices of each dataframes are 
 aligned. This assumption is not generally safe.


Coefficient of correlation: 0.17766055482398543


In [48]:
names=["protein_annotation", "protein_id", "protein_size", "feature_type", "feature_id", "feature_description", "start_position", "end_position", "e_value", "status", "date", "interpro_feature_id", "interpro_description", "textual_annotation_1", "textual_annotation_2"])



Index(['gi|29898682|gb|AAP11954.1|', '92d1264e347e149248231cb9b649388c',
       'TIGRFAM', 'TIGR03882',
       'cyclo_dehyd_2: bacteriocin biosynthesis cyclodehydratase domain', '2',
       '131', '1.6E-21', 'T', '25-04-2022', 'IPR022291',
       'Bacteriocin biosynthesis, cyclodehydratase domain', '-', '-.1',
       'FeatureSize'],
      dtype='object')
