In [3]:
# import libaries
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,FloatType
import pyspark.sql.functions as f
import pandas as pd
import os

In [9]:
class PSTool:
    def __init__(self):
        print('Creating output folder')
        if os.path.exists('output'):
            pass
        else:
            os.makedirs('output')

    def pyspark_session(self, host_location):
        """
        Creates and returns spark session object
        """
        print('Starting session')
        sc = SparkContext(host_location)  # Create spark context
        spark = SparkSession(sc)  # Create session
        return spark

    def file_loader(self, path, delim, spark_obj, schema):
        print('Loading in file')
        data = spark_obj.read.options(delimiter=delim).option("header","False").csv(path, schema=schema)
        
        print('File loaded')
        return data

    def get_questions(self, df):
        pass

if __name__ == "__main__":
    pstool = PSTool()  # Instanciate object
    spk = pstool.pyspark_session('local[16]')  # start session
    # load data
#     path = '/data/dataprocessing/interproscan/all_bacilli.tsv'
    path = '/data/dataprocessing/interproscan/all_bacilli.tsv'
    schema = StructType([
        StructField("Protein_accession", StringType(), True),
        StructField("Sequence_MD5_digest", StringType(), True),
        StructField("Sequence_length", IntegerType(), True),
        StructField("Analysis", StringType(), True),
        StructField("Signature_accession", StringType(), True),
        StructField("Signature_description", StringType(), True),
        StructField("Start_location", IntegerType(), True),
        StructField("Stop_location", IntegerType(), True),
        StructField("Score", FloatType(), True),
        StructField("Status", StringType(), True),
        StructField("Date", StringType(), True),
        StructField("InterPro_annotations_accession", StringType(), True),
        StructField("InterPro_annotations_description", StringType(), True),
        StructField("GO_annotations", StringType(), True),
        StructField("Pathways_annotations", StringType(), True)])
    
    df = pstool.file_loader(path, '\t', spk, schema)
#     pstool.get_questions(df)
#     print('Closing spark session')
#     spk.sparkContext.stop()
    df.printSchema()  # Shows column names and some info

Creating output folder
Starting session
Loading in file
File loaded


### Data cleaning
The "function" of the protein which is the "class" your model should predict is defined as the InterPRO number which covers:
- ">" 90% of the protein's sequenc
- Covers the largest length of the sequence

As we are to train on the interpro number:
- Remove any rows without interpro number


In [11]:
# remove rows that do not have Interpro number
print(df.select('InterPro_annotations_accession').distinct().count())
IPRO_filt = df.filter(df["InterPro_annotations_accession"] != '-')
print(IPRO_filt.select('InterPro_annotations_accession').distinct().count())

# check amount of rows left. 
print('len:', len(df.columns))
print('count:' , df.count())
# check if the columns are propperly loaded in. 
df_sizes = IPRO_filt.withColumn('perc', abs(df.Start_location - df.Stop_location) / df.Sequence_length).sort('perc')
for i in df_sizes.columns:
    print(df_sizes.select(i).show(5))



len: 15


                                                                                

count: 4200591


                                                                                

+--------------------+
|   Protein_accession|
+--------------------+
|gi|510143242|gb|A...|
|gi|510143242|gb|A...|
|gi|510143242|gb|A...|
|gi|510143242|gb|A...|
|gi|510143242|gb|A...|
+--------------------+
only showing top 5 rows

None


                                                                                

+--------------------+
| Sequence_MD5_digest|
+--------------------+
|d6f8e49a4de47c68d...|
|d6f8e49a4de47c68d...|
|d6f8e49a4de47c68d...|
|d6f8e49a4de47c68d...|
|d6f8e49a4de47c68d...|
+--------------------+
only showing top 5 rows

None


                                                                                

+---------------+
|Sequence_length|
+---------------+
|           6359|
|           6359|
|           6359|
|           6359|
|           6359|
+---------------+
only showing top 5 rows

None


                                                                                

+---------------+
|       Analysis|
+---------------+
|ProSitePatterns|
|ProSitePatterns|
|ProSitePatterns|
|ProSitePatterns|
|ProSitePatterns|
+---------------+
only showing top 5 rows

None


                                                                                

+-------------------+
|Signature_accession|
+-------------------+
|            PS00455|
|            PS00455|
|            PS00455|
|            PS00455|
|            PS00455|
+-------------------+
only showing top 5 rows

None


                                                                                

+---------------------+
|Signature_description|
+---------------------+
| Putative AMP-bind...|
| Putative AMP-bind...|
| Putative AMP-bind...|
| Putative AMP-bind...|
| Putative AMP-bind...|
+---------------------+
only showing top 5 rows

None


                                                                                

+--------------+
|Start_location|
+--------------+
|           604|
|          5690|
|          4177|
|          1645|
|          3144|
+--------------+
only showing top 5 rows

None


                                                                                

+-------------+
|Stop_location|
+-------------+
|          615|
|         5701|
|         1656|
|         4188|
|         3155|
+-------------+
only showing top 5 rows

None


                                                                                

+-----+
|Score|
+-----+
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 5 rows

None


                                                                                

+------+
|Status|
+------+
|     T|
|     T|
|     T|
|     T|
|     T|
+------+
only showing top 5 rows

None


                                                                                

+----------+
|      Date|
+----------+
|25-04-2022|
|25-04-2022|
|25-04-2022|
|25-04-2022|
|25-04-2022|
+----------+
only showing top 5 rows

None


                                                                                

+------------------------------+
|InterPro_annotations_accession|
+------------------------------+
|                     IPR020845|
|                     IPR020845|
|                     IPR020845|
|                     IPR020845|
|                     IPR020845|
+------------------------------+
only showing top 5 rows

None


                                                                                

+--------------------------------+
|InterPro_annotations_description|
+--------------------------------+
|            AMP-binding, cons...|
|            AMP-binding, cons...|
|            AMP-binding, cons...|
|            AMP-binding, cons...|
|            AMP-binding, cons...|
+--------------------------------+
only showing top 5 rows

None


                                                                                

+--------------+
|GO_annotations|
+--------------+
|             -|
|             -|
|             -|
|             -|
|             -|
+--------------+
only showing top 5 rows

None


                                                                                

+--------------------+
|Pathways_annotations|
+--------------------+
|MetaCyc: PWY-1061...|
|MetaCyc: PWY-1061...|
|MetaCyc: PWY-1061...|
|MetaCyc: PWY-1061...|
|MetaCyc: PWY-1061...|
+--------------------+
only showing top 5 rows

None




+--------------------+
|                perc|
+--------------------+
|0.001729831734549...|
|0.001729831734549...|
|0.001729831734549...|
|0.001729831734549...|
|0.001729831734549...|
+--------------------+
only showing top 5 rows

None


                                                                                

### modeling

In [16]:
import sklearn



                                                                                

9703