# Initialization

In [1]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3,application_1606413863529_0004,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Set up a single GlueContext
# Reuse the SparkContext that is already running (or create one if none exists).
sc = SparkContext.getOrCreate()

# Wrap the SparkContext in a GlueContext; used below to read from the catalog and to create the Job.
glueContext = GlueContext(sc)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from pyspark.sql.types import StringType
import boto3
import csv
import codecs

# S3 client used below to fetch freq_df.csv and dict_class.csv via get_object.
client = boto3.client("s3")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Processing Job

In [4]:
# Initialize job
# Create the Glue Job object on top of the shared GlueContext and initialise it
# under the name 'job'; it is committed with job.commit() at the end of the notebook.
job = Job(glueContext)
job.init('job')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Name of database
# Passed as `database` to create_dynamic_frame.from_catalog; it is an empty
# placeholder here and must be filled in before the load cell is run.
database = ""

# Subset of data (e.g., train, validation, or test)
# Used both as the catalog table name and as the last segment of the output S3 path.
data_subset = "train"

In [None]:
# Read the catalog table for the selected subset as a Glue DynamicFrame,
# then convert it to a Spark DataFrame for the transformations below.
row_data = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name=data_subset,
)
row_data_df = row_data.toDF()

In [5]:
# Print the first 10 rows to inspect the loaded table.
row_data_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+--------------------+----------------+--------------------+--------------------+
|        col0|                col1|            col2|                col3|                col4|
+------------+--------------------+----------------+--------------------+--------------------+
|   family_id|       sequence_name|family_accession|    aligned_sequence|            sequence|
| GMC_oxred_C|A4WZS5_RHOS5/416-539|      PF05199.13|PHPE.SRIRLST.RRDA...|PHPESRIRLSTRRDAHG...|
|     DUF2887|  K9QI92_9NOSO/3-203|       PF11103.8|RDSIYYQIFKRFPALIF...|RDSIYYQIFKRFPALIF...|
|     zf-IS66|  Q92LC9_RHIME/32-75|       PF13005.7|.TCCPDCGG.E..LRLV...|TCCPDCGGELRLVGEDA...|
|Asp_decarbox|  X2GQZ4_9BACI/1-115|      PF02261.16|MLRMMMNSKIHRATVTE...|MLRMMMNSKIHRATVTE...|
|     Filamin|A7SQM3_NEMVE/342-439|      PF00630.19|TACPKQ.CTA....RGL...|TACPKQCTARGLGLKAA...|
|     DUF4131| Q6N5R7_RHOPA/70-226|       PF13567.6|DHEPVAWVAAATAAGFV...|DHEPVAWVAAATAAGFV...|
|       GGACT|A0A086WQ80_9VIBR/...|      PF06094.1

## Read Files
Read the feature-frequency file (`freq_df.csv`) and the class-dictionary file (`dict_class.csv`) from the S3 bucket.

In [None]:
# S3 bucket from which freq_df.csv and dict_class.csv are fetched below;
# it is an empty placeholder here and must be set before running the next cells.
bucket = ""

In [6]:
# Fetch freq_df.csv from S3 and decode the streamed body as UTF-8 CSV.
data = client.get_object(Bucket=bucket, Key="freq_df.csv")
freq_rows = csv.DictReader(codecs.getreader("utf-8")(data["Body"]))

# Map each value of the 'feature' column to the zero-based position of its row
# in the file; if a feature is repeated, the last occurrence wins.
freq_dic = {freq_row['feature']: position for position, freq_row in enumerate(freq_rows)}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Fetch dict_class.csv from S3; its body is decoded as UTF-8 CSV below.
classes = client.get_object(Bucket=bucket, Key="dict_class.csv")

# The class mapping is the LAST data row of dict_class.csv, read as a dict keyed by
# the CSV header (family_accession_to_id looks accessions up in it).
# Bind the name explicitly first so an empty file is detected here instead of
# surfacing later as a NameError inside the UDF.
# NOTE(review): presumably the file holds a single wide row — confirm.
dict_class = None
for dict_class in csv.DictReader(codecs.getreader("utf-8")(classes["Body"])):
    pass

if dict_class is None:
    raise ValueError("dict_class.csv contains no data rows; cannot build the class mapping")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Methods

In [8]:
def family_accession_to_id(family_accession):
    """
    Look up a family accession in `dict_class` and return its index as an int.

    Raises KeyError if the accession is not a key of `dict_class`.
    """
    class_index = dict_class[family_accession]
    return int(class_index)

# Spark UDF wrapper around family_accession_to_id; the declared return type is
# StringType, so the resulting column is a string column.
family_accession_to_id_f = udf(family_accession_to_id, StringType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
def sequence_to_ID(sequence):
    """
    Convert every letter of a sequence to its ID.

    Inputs:
        1. A sequence of letters.

    Outputs:
        1. A list of IDs, one per letter, in the original order.
    """
    return [char_to_ID(letter) for letter in sequence]

def char_to_ID(char):
    """
    Return the index of a letter in `freq_dic`, shifted by one.

    The letter is lower-cased before the lookup; a letter missing from
    `freq_dic` raises KeyError.
    """
    lowered = char.lower()
    return freq_dic[lowered] + 1

# Spark UDF wrapper around sequence_to_ID; the declared return type is
# StringType, so the list of IDs ends up in a string column.
sequence_to_ID_f = udf(sequence_to_ID, StringType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Processing

In [None]:
# Keep only the family accession (col2) and sequence (col4) columns.
extracted_df = row_data_df.drop('col0', 'col1', 'col3')

# The CSV header line was loaded as an ordinary data row (see the preview of
# row_data_df), so read it back from the first row. Fetch that row ONCE: each
# first() call launches its own Spark job.
# NOTE(review): this assumes first() returns the header row — confirm row order is stable.
header_row = extracted_df.first()
if header_row is None:
    # first() returns None on an empty DataFrame; fail with a clear message
    # instead of a TypeError from subscripting None.
    raise ValueError("Input table is empty: there is no header row to remove")
family_accession_header, sequence_header = header_row[0], header_row[1]

# Drop every row whose col2 / col4 value contains the corresponding header text.
extracted_df = extracted_df.filter(~col("col2").contains(family_accession_header))
extracted_df = extracted_df.filter(~col("col4").contains(sequence_header))

In [10]:
# Print the first 10 rows to check that the header row is gone and only col2/col4 remain.
extracted_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+--------------------+
|      col2|                col4|
+----------+--------------------+
|PF05199.13|PHPESRIRLSTRRDAHG...|
| PF11103.8|RDSIYYQIFKRFPALIF...|
| PF13005.7|TCCPDCGGELRLVGEDA...|
|PF02261.16|MLRMMMNSKIHRATVTE...|
|PF00630.19|TACPKQCTARGLGLKAA...|
| PF13567.6|DHEPVAWVAAATAAGFV...|
|PF06094.12|LFVYGTLRQGESNHNFL...|
|PF06335.12|FTGQDFDVFRIEGLDQR...|
|PF06580.13|SEIKLLHAQVNPHFLFN...|
|PF01379.20|LRIATRKSPLALWQAEY...|
+----------+--------------------+
only showing top 10 rows

### Family Accession

In [11]:
# Translate each accession in col2 through the UDF into a new
# "family_accession" column, then discard the original col2.
accession_ids = family_accession_to_id_f(extracted_df["col2"])
extracted_df = extracted_df.withColumn("family_accession", accession_ids).drop("col2")
extracted_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+----------------+
|                col4|family_accession|
+--------------------+----------------+
|PHPESRIRLSTRRDAHG...|               0|
|RDSIYYQIFKRFPALIF...|               1|
|TCCPDCGGELRLVGEDA...|               2|
|MLRMMMNSKIHRATVTE...|               3|
|TACPKQCTARGLGLKAA...|               4|
|DHEPVAWVAAATAAGFV...|               5|
|LFVYGTLRQGESNHNFL...|               6|
|FTGQDFDVFRIEGLDQR...|               7|
|SEIKLLHAQVNPHFLFN...|               8|
|LRIATRKSPLALWQAEY...|               9|
+--------------------+----------------+
only showing top 10 rows

### Sequence

In [12]:
# Translate each sequence in col4 through the UDF into a new "sequence"
# column of letter IDs, then discard the original col4.
sequence_ids = sequence_to_ID_f(extracted_df["col4"])
extracted_df = extracted_df.withColumn("sequence", sequence_ids).drop("col4")
extracted_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------------+
|family_accession|            sequence|
+----------------+--------------------+
|               0|[12, 18, 12, 5, 6...|
|               1|[8, 9, 6, 7, 16, ...|
|               2|[11, 19, 19, 12, ...|
|               3|[17, 1, 8, 17, 17...|
|               4|[11, 2, 19, 12, 1...|
|               5|[9, 18, 5, 12, 3,...|
|               6|[1, 13, 3, 16, 4,...|
|               7|[13, 11, 4, 15, 9...|
|               8|[6, 5, 7, 10, 1, ...|
|               9|[1, 8, 7, 2, 11, ...|
+----------------+--------------------+
only showing top 10 rows

### Save 

In [None]:
# Destination bucket for the processed data (empty placeholder; set before saving).
bucket_processed = ""
# The output location is s3://<bucket_processed>/<data_subset>.
s3_path = "s3://{}/{}".format(bucket_processed, data_subset)

In [13]:
# Write the processed frame to s3_path as comma-separated CSV with a header
# row, replacing whatever already exists at that location.
(
    extracted_df.write
    .format('csv')
    .option('header', True)
    .option('sep', ',')
    .mode('overwrite')
    .save(s3_path)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Commit the Glue job that was initialised earlier with job.init('job').
job.commit()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…