# Initialization

In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame

In [2]:
# Set up a single GlueContext
# Reuse the SparkContext that is already running, or create one if none exists.
sc = SparkContext.getOrCreate()

# Wrap the SparkContext in a GlueContext; it is used below to create the Job
# and to read the source table from the catalog.
glueContext = GlueContext(sc)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from pyspark.sql.types import StringType
import boto3
import csv
import codecs

# S3 client used below to download freq_df.csv and dict_class.csv.
client = boto3.client("s3")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Processing Job

In [4]:
# Initialize job
# Create the Glue Job on the shared GlueContext and initialize it under the
# fixed name 'job'; it is committed in the last cell of the notebook.
job = Job(glueContext)
job.init('job')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Name of the database passed to from_catalog() when the table is loaded.
# Placeholder: set this before running.
database = ""

# Subset of data (e.g., train, validation, or test).
# Used both as the catalog table name and as the output prefix in the
# processed-data bucket.
data_subset = "train"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Read the catalog table as a DynamicFrame, then convert it to a Spark DataFrame
row_data = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name=data_subset,
)
row_data_df = row_data.toDF()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Preview the first 10 rows of the raw DataFrame.
row_data_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------+--------------------+----------------+--------------------+--------------------+
|          col0|                col1|            col2|                col3|                col4|
+--------------+--------------------+----------------+--------------------+--------------------+
|     family_id|       sequence_name|family_accession|    aligned_sequence|            sequence|
|  zf-Tim10_DDP|  N1QB11_PSEFD/15-76|      PF02953.15|..RMEKKQMKDFMNMYS...|RMEKKQMKDFMNMYSNL...|
| DNA_primase_S|A8XA78_CAEBR/105-345|      PF01896.19|FDID..LTDYDNIRNCC...|FDIDLTDYDNIRNCCKE...|
| Col_cuticle_N|   A8XBM5_CAEBR/9-56|      PF01484.17|ASAAILSGATIVGCLFF...|ASAAILSGATIVGCLFF...|
|       GST_C_3|W4XBU3_STRPU/120-207|       PF14497.6|KD..................|KDKLKESLPKTVNPILL...|
|Ada_Zn_binding|   E8U5K2_DEIML/9-73|      PF02805.16|DRWQAVVQRE...AAQ....|DRWQAVVQREAAQDGLF...|
|       Flg_new|M4RB64_9BIFI/796-861|      PF09479.10|YTLSFD.A...N..G.....|YTLSFDANGGSVSPGSK...|
|       DUF4360| A8NVM1_COPC7/

## Read Files
Read the feature-frequency file (`freq_df.csv`) and the class-dictionary file (`dict_class.csv`) from S3.

In [8]:
# Name of the S3 bucket that holds freq_df.csv and dict_class.csv.
# Placeholder: set this before running.
bucket = ""

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Feature -> position mapping and the position to assign to the next feature
index = 0
freq_dic = {}

# Fetch the feature-frequency CSV from S3; its body is parsed in the next cell
data = client.get_object(Bucket=bucket, Key="freq_df.csv")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Decode the S3 body as UTF-8 and give each 'feature' value the next running
# position, in the order the rows appear in the file
utf8_body = codecs.getreader("utf-8")(data["Body"])
for row in csv.DictReader(utf8_body):
    freq_dic[row['feature']] = index
    index += 1

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Fetch the class-dictionary CSV from S3; its body is parsed in the next cell.
classes = client.get_object(Bucket=bucket, Key="dict_class.csv")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Keep the last data row of dict_class.csv as `dict_class`;
# family_accession_to_id() looks family accessions up in it by column name.
last_class_row = None
for last_class_row in csv.DictReader(codecs.getreader("utf-8")(classes["Body"])):
    pass  # only the final row is needed

# Fail here with a clear message instead of leaving `dict_class` undefined
# (or stale from an earlier run) until the UDF executes on the cluster.
if last_class_row is None:
    raise ValueError("dict_class.csv contains no data rows")
dict_class = last_class_row

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Methods

In [13]:
def family_accession_to_id(family_accession):
    """
    Look up a family accession in `dict_class` and return its class index.

    Inputs:
        1. A family accession; it must be a key of `dict_class`.

    Outputs:
        1. The value stored for that accession, converted to int.

    Raises KeyError when the accession is not present in `dict_class`.
    """
    class_index = dict_class[family_accession]
    return int(class_index)

# Spark UDF wrapper around family_accession_to_id.
# NOTE: the declared return type is StringType even though the wrapped
# function returns an int.
family_accession_to_id_f = udf(
    f=lambda accession: family_accession_to_id(accession),
    returnType=StringType(),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
def sequence_to_ID(sequence):
    """
    Convert every letter of a sequence to its ID.

    Inputs:
        1. A sequence of letters.

    Outputs:
        1. A list with one ID per letter, in the original order
           (each ID is produced by char_to_ID).
    """
    return [char_to_ID(letter) for letter in sequence]

def char_to_ID(char):
    """
    Return the ID of a single letter: its index in `freq_dic` plus one.

    The letter is lower-cased before the lookup, so the lookup is
    case-insensitive. Raises KeyError when the lower-cased letter is not
    a key of `freq_dic`.
    """
    position = freq_dic[char.lower()]
    return position + 1

# Spark UDF wrapper around sequence_to_ID.
# NOTE: the declared return type is StringType even though the wrapped
# function returns a list of ints.
sequence_to_ID_f = udf(
    f=lambda letters: sequence_to_ID(letters),
    returnType=StringType(),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Processing

In [15]:
# Extract family_accession and sequence columns and drop other columns
extracted_df = row_data_df.drop('col0').drop('col1').drop('col3')

# Extract headers and remove them
family_accession_header, sequence_header = extracted_df.first()[0], extracted_df.first()[1]
extracted_df = extracted_df.filter(~col("col2").contains(family_accession_header))
extracted_df = extracted_df.filter(~col("col4").contains(sequence_header))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Preview the first 10 rows after the header row has been filtered out.
extracted_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+--------------------+
|      col2|                col4|
+----------+--------------------+
|PF02953.15|RMEKKQMKDFMNMYSNL...|
|PF01896.19|FDIDLTDYDNIRNCCKE...|
|PF01484.17|ASAAILSGATIVGCLFF...|
| PF14497.6|KDKLKESLPKTVNPILL...|
|PF02805.16|DRWQAVVQREAAQDGLF...|
|PF09479.10|YTLSFDANGGSVSPGSK...|
| PF14273.6|AELKLKSVGAIGTGCPP...|
|PF09186.11|INIDYDLLGKVQYVCGQ...|
|PF00447.17|FVNKLWNMLNDPINQDM...|
| PF12704.7|TFLTMLGIIIGTASVVC...|
+----------+--------------------+
only showing top 10 rows

### Family Accession

In [17]:
# Add the class ID computed from the raw accession (col2) as "family_accession",
# then drop col2
family_accession_ids = family_accession_to_id_f(extracted_df["col2"])
extracted_df = extracted_df.withColumn("family_accession", family_accession_ids).drop("col2")
extracted_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+----------------+
|                col4|family_accession|
+--------------------+----------------+
|RMEKKQMKDFMNMYSNL...|             645|
|FDIDLTDYDNIRNCCKE...|            9741|
|ASAAILSGATIVGCLFF...|              10|
|KDKLKESLPKTVNPILL...|            1859|
|DRWQAVVQREAAQDGLF...|            1241|
|YTLSFDANGGSVSPGSK...|            4035|
|AELKLKSVGAIGTGCPP...|            4239|
|INIDYDLLGKVQYVCGQ...|            5219|
|FVNKLWNMLNDPINQDM...|             181|
|TFLTMLGIIIGTASVVC...|            1380|
+--------------------+----------------+
only showing top 10 rows

### Sequence

In [18]:
# Add the letter IDs computed from the raw sequence (col4) as "sequence",
# then drop col4
sequence_ids = sequence_to_ID_f(extracted_df["col4"])
extracted_df = extracted_df.withColumn("sequence", sequence_ids).drop("col4")
extracted_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------------+
|family_accession|            sequence|
+----------------+--------------------+
|             645|[8, 17, 5, 10, 10...|
|            9741|[13, 9, 7, 9, 1, ...|
|              10|[2, 6, 2, 2, 7, 1...|
|            1859|[10, 9, 10, 1, 10...|
|            1241|[9, 8, 20, 15, 2,...|
|            4035|[16, 11, 1, 6, 13...|
|            4239|[2, 5, 1, 10, 1, ...|
|            5219|[7, 14, 7, 9, 16,...|
|             181|[13, 3, 14, 10, 1...|
|            1380|[11, 13, 1, 11, 1...|
+----------------+--------------------+
only showing top 10 rows

### Save 

In [19]:
# Bucket for processed data
bucket_processed = ""
s3_path = "s3://" + bucket_processed + "/" + data_subset

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
# Write the processed frame to S3 as comma-separated CSV with a header row,
# replacing any existing output at s3_path
(
    extracted_df.write
    .format('csv')
    .option('header', True)
    .mode('overwrite')
    .option('sep', ',')
    .save(s3_path)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Commit the Glue job that was initialized at the start of the processing section.
job.commit()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…