# Use `AWS Glue Studio` to load, join, and write DataFrames to S3
### Boilerplate: import the `Glue PySpark` libraries

In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
It looks like there is a newer version of the kernel available. The latest version is 0.31 and you have 0.30 installed.
Please run `pip install --upgrade aws-glue-sessions` to upgrade your kernel
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::498938378154:role/service-role/AWSGlueServiceRole-gg
Attempting to use existing AssumeRole session credentials.
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: 9b5ed5d3-39f5-460a-81f9-d8841e200c2f
Applying the following default arguments:
--glue_kernel_version 0.30
--enable-glue-datacatalog true
Waiting for session 9b5ed5d3-39f5-460a-81f9-d8841e2

### Build a glueContext

In [1]:
# Reuse the active SparkContext if there is one (otherwise start one) and
# wrap it in a GlueContext, which the cells below use to read from S3.
glueContext = GlueContext(
    SparkContext.getOrCreate()
)




Read the Parquet files from S3 into Glue DynamicFrames

In [2]:
def _read_parquet_frame(s3_path):
    """Load the Parquet data found under ``s3_path`` into a Glue DynamicFrame."""
    return glueContext.create_dynamic_frame_from_options(
        connection_type="parquet",
        connection_options={"paths": [s3_path]},
    )


# One DynamicFrame per feature table; each is read from its own S3 prefix.
up_features = _read_parquet_frame("s3://imba-alan/features/up_features_db/")
prd_features = _read_parquet_frame("s3://imba-alan/features/prd_feature_db/")
user_features_1 = _read_parquet_frame("s3://imba-alan/features/user_features_1_db/")
user_features_2 = _read_parquet_frame("s3://imba-alan/features/user_features_2_db/")




### Join the four tables (dataframes)

Join the user features first

In [3]:
# Both user tables use `user_id` as their key name, so the left-hand key is
# renamed before the join and the temporary copy is dropped afterwards,
# leaving a single `user_id` column in the result.
user_features_1_keyed = user_features_1.rename_field('user_id', 'user_id_temp')
users = Join.apply(
    user_features_1_keyed,
    user_features_2,
    'user_id_temp',
    'user_id',
).drop_fields(['user_id_temp'])




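Then join everything together: the combined user features with `up_features` on `user_id`, and that result with `prd_features` on `product_id`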

In [4]:
# Step 1: attach the `up_features` rows to the combined user features on
# `user_id`. The left key is renamed first so the duplicate key column can be
# dropped after the join.
users_keyed = users.rename_field('user_id', 'user_id_temp')
users_with_up = Join.apply(
    users_keyed,
    up_features,
    'user_id_temp',
    'user_id',
).drop_fields(['user_id_temp'])

# Step 2: attach the product features on `product_id`, using the same
# rename-then-drop pattern (this time the right-hand key is the renamed one).
prd_features_keyed = prd_features.rename_field('product_id', 'product_id_temp')
features = Join.apply(
    users_with_up,
    prd_features_keyed,
    'product_id',
    'product_id_temp',
).drop_fields(['product_id_temp'])




In [6]:
# Sanity check on the joined frame: report how many rows it has, then show
# the resulting column names and types.
row_total = features.count()
print(f'Total rows: {row_total}')
features.printSchema()

Total rows: 13307953
root
|-- product_id: long
|-- up_orders: int
|-- user_mean_days_since_prior: double
|-- user_period: double
|-- user_distinct_products: int
|-- prod_second_orders: int
|-- prod_reorders: int
|-- user_reorder_ratio: double
|-- user_total_products: int
|-- up_average_cart_position: double
|-- up_first_order: int
|-- order_number_reordered_sum: double
|-- user_orders: int
|-- prod_orders: int
|-- up_last_order: int
|-- reordered_sum: double
|-- prod_first_orders: int
|-- user_id: long


In [7]:
# Convert the DynamicFrame to a Spark DataFrame and collapse it to a single
# partition before writing it out as CSV.
features_df = features.toDF()
features_s = features_df.repartition(1)

# NOTE(review): no header option and no save mode are passed here, so Spark's
# defaults apply (presumably no header row, and an error if the target path
# already exists, which would make re-running this cell fail) - confirm that
# this is what the downstream consumer expects.
features_s.write.csv(path='s3://imba-alan/features/feature_in_one/')


