In [19]:
import findspark
# /opt/manual/spark is the SPARK_HOME path of the local Spark installation.
# findspark.init() is called here, before the pyspark imports in the next cell.
findspark.init("/opt/manual/spark")

In [20]:
from pyspark.sql import SparkSession, functions as F
from pyspark import SparkConf, SparkContext

# Download Spark Hadoop AWS connectors

In [21]:
# Download the following jars to /opt/manual/spark/jars

# wget -P /opt/manual/spark/jars/ \
# https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar

# wget -P /opt/manual/spark/jars/ \
# https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.375/aws-java-sdk-bundle-1.11.375.jar

In [22]:
# Build (or reuse, via getOrCreate) a Spark session running against the
# "local[2]" master under the application name "Spark AWS S3".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Spark AWS S3")
    .getOrCreate()
)

In [23]:
# Display the version reported by the running Spark session.
spark.version

'3.1.1'

# Prepare AWS S3 Credentials

    cd ~
    sudo yum -y install unzip
    curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
    unzip awscliv2.zip
    sudo ./aws/install

    aws configure
    AWS Access Key ID [None]: <your key here>
    AWS Secret Access Key [None]: <your secret key here>
    Default region name [None]: eu-central-1
    Default output format [None]: json

# Read AWS S3 Credentials

In [24]:
import configparser
import os

# `aws configure` (see the preparation steps above) stores the keys in
# ~/.aws/credentials, so resolve the file relative to the current user's home
# directory instead of hard-coding one account's home.
credentials_path = os.path.expanduser('~/.aws/credentials')

# RawConfigParser does no '%' interpolation, so key values come back exactly
# as they are stored in the file.
config = configparser.RawConfigParser()

# config.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Without this check a missing file only shows up
# later as the confusing "NoSectionError: No section: 'default'".
if not config.read(credentials_path):
    raise FileNotFoundError(
        "AWS credentials file not found or unreadable: {}. "
        "Run `aws configure` first.".format(credentials_path)
    )

# Raises configparser.NoSectionError / NoOptionError when the 'default'
# profile or one of its keys is missing from the credentials file.
accessKeyId = config.get('default', 'aws_access_key_id')
secretAccessKey = config.get('default', 'aws_secret_access_key')

# S3 related Spark configs

In [25]:
def load_config(spark_context: SparkContext):
    """Apply the S3A connector settings to a Spark context.

    Writes the credentials held in the notebook-level ``accessKeyId`` and
    ``secretAccessKey`` variables, together with the path-style-access flag,
    the S3A filesystem implementation class and the S3 endpoint, into the
    Hadoop configuration of ``spark_context``.

    Args:
        spark_context: Spark context whose Hadoop configuration is updated.
    """
    hadoop_conf = spark_context._jsc.hadoopConfiguration()
    s3a_settings = (
        ('fs.s3a.access.key', accessKeyId),
        ('fs.s3a.secret.key', secretAccessKey),
        ('fs.s3a.path.style.access', 'true'),
        ('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
        ('fs.s3a.endpoint', 's3.amazonaws.com'),
    )
    # Same keys, values and order as before; only the lookup is shared.
    for setting_name, setting_value in s3a_settings:
        hadoop_conf.set(setting_name, setting_value)

In [26]:
# Push the S3A credentials and settings into the active session's Hadoop configuration.
load_config(spark.sparkContext)

# Get the sample data locally, upload it to S3, and read it from S3

In [27]:
# ! wget -P ~/datasets https://raw.githubusercontent.com/erkansirin78/datasets/master/simple_data.csv

In [28]:
# Upload it to the S3 bucket (vbo-de-input) from the AWS web console

In [29]:
# Read the uploaded CSV from the input bucket, treating the first line as the
# header and asking Spark to infer the column types.
df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv('s3a://vbo-de-input/simple_data.csv')
)

In [30]:
# Preview the first three rows of the frame read from S3.
df.show(3)

+------+-----+---+--------+--------+-----------+
|sirano| isim|yas|  meslek|   sehir|aylik_gelir|
+------+-----+---+--------+--------+-----------+
|     1|Cemal| 35|    Isci|  Ankara|       3500|
|     2|Ceyda| 42|   Memur| Kayseri|       4200|
|     3|Timur| 30|Müzisyen|Istanbul|       9000|
+------+-----+---+--------+--------+-----------+
only showing top 3 rows



In [31]:
# Add a "salary_dol" column computed as aylik_gelir / 13.5.
# NOTE(review): 13.5 looks like a hard-coded currency conversion rate —
# confirm the value and consider naming it as a constant.
df2 = df.withColumn(
    "salary_dol",
    F.col("aylik_gelir") / 13.5,
)

df2.show()

+------+--------+---+-----------+--------+-----------+------------------+
|sirano|    isim|yas|     meslek|   sehir|aylik_gelir|        salary_dol|
+------+--------+---+-----------+--------+-----------+------------------+
|     1|   Cemal| 35|       Isci|  Ankara|       3500|259.25925925925924|
|     2|   Ceyda| 42|      Memur| Kayseri|       4200| 311.1111111111111|
|     3|   Timur| 30|   Müzisyen|Istanbul|       9000| 666.6666666666666|
|     4|   Burcu| 29|Pazarlamaci|  Ankara|       4200| 311.1111111111111|
|     5| Yasemin| 23|       null|   Bursa|       4800|355.55555555555554|
|     6|     Ali| 33|      Memur|  Ankara|       4250|314.81481481481484|
|     7|   Dilek| 29|Pazarlamaci|Istanbul|       7300| 540.7407407407408|
|     8|   Murat| 31|   Müzisyen|Istanbul|      12000| 888.8888888888889|
|     9|   Ahmet| 33|     Doktor|  Ankara|      18000|1333.3333333333333|
|    10|Muhittin| 46|     Berber|Istanbul|      12000| 888.8888888888889|
|    11|Hicaziye| 47| Tuhafiyeci|  Ank

# Write to AWS S3

In [32]:
# Write the enriched frame to the output bucket as CSV with a header row,
# overwriting whatever is already stored at that path.
(
    df2.write
    .format('csv')
    .option('header', 'true')
    .mode('overwrite')
    .save('s3a://vbo-de-output/simple_data')
)

2022-10-24 22:26:33,799 WARN commit.AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
2022-10-24 22:26:35,136 WARN commit.AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

# Read from AWS S3

In [33]:
# Read back the CSV output that was just written, using the first line as the
# header. Note that inferSchema is not set for this read.
df_s3 = (
    spark.read
    .option('header', 'true')
    .csv('s3a://vbo-de-output/simple_data')
)

In [34]:
# Preview the first three rows of the data read back from the output bucket.
df_s3.show(3)

+------+-----+---+--------+--------+-----------+------------------+
|sirano| isim|yas|  meslek|   sehir|aylik_gelir|        salary_dol|
+------+-----+---+--------+--------+-----------+------------------+
|     1|Cemal| 35|    Isci|  Ankara|       3500|259.25925925925924|
|     2|Ceyda| 42|   Memur| Kayseri|       4200| 311.1111111111111|
|     3|Timur| 30|Müzisyen|Istanbul|       9000| 666.6666666666666|
+------+-----+---+--------+--------+-----------+------------------+
only showing top 3 rows



In [35]:
# Stop the Spark session now that the S3 round trip is finished.
spark.stop()

In [None]:
# Go to the S3 web UI and check the data