In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, StructType

In [2]:
# Connection settings for the MinIO (S3-compatible) store.
# They are read from the environment (MINIO_ACCESS_KEY, MINIO_SECRET_KEY,
# MINIO_ENDPOINT) so real secrets never have to be committed with the notebook.
# The fallbacks are the local test values, so an unconfigured environment
# behaves exactly as before.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "test-user")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "test-password")
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "minio:9000")

# Build (or reuse) a SparkSession whose Hadoop S3A connector points at MinIO.
spark = (
    SparkSession.builder
    .appName("SparkToMinio")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    # Path-style access is enabled and SSL is disabled for this endpoint.
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", False)
    # Pulls the hadoop-aws artifact that provides the S3A filesystem classes.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    # Use the static access/secret key pair configured above.
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .getOrCreate()
)

In [3]:
# Two nullable string columns, in this order: "name", then "age".
schema = StructType(
    [StructField(column, StringType(), True) for column in ("name", "age")]
)


In [4]:
# Sample people as name -> age (ages are text values).
# A dict keeps insertion order, so the rows come out in the order written here.
ages_by_name = {
    "John": "30",
    "Steve": "25",
    "Bill": "40",
    "Donald": "45",
    "Jenny": "23",
    "Lucas": "27",
    "Emma": "35",
    "Grace": "28",
    "Liam": "32",
    "Claire": "29",
}

# Rows as (name, age) tuples.
data = list(ages_by_name.items())

# Materialize the sample rows as a Spark DataFrame using the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)



In [11]:
# Target bucket and the s3a:// URI of the folder inside it.
bucket_name = "testing-spark"
file_path = "s3a://{}/my-folder".format(bucket_name)

In [12]:
# Write the DataFrame to file_path as CSV without a header row.
# coalesce(1) collapses the data to one partition first; "overwrite" replaces
# anything already stored at the destination.
single_partition_df = df.coalesce(1)
(
    single_partition_df.write
    .format("csv")
    .mode("overwrite")
    .option("header", "false")
    .save(file_path)
)

In [13]:
# Stop the Spark session now that the write has been issued.
spark.stop()

In [14]:
# Show the destination URI that was passed to the CSV writer.
print(file_path)

s3a://testing-spark/my-folder
