In [1]:
from azure.storage.blob import BlockBlobService
from pyspark            import SparkConf,    SparkContext
from pyspark.sql        import SparkSession, SQLContext
from pyspark.sql.types  import *

storageAccountName = "dvbatch"
storageKey         = "d/xdWGVlp4DYi7JCvjEuYW/OaeSBEupMpG/5SlyE7CheMA0s1rHAByjxQ3zSemgvCI70BcyDDpT5s9K1BVMO3g=="
containerName      = "output"
file               = "fixed-width-2.txt"

# Establish connection with the blob storage account
blobService = BlockBlobService(account_name=storageAccountName,
                               account_key =storageKey
                               )

In [2]:
# Create spark session
spark = SparkSession.builder.master("local").appName("fixed-width"                          )\
                                            .config("spark.some.config.option", "some-value")\
                                            .getOrCreate()
# Read in fixed-width text file
df = spark.read.option("header"     , "false")\
               .option("inferSchema", "false")\
               .text(file                    )

In [3]:
# Generate a schema for each of the columns
schema = StructType([StructField('col1', IntegerType(), True),
                     StructField('col2', IntegerType(), True),
                     StructField('col3', IntegerType(), True)])

In [4]:
# Take the fixed width file and split into 3 distinct columns
sorted_df = df.select(
    df.value.substr(1, 4).alias('col1'),
    df.value.substr(5, 4).alias('col2'),
    df.value.substr(8, 4).alias('col3'),
)

In [6]:
# Cast each column to int
from pyspark.sql import functions as f

casting  = [(f.col(col_name).cast("int"))
             .name(col_name) 
               for col_name in sorted_df.columns]

sorted_df = sorted_df.select(casting)

In [10]:
# Show results
sorted_df.show()
sorted_df.printSchema()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
|1234|5678|8135|
+----+----+----+

root
 |-- col1: integer (nullable = true)
 |-- col2: integer (nullable = true)
 |-- col3: integer (nullable = true)



In [11]:
"""
Time stamp to file path to prevent saving over orignial file.
"""
def createTimeStamp(): 
    from datetime import datetime

    # datetime object containing current date and time
    now = datetime.now()

    # /dd-mm-YY_H:M
    dt_string = now.strftime("/%d-%m-%Y_%H-%M")    

    return dt_string

In [12]:
# Create an output folder with a timestamp to prevent overwriting files
output_dir   = "output" + createTimeStamp() 
print(output_dir)

output/13-05-2020_13-23


In [13]:
# Make directory and write files to it
import os
from   os import path

try:
    sorted_df.write.parquet(output_dir)
    files_in_dir = output_dir +"/*"
    
except FileExistsError:
    print("Path exists -- skipping")
    print(output_dir)
    pass

In [14]:
# Print files we just saved with Spark
import glob
print(glob.glob(files_in_dir))

['output/13-05-2020_13-23/part-00000-edbb798d-2306-4d6f-829c-46c61e1b2191-c000.snappy.parquet', 'output/13-05-2020_13-23/_SUCCESS']


In [15]:
# Write/upload files to blob storage
for file in glob.glob(files_in_dir):
    print(file)
    blobService.create_blob_from_path(containerName, file, file)

output/13-05-2020_13-23/part-00000-edbb798d-2306-4d6f-829c-46c61e1b2191-c000.snappy.parquet
output/13-05-2020_13-23/_SUCCESS
