In [1]:
from azure.storage.blob import BlockBlobService
from pyspark            import SparkConf,    SparkContext
from pyspark.sql        import SparkSession, SQLContext
from pyspark.sql.types  import *

storageAccountName = "dvbatch"
storageKey         = "d/xdWGVlp4DYi7JCvjEuYW/OaeSBEupMpG/5SlyE7CheMA0s1rHAByjxQ3zSemgvCI70BcyDDpT5s9K1BVMO3g=="
containerName      = "output"
file               = "fixed-width-1.txt"

# Establish connection with the blob storage account
blobService = BlockBlobService(account_name=storageAccountName,
                               account_key =storageKey
                               )

In [2]:
# Create spark session
spark = SparkSession.builder.master("local").appName("fixed-width"                          )\
                                            .config("spark.some.config.option", "some-value")\
                                            .getOrCreate()

In [23]:
# Read in fixed-width text file into DataFrame
from pyspark.sql import functions as f

df     = spark.read.text(file)

# Remove header
header = df.first()[0]
df     = df.filter(~f.col("value").contains(header))
df.show(5,False)

+----------------------------------------------------+
|value                                               |
+----------------------------------------------------+
| 16524  01  3930621977  TXNPUES                     |
|191675  01  2368183100  OUNHQEX                     |
|191667  01  3714468136  GHAKASC                     |
|191673  01  2632703881  PAHFSAP                     |
| 80495  01  2766389794  XDZANTV                     |
+----------------------------------------------------+
only showing top 5 rows



In [24]:
# Take the fixed width file and split into distinct columns
sorted_df = df.select(
    df.value.substr( 1,  6).alias('Entry'      ),
    df.value.substr( 8,  3).alias('Per'        ),
    df.value.substr(12, 11).alias('Account'    ),
    df.value.substr(24, 11).alias('Description'),
)
sorted_df.show()
sorted_df.printSchema()

+------+---+-----------+-----------+
| Entry|Per|    Account|Description|
+------+---+-----------+-----------+
| 16524| 01| 3930621977| TXNPUES   |
|191675| 01| 2368183100| OUNHQEX   |
|191667| 01| 3714468136| GHAKASC   |
|191673| 01| 2632703881| PAHFSAP   |
| 80495| 01| 2766389794| XDZANTV   |
| 80507| 01| 4609266335| BWWYEZL   |
| 80509| 01| 1092717420| QJYPKVO   |
| 80497| 01| 3386366766| SOQLCMU   |
|191669| 01| 5905893739| FYIWNKA   |
|191671| 01| 2749355876|   CBMJTLP |
+------+---+-----------+-----------+

root
 |-- Entry: string (nullable = true)
 |-- Per: string (nullable = true)
 |-- Account: string (nullable = true)
 |-- Description: string (nullable = true)



In [25]:
from pyspark.sql import functions as f

cast = [f.col('Entry')      .cast('long'  ),
        f.col('Per'  )      .cast('int'   ),
        f.col('Account')    .cast('long'  ),
        f.col('Description').cast('string')
       ]              

In [26]:
sorted_df = sorted_df.select(cast)
sorted_df.show()
sorted_df.printSchema()

+------+----+-------+-----------+
| Entry| Per|Account|Description|
+------+----+-------+-----------+
|  null|null|   null| TXNPUES   |
|191675|null|   null| OUNHQEX   |
|191667|null|   null| GHAKASC   |
|191673|null|   null| PAHFSAP   |
|  null|null|   null| XDZANTV   |
|  null|null|   null| BWWYEZL   |
|  null|null|   null| QJYPKVO   |
|  null|null|   null| SOQLCMU   |
|191669|null|   null| FYIWNKA   |
|191671|null|   null|   CBMJTLP |
+------+----+-------+-----------+

root
 |-- Entry: long (nullable = true)
 |-- Per: integer (nullable = true)
 |-- Account: long (nullable = true)
 |-- Description: string (nullable = true)



In [8]:
"""
Time stamp to file path to prevent saving over orignial file.
"""
def createTimeStamp(): 
    from datetime import datetime

    # datetime object containing current date and time
    now = datetime.now()

    # /dd-mm-YY_H:M
    dt_string = now.strftime("/%d-%m-%Y_%H-%M")    

    return dt_string

In [9]:
# Create an output folder with a timestamp to prevent overwriting files
output_dir   = "output" + createTimeStamp() 
print(output_dir)

output/13-05-2020_19-36


In [10]:
# Make directory and write files to it
import os
from   os import path

try:
    sorted_df.write.parquet(output_dir)
    files_in_dir = output_dir +"/*"
    
except FileExistsError:
    print("Path exists -- skipping")
    print(output_dir)
    pass

AnalysisException: 'Found duplicate column(s) when inserting into file:/home/nbuser/library/output/13-05-2020_19-36: `entry`;'

In [None]:
# Print files we just saved with Spark
import glob
print(glob.glob(files_in_dir))

In [None]:
# Write/upload files to blob storage
for file in glob.glob(files_in_dir):
    print(file)
    blobService.create_blob_from_path(containerName, file, file)