### Step 1: Set the data location and type

There are two ways to access Azure Blob storage: account keys and shared access signatures (SAS).

To get started, we need to set the location and type of the file.

In [0]:
storage_account_name = "devassignment"
storage_account_access_key = "8YpOLSWDJegnOVlvZzuFpwShdoAPmZpc5Ws4PTz4w6R7sN4WCD+9JgNTs00YgQTxjNfmWVokZ5AE+ASthmNG3g=="

In [0]:
file_location = "wasbs://capstone@devassignment.blob.core.windows.net/data/Raw/Iris.csv"
file_type = "csv"

In [0]:
spark.conf.set(
  "fs.azure.account.key."+storage_account_name+".blob.core.windows.net",
  storage_account_access_key)

### Step 2: Read the data

Now that we have specified our file metadata, we can create a DataFrame. Notice that we use an *option* to specify that we want to infer the schema from the file. We can also explicitly set this to a particular schema if we have one already.

First, let's create a DataFrame in Python.

In [0]:
df = spark.read.format(file_type).option("inferSchema", "true").option("header", "true").load(file_location)

In [0]:
df.describe()

Out[35]: DataFrame[summary: string, Id: string, SepalLengthCm: string, SepalWidthCm: string, PetalLengthCm: string, PetalWidthCm: string, Species: string]

In [0]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



### Step 3: Data pre-processing

In [0]:
#Drop the rows with null values
data=df.na.drop()

In [0]:
data.show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [0]:
#Applying string indexer for the species column
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer

l=["Species"]
indexer = [
StringIndexer(inputCol=c, outputCol="{0}1".format(c))
for c in l
]

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import unix_timestamp

In [0]:
#Vector Assembler
va=VectorAssembler(inputCols=['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species1'],outputCol="features")

In [0]:
#Fit the vector assembler into the dataframe
pipeline = Pipeline(stages=indexer + [va])
df_tfm=pipeline.fit(data).transform(data)

In [0]:
df_tfm.show()

+---+-------------+------------+-------------+------------+-----------+--------+--------------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|Species1|            features|
+---+-------------+------------+-------------+------------+-----------+--------+--------------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|     0.0|[1.0,5.1,3.5,1.4,...|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|     0.0|[2.0,4.9,3.0,1.4,...|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|     0.0|[3.0,4.7,3.2,1.3,...|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|     0.0|[4.0,4.6,3.1,1.5,...|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|     0.0|[5.0,5.0,3.6,1.4,...|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|     0.0|[6.0,5.4,3.9,1.7,...|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|     0.0|[7

In [0]:
dbutils.fs.mkdirs("wasbs://capstone@devassignment.blob.core.windows.net/" + "data/processed/")

output_folder = "wasbs://capstone@devassignment.blob.core.windows.net/" + "data/processed/iris_processed.parquet"

df_tfm.repartition(1).write.format("parquet") \
    .mode("overwrite") \
    .save(output_folder)