**Notebook to clean up ECR data from the ECR List on SharePoint**

In [8]:
from pyspark.sql.functions import*
from pyspark.sql.types import StructType,StructField,StringType,ArrayType,IntegerType

# Read every column of the raw ECR list table and order the rows by the
# 'ECR_#' column, highest value first.
df = (
    spark.sql("SELECT * FROM sharepoint_ecr_list.ECR_List_Raw")
    .orderBy(col('ECR_#').desc())
)
#display(df)

StatementMeta(, 3626fdbf-447e-4f95-bd71-2822da4a3877, 10, Finished, Available, Finished)

JSON schema used to parse the `Products_Mech` column into project and subproject fields

In [9]:
# Schema for a JSON array of product entries. Each entry is a struct of three
# nullable string fields: '__id', 'Project' and 'Subproject'.
products_schema = ArrayType(
    StructType([
        StructField('__id', StringType(), True),
        StructField('Project', StringType(), True),
        StructField('Subproject', StringType(), True),
    ])
)

StatementMeta(, 3626fdbf-447e-4f95-bd71-2822da4a3877, 11, Finished, Available, Finished)

In [10]:
# Schema for a JSON array of "added product" entries. All fields are nullable;
# 'Qty' is an integer and the rest are strings.
# NOTE(review): no other cell visible in this notebook references added_schema;
# presumably it is meant for parsing Products_Added_Mech -- confirm before relying on it.
added_schema = ArrayType(
    StructType([
        StructField('__id', StringType(), True),
        StructField('Product', StringType(), True),
        StructField('Drawing', StringType(), True),
        StructField('Description', StringType(), True),
        StructField('Qty', IntegerType(), True),
        StructField('Prod', StringType(), True),
    ])
)

StatementMeta(, 3626fdbf-447e-4f95-bd71-2822da4a3877, 12, Finished, Available, Finished)

Selecting the required columns for data analysis

In [11]:
# Keep only the columns needed downstream.
#   * 'Title' is exposed as 'ECR#'.
#   * 'Products_Mech' (a JSON string) is parsed with products_schema and exploded,
#     so each element of the parsed array becomes its own output row.
#   * 'Products_Added_Mech' and 'ReasonforECR0' are renamed; the rest pass through.
# NOTE(review): explode() emits no row for a null or empty array, so records whose
# Products_Mech is missing or fails to parse are presumably dropped here -- confirm
# that this is intended.
newdf = df.select(
    col('Title').alias('ECR#'),
    'Urgency_Code_Mech',
    explode(from_json('Products_Mech', products_schema)).alias('Products_Mech'),
    col('Products_Added_Mech').alias('Products_Added'),
    'Drawing_Mech',
    'Stage_Mech',
    'Created',
    'Modified',
    'Project_Leader_Sign_Mech',
    'ECR_Completed_Sign_Mech',
    'Review_Project_Leader_Sign_Mech',
    'Docu_Control_Leader_Sign_Mech',
    'Docu_Control_Admin_Sign_Mech',
    'NCR#',
    'Total#ofFolders',
    col('ReasonforECR0').alias('Reason_For_ECR'),
)

#display(newdf)

StatementMeta(, 3626fdbf-447e-4f95-bd71-2822da4a3877, 13, Finished, Available, Finished)

Splitting the Products_Mech struct into separate Project and Subproject columns

In [12]:
# Copy the 'Project' and 'Subproject' fields of the Products_Mech struct into
# top-level columns. The '__id' field is not extracted (its line is commented out).
#newdf = newdf.withColumn('ID', col('Products_Mech.__id'))
newdf = (
    newdf
    .withColumn('Project', col('Products_Mech.Project'))
    .withColumn('Subproject', col('Products_Mech.Subproject'))
)
#display(newdf)

StatementMeta(, 3626fdbf-447e-4f95-bd71-2822da4a3877, 14, Finished, Available, Finished)

Formatting the Project and Subproject columns to identify all the projects and subprojects involved in the ECR

In [13]:
# Turn multi-valued Project / Subproject strings into one row per value.
#
# Project is split on any of '/', ',', '&', ';' and Subproject on '/' or ','.
# Each resulting piece is trimmed AFTER the split/explode: trimming only the
# whole string beforehand (as this cell originally did) leaves the spaces around
# the delimiters attached to the pieces, e.g. "A, B" -> "A" and " B".
# For Subproject, everything from the first " (" onward is removed as well; the
# piece is trimmed before that cut (so a piece starting with "(" after a
# delimiter-space is not emptied) and once more afterwards.
# NOTE(review): explode() emits no row for a null array, so rows whose Project or
# Subproject is null are presumably dropped here -- confirm that this is intended.
newdf = (
    newdf
    .withColumn('Project', explode(split(col('Project'), '[/,&;]')))
    # explode() is a generator and cannot be nested inside trim(), hence the extra step.
    .withColumn('Project', trim(col('Project')))
    .withColumn('Subproject', explode(split(col('Subproject'), '[/,]')))
    .withColumn('Subproject', trim(substring_index(trim(col('Subproject')), ' (', 1)))
)
#display(newdf)

StatementMeta(, 3626fdbf-447e-4f95-bd71-2822da4a3877, 15, Finished, Available, Finished)

In [14]:
# Final column set for the cleaned table: everything carried in newdf except the
# intermediate Products_Mech struct, followed by the derived Project and
# Subproject columns.
cleandf = newdf.select(
    'ECR#',
    'Urgency_Code_Mech',
    'Products_Added',
    'Drawing_Mech',
    'Stage_Mech',
    'Created',
    'Modified',
    'Project_Leader_Sign_Mech',
    'ECR_Completed_Sign_Mech',
    'Review_Project_Leader_Sign_Mech',
    'Docu_Control_Leader_Sign_Mech',
    'Docu_Control_Admin_Sign_Mech',
    'NCR#',
    'Total#ofFolders',
    'Reason_For_ECR',
    'Project',
    'Subproject',
)

StatementMeta(, 3626fdbf-447e-4f95-bd71-2822da4a3877, 16, Finished, Available, Finished)

In [15]:
# Destination of the cleaned table, written in Delta format.
delta_table_path = "Tables/ECR_List_Clean"

# Write in overwrite mode with the 'overwriteSchema' option enabled, so existing
# data at the path is replaced and a changed column set does not block the write.
(
    cleandf.write
    .format("delta")
    .mode("overwrite")
    .option('overwriteSchema', 'true')
    .save(delta_table_path)
)

StatementMeta(, 3626fdbf-447e-4f95-bd71-2822da4a3877, 17, Finished, Available, Finished)