## Notebook Sample


In [2]:
%%pyspark
# Read the raw campaign analytics CSV from the data lake, treating the first
# row as the header. Despite its name, `data_path` holds the loaded Spark
# DataFrame (not a path string); later cells reference it under this name.
data_path = (
    spark.read
    .format('csv')
    .option('header', True)
    .load('abfss://hpi@ag83stdev00006.dfs.core.windows.net/Campaign_Analytics/hpi.Campaign_Analytics.csv')
)

# Preview the first 10 rows.
display(data_path.limit(10))

## Load into Pandas and Perform Cleansing Operations


In [3]:
%%pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *

import numpy as np

# Collect the Spark DataFrame loaded earlier into a pandas DataFrame.
# NOTE(review): toPandas() materialises every row on the driver — assumes the
# campaign file is small enough for that; confirm before reusing on larger data.
pd_df = data_path.toPandas()

# 1. Remove '$' and ',' from the currency columns and convert them to float.
#    The pattern is a raw string so '\$' reaches the regex engine unchanged
#    instead of being parsed as an (invalid) Python string escape.
# 2. Replace null values with 0. The filled column is assigned back to the frame:
#    calling fillna(inplace=True) on a column selection mutates a temporary and
#    does not reliably update pd_df (it is a no-op under pandas Copy-on-Write).
pd_df['Revenue'] = (
    pd_df['Revenue']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
    .fillna(0)
)
pd_df['Revenue_Target'] = (
    pd_df['Revenue_Target']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
    .fillna(0)
)

# 3. Convert the text columns to Title Case (first letter of each word upper-cased)
pd_df['Region'] = pd_df['Region'].str.title()
pd_df['Country'] = pd_df['Country'].str.title()
pd_df['Campaign_Name'] = pd_df['Campaign_Name'].str.title()

## Data Transformation - Calculate Revenue Variance


In [4]:
# Add a column holding the gap between target and actual revenue
# (Revenue_Target minus Revenue).
pd_df['Revenue_Variance'] = pd_df['Revenue_Target'].sub(pd_df['Revenue'])

# Print the rows at positions 1 through 4 as a quick check of the new column.
print(pd_df.iloc[1:5])

## Move data to Azure Data Lake Gen2


In [18]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [5]:
%%pyspark
# Convert the cleansed pandas DataFrame back into a Spark DataFrame and preview it.
df = spark.createDataFrame(pd_df)
df.show(5)

# Write the result to the data lake as CSV with a header row, replacing any
# previous output at the same location. coalesce(1) reduces the data to a single
# partition before writing. The built-in "csv" source is used instead of the
# legacy "com.databricks.spark.csv" package name.
(
    df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .format("csv")
    .save('abfss://hpi@ag83stdev00006.dfs.core.windows.net/Campaign_Analytics_Result/hpi.Campaign_Analytics.csv')
)

### Retrieve Result from Azure Data Lake Gen2

In [7]:
%%pyspark
# Read the CSV result back from the data lake, treating the first row as the
# header, to confirm what was written.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .load('abfss://hpi@ag83stdev00006.dfs.core.windows.net/Campaign_Analytics_Result/hpi.Campaign_Analytics.csv')
)

# Preview the first 10 rows.
display(df.limit(10))