# Create a Database

In [0]:
import re

# Look up the account running this notebook. first()[0] reads the single
# returned value without depending on the name Spark gives that column.
userName = spark.sql("SELECT CURRENT_USER").first()[0]

# Split on the first "@" only. partition() never raises, so an identity
# without a domain part no longer fails with IndexError.
userName0, _, userName1 = userName.partition("@")

# Collapse runs of e-mail punctuation into "_" so the local part can be used
# in a database name. Raw string with an escaped "-": this is the same
# character set the previous pattern matched, where "+-/" was an accidental
# range covering "+", ",", "-", "." and "/".
userName0 = re.sub(r"[!#$%&'*+,\-./=?^`{}|]+", "_", userName0)
userName = f"{userName0}@{userName1}" if userName1 else userName0

# Per-user working directory; build the path once and reuse it.
userDir = f"/Users/{userName}/data"
dbutils.fs.mkdirs(userDir)
databaseName = f"{userName0}_FinalProject_01"

print('databaseName ' + databaseName)
print('UserDir ' + userDir)

# Rebuild the database from scratch: CASCADE also drops every table in it.
spark.sql(f"DROP DATABASE IF EXISTS {databaseName} CASCADE")
spark.sql(f"CREATE DATABASE {databaseName}")
spark.sql(f"use {databaseName}")

print (f"Database {databaseName} successfully rebuilt.")

# Data Load from landingzone path to bronze layer

In [0]:
# Folder that is listed for source files.
rootPath = "/mnt/g5/landingzone/"
all_items = dbutils.fs.ls(rootPath)

# Keep only plain files; directory entries are left out.
files_only = [entry for entry in all_items if not entry.isDir()]
files_only

In [0]:
# Load every landing-zone file as CSV and save it as a "bronze_<name>" table.
for file in files_only:
  # Strip only a trailing ".csv"; str.replace would also remove that text
  # from the middle of a file name.
  baseName = file.name[:-len('.csv')] if file.name.endswith('.csv') else file.name
  tableName = "bronze_" + baseName
  print (f"processing file {file.name} into table name {tableName}...")

  # First row is the header; column types are inferred from the data.
  loadDf = spark.read.option("header", True).option("inferSchema", True).csv(file.path)
  # Overwrite so re-running this cell does not fail with "table already exists"
  # (the writer's default mode raises when the table is present).
  loadDf.write.mode("overwrite").saveAsTable(tableName) #saves delta table
  
  print(f"Successfully saved delta table {tableName}.")
  print("")

# Exploratory Data Analysis

**Train Data Analysis:**
1) Whether promotions increase sales
2) Whether sales can be predicted from the oil price

In [0]:
# Read the two bronze tables used in this analysis: training data and oil prices.
train_df, oil_df = (
    spark.read.format("delta").table(table_name)
    for table_name in ("bronze_train", "bronze_oil_prices")
)

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
# Convert the Spark training DataFrame to a pandas DataFrame.
# NOTE(review): toPandas() collects every row onto the driver — confirm
# bronze_train is small enough to fit in driver memory.
pandas_df = train_df.toPandas()

In [0]:
# Pairwise relationship grid for the training data.
sns.pairplot(data=pandas_df)
plt.show()

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.functions import when


In [0]:

# Cast the "date" column of each input to the date type before joining.
df1 = oil_df.withColumn("date", oil_df["date"].cast("date"))
df2 = train_df.withColumn("date", train_df["date"].cast("date"))

# Inner join on "date": only dates present in both inputs are kept.
joined_df = df1.join(df2, "date", "inner")

# Print a preview of the joined rows.
joined_df.show()

In [0]:
# Distinct oil-price values present in the joined data.
unique_values = (
    joined_df
    .select("dcoilwtico")
    .distinct()
)

# Print a preview of the distinct values.
unique_values.show()

In [0]:
# Replace null oil prices with 0 and leave every other value untouched.
# NOTE(review): 0 is not a real price, so these rows will sit at x = 0 in the
# plots below — confirm this imputation is intended.
df = joined_df.withColumn(
    "dcoilwtico",
    when(col("dcoilwtico").isNull(), 0).otherwise(col("dcoilwtico")),
)


In [0]:
# Print a preview of the joined data after null oil prices were replaced with 0.
df.show()

In [0]:

# Pull the sales and oil-price columns to the driver as a list of Rows.
data_to_plot = df.select('sales', 'dcoilwtico').collect()

# Axis series: oil price on x, sales on y.
x = [record['dcoilwtico'] for record in data_to_plot]
y = [record['sales'] for record in data_to_plot]

# Size the current figure to 4 x 3 inches, then draw the scatter plot.
fig = plt.gcf()
fig.set_size_inches(4, 3)
plt.scatter(x, y)
plt.title('Sales VS Oil Price')
plt.xlabel('Oil Price')
plt.ylabel('Sales')
plt.show()

In [0]:
# describe() only builds a lazy summary DataFrame; call show() so the
# summary statistics table is actually printed.
df.describe().show()

In [0]:
# Pull the sales and onpromotion columns to the driver as a list of Rows.
data_to_plot = df.select('sales', 'onpromotion').collect()

# Axis series: onpromotion on x, sales on y.
x = [record['onpromotion'] for record in data_to_plot]
y = [record['sales'] for record in data_to_plot]

# Size the current figure to 4 x 3 inches, then draw the scatter plot.
fig = plt.gcf()
fig.set_size_inches(4, 3)
plt.scatter(x, y)
plt.title('Sales VS OnPromotion')
plt.xlabel('onpromotion')
plt.ylabel('sales')
plt.show()