In [0]:
from pyspark.sql.types import DoubleType

def basic_eda(df):
    """Print a basic exploratory report for a Spark DataFrame.

    Emits, in order: the schema, the first 5 rows, the row count, summary
    statistics for the ``DoubleType`` columns (if any), the number of null
    values per column, and the number of duplicated rows.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        None. Everything is emitted through ``print`` / ``display``.
    """
    # Imported locally so the function does not rely on a later notebook cell
    # having already brought `col` into the global namespace.
    from pyspark.sql.functions import col

    print("Schema:")
    df.printSchema()

    print("First 5 rows:")
    display(df.limit(5))

    row_count = df.count()
    print(f"Number of rows: {row_count}")

    # Select only columns with DoubleType for statistics
    double_cols = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, DoubleType)
    ]
    if double_cols:
        print("Summary statistics for double columns:")
        display(df.select(double_cols).describe())
    else:
        print("No double type columns found.")

    print("Null values in each column:")
    # Each column becomes a 0/1 null flag; the global sum is the null count.
    null_counts = df.select([
        col(c).isNull().cast("int").alias(c) for c in df.columns
    ]).groupBy().sum()
    display(null_counts)

    print("Duplicate rows count:")
    # Reuse row_count rather than triggering a second full count of df.
    duplicate_rows_count = row_count - df.dropDuplicates().count()
    print(duplicate_rows_count)

In [0]:
from pyspark.sql.functions import col, when, split

def fill_missing_marca(df):
    """Fill null ``marca`` values with the first word of ``nome_produto``.

    Args:
        df: Spark DataFrame containing the ``marca`` and ``nome_produto``
            columns.

    Returns:
        A new DataFrame in which ``marca`` keeps its value when it is not
        null and otherwise takes the part of ``nome_produto`` before the
        first space. All other columns are returned untouched.
    """
    # Built as a column expression instead of a temporary "first_word" column,
    # so an input column that happens to carry that name is not overwritten
    # and then dropped.
    first_word = split(col("nome_produto"), " ").getItem(0)

    return df.withColumn(
        "marca",
        when(col("marca").isNull(), first_word).otherwise(col("marca")),
    )

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Load the customers Delta table and run the basic EDA report on it
clientes_table = "workspace.default.clientes"
df_clientes = spark.read.format("delta").table(clientes_table)
basic_eda(df_clientes)

In [0]:
# Load the products Delta table and run the basic EDA report on it
produtos_table = "workspace.default.produtos"
df_produtos = spark.read.format("delta").table(produtos_table)
basic_eda(df_produtos)

In [0]:
# Load the sales Delta table and run the basic EDA report on it
vendas_table = "workspace.default.vendas"
df_vendas = spark.read.format("delta").table(vendas_table)
basic_eda(df_vendas)

In [0]:
# Reload the products table from Delta (so this cell can be re-run on its own)
# and fill null brands from the first word of the product name
produtos_table = "workspace.default.produtos"
df_produtos = fill_missing_marca(
    spark.read.format("delta").table(produtos_table)
)
display(df_produtos)

In [0]:
# Columns kept from the joined sales / customers / products data, in display order
joined_columns = [
    'id_venda', 'canal_venda', 'valor', 'data_venda',
    'id_produto', 'nome_produto', 'descricao', 'marca',
    'id_cliente', 'nome', 'email', 'telefone', 'cidade', 'estado', 'data_cadastro',
]

# Inner-join each sale to its customer and product, then keep the listed columns
df_joined = (
    df_vendas
    .join(df_clientes, "id_cliente", "inner")
    .join(df_produtos, "id_produto", "inner")
    .select(*joined_columns)
)

display(df_joined)

In [0]:
# Run the basic EDA report on the joined sales / customers / products frame
basic_eda(df_joined)

In [0]:
# Collect the customers table to pandas and list its float64 / int64 columns
pdf = df_clientes.toPandas()
numeric_part = pdf.select_dtypes(include=['float64', 'int64'])
num_cols = numeric_part.columns
print(num_cols)

In [0]:
def graphical_eda(df, num_cols=None, cat_cols=None):
    """Plot distributions, category counts and a correlation heatmap.

    The Spark DataFrame is converted with ``toPandas()`` before plotting, so
    all of its rows are collected to the driver.

    Args:
        df: Spark DataFrame to visualise.
        num_cols: Columns drawn as histograms with a KDE overlay.
            Defaults to ``['valor']``.
        cat_cols: Columns drawn as horizontal count plots ordered by
            frequency. Defaults to ``['canal_venda', 'marca', 'estado']``.

    Returns:
        None. Each figure is rendered with ``plt.show()``.
    """
    # Convert Spark DataFrame to Pandas DataFrame for plotting
    pdf = df.toPandas()

    if num_cols is None:
        num_cols = ['valor']
    if cat_cols is None:
        cat_cols = ['canal_venda', 'marca', 'estado']

    # Font size is loop-invariant, so set it once for every figure below.
    plt.rcParams.update({'font.size': 12})

    # Plot 1: Distribution of numerical columns
    # (loop variable is not called `col`, to avoid shadowing pyspark's `col`)
    for column in num_cols:
        plt.figure(figsize=(15, 10))
        sns.histplot(pdf[column].dropna(), kde=True)
        plt.title(f'Distribution of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.show()

    # Plot 2: Count plot for categorical columns
    for column in cat_cols:
        plt.figure(figsize=(15, 20))
        sns.countplot(y=pdf[column].dropna(), order=pdf[column].value_counts().index)
        plt.title(f'Count plot of {column}')
        plt.xlabel('Count')
        plt.ylabel(column)
        plt.show()

    # Plot 3: Correlation heatmap for numerical columns
    # Pick the numeric columns from the data itself. A fixed list of column
    # names (including text and date columns) makes `.corr()` fail on
    # non-numeric data and raises KeyError for any other DataFrame.
    numeric_pdf = pdf.select_dtypes(include='number')
    if numeric_pdf.shape[1] > 1:
        plt.figure(figsize=(10, 8))
        corr = numeric_pdf.corr()
        sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title('Correlation Heatmap')
        plt.show()

# Run the graphical EDA on the joined sales data
graphical_eda(
    df_joined,
    num_cols=['valor'],
    cat_cols=['canal_venda', 'marca', 'estado'],
)

In [0]:
from pyspark.sql.functions import count, sum, avg, col, round

# Overall totals, used as denominators for the percentage columns.
# NOTE: `sum` and `round` here are the pyspark.sql.functions versions imported
# at the top of this cell, not the Python builtins.
total_sum_valor = df_joined.agg(sum('valor')).collect()[0][0]
total_count = df_joined.count()

# Per-product sales count, total value and average value
prod_metrics = df_joined.groupBy('nome_produto').agg(
    count('id_venda').alias('count_vendas'),
    round(sum('valor'), 2).alias('sum_valor'),
    round(avg('valor'), 2).alias('avg_valor'),
)

# Add each product's share of the totals and sort by number of sales
df_prod_agg = prod_metrics.select(
    'nome_produto',
    'count_vendas',
    round((col('count_vendas') / total_count) * 100, 2).alias('percent_count_vendas'),
    'sum_valor',
    round((col('sum_valor') / total_sum_valor) * 100, 2).alias('percent_sum_valor'),
    'avg_valor',
).orderBy('count_vendas', ascending=False)

df_prod_agg.display()

In [0]:
from pyspark.sql.functions import sum, count, col, round, avg, countDistinct

# Overall totals, used as denominators for the percentage columns.
# The count uses the same count('id_venda') aggregate as the per-channel
# numerator below; a distinct count here would let the shares exceed 100%
# whenever the joins produce more than one row per sale. Both totals are
# computed in a single pass over df_joined.
# NOTE: `sum` and `round` are the pyspark.sql.functions versions imported at
# the top of this cell, not the Python builtins.
totals_row = df_joined.agg(
    count('id_venda').alias('total_count'),
    sum('valor').alias('total_sum_valor'),
).collect()[0]
total_count = totals_row['total_count']
total_sum_valor = totals_row['total_sum_valor']

# Per-channel sales count, total value and average value, plus each channel's
# share of the overall totals
df_canal_agg = df_joined.groupBy('canal_venda').agg(
    count('id_venda').alias('count_vendas'),
    round(sum('valor'), 2).alias('sum_valor'),
    round(avg('valor'), 2).alias('avg_valor')
).withColumn(
    'percent_count_vendas', round((col('count_vendas') / total_count) * 100, 2)
).withColumn(
    'percent_sum_valor', round((col('sum_valor') / total_sum_valor) * 100, 2)
)

df_canal_agg = df_canal_agg.select(
    'canal_venda', 'count_vendas', 'percent_count_vendas',
    'sum_valor', 'percent_sum_valor', 'avg_valor'
).orderBy('count_vendas', ascending=False)

df_canal_agg.display()

In [0]:
from pyspark.sql.functions import month, year, count, sum, avg, col, round

# Derive the calendar year and month of each sale from data_venda
df_seasonal = (
    df_joined
    .withColumn("ano", year(col("data_venda")))
    .withColumn("mes", month(col("data_venda")))
)

# Overall totals, used as denominators for the percentage columns
# (`sum` / `round` are the pyspark.sql.functions versions imported above)
total_sum_valor = df_seasonal.agg(sum('valor')).collect()[0][0]
total_count = df_seasonal.count()

# Sales count, total value and average value per (year, month)
monthly_metrics = df_seasonal.groupBy("ano", "mes").agg(
    count("id_venda").alias("count_vendas"),
    round(sum("valor"), 2).alias("sum_valor"),
    round(avg("valor"), 2).alias("avg_valor"),
)

# Add each month's share of the totals and sort chronologically
df_sazonal_agg = monthly_metrics.select(
    "ano",
    "mes",
    "count_vendas",
    round((col("count_vendas") / total_count) * 100, 2).alias("percent_count_vendas"),
    "sum_valor",
    round((col("sum_valor") / total_sum_valor) * 100, 2).alias("percent_sum_valor"),
    "avg_valor",
).orderBy("ano", "mes")

display(df_sazonal_agg)

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Collect the monthly aggregate to pandas so seaborn can plot it
pdf_sazonal_agg = df_sazonal_agg.toPandas()

# Grouped bar chart: one group per month, one bar per year
plt.figure(figsize=(15, 10))
ax = sns.barplot(data=pdf_sazonal_agg, x='mes', y='sum_valor', hue='ano')
ax.set_title('Monthly Sales Value by Year')
ax.set_xlabel('Month')
ax.set_ylabel('Total Sales Value')
ax.legend(title='Year')
ax.grid(True)
plt.show()

In [0]:
from pyspark.sql.functions import month, year, col, sum
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
from datetime import datetime

# Extract year and month from data_venda
df_seasonal = df_joined.withColumn("ano", year(col("data_venda"))) \
    .withColumn("mes", month(col("data_venda")))

# Aggregate by year and month.
# NOTE: this rebinds df_sazonal_agg to a frame with only ano, mes, sum_valor
# (unrounded), replacing the richer aggregate built in the earlier cell.
df_sazonal_agg = df_seasonal.groupBy("ano", "mes").agg(
    sum("valor").alias("sum_valor")
).orderBy("ano", "mes")

# Prepare features and label
assembler = VectorAssembler(
    inputCols=["ano", "mes"],
    outputCol="features"
)
df_features = assembler.transform(df_sazonal_agg).select("features", "sum_valor")

# Split the data into training and test sets (fixed seed for reproducibility)
train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)

# Create and train the linear regression model
lr = LinearRegression(featuresCol="features", labelCol="sum_valor")
lr_model = lr.fit(train_data)

# Make predictions on the test set
predictions = lr_model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(
    labelCol="sum_valor",
    predictionCol="prediction",
    metricName="rmse"
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

# Display predictions
display(predictions)

# Predict future sales for the next quarter.
# Read the clock once so year and month come from the same instant.
now = datetime.now()
current_year = now.year
current_month = now.month

# Roll months past December into the following year: from November the next
# three months are (Y, 12), (Y+1, 1), (Y+1, 2), not the invalid 12, 13, 14.
future_dates = []
for offset in range(1, 4):
    year_carry, month_index = divmod(current_month - 1 + offset, 12)
    future_dates.append((current_year + year_carry, month_index + 1))
future_df = pd.DataFrame(future_dates, columns=["ano", "mes"])

# Convert to Spark DataFrame using the existing Spark session
future_spark_df = spark.createDataFrame(future_df)

# Prepare features for prediction
future_features = assembler.transform(future_spark_df).select("features")

# Make predictions for the next quarter
future_predictions = lr_model.transform(future_features)

# Display future predictions
display(future_predictions)