# Data Loading, Preprocessing and EDA
---
---

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook. Executor and driver
# memory are set to 2g each, and shuffles use 4 partitions.
spark = (
    SparkSession.builder
    .appName("Logistiq Prediction")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)


ModuleNotFoundError: No module named 'pandas'

## Load data with Spark

In [None]:
# Read the raw CSV: the first line is the header, and column types are
# inferred by Spark rather than declared.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/raw/data.csv")
)

## Analyse Exploratoire des Données (EDA)

In [None]:
# Column names and the types Spark inferred for them.
df.printSchema()

# Preview of the first five rows.
df.show(n=5)

# Total row count (triggers a full pass over the data).
row_count = df.count()
print(f"Number of rows: {row_count}")

# Summary statistics for the whole frame.
df.describe().show()

* Feature selection

In [None]:
# Keep only the columns used in the rest of the analysis.
# Every column name must appear exactly once: selecting the same name twice
# produces a DataFrame with duplicate column names, and any later lookup by
# name (e.g. col("Product Card Id")) fails with an ambiguous-reference error.
df_selected = df.select(
    # Target
    "Late_delivery_risk",

    # Order and shipment details
    "Order Id",
    "order date (DateOrders)",
    "shipping date (DateOrders)",
    "Order Status",
    "Delivery Status",
    "Days for shipping (real)",
    "Days for shipment (scheduled)",
    "Shipping Mode",

    # Sales & financials
    "Sales",
    "Sales per customer",
    "Benefit per order",
    "Order Profit Per Order",
    "Order Item Quantity",
    "Order Item Discount",
    "Order Item Discount Rate",
    "Order Item Product Price",
    "Order Item Profit Ratio",
    "Order Item Total",

    # Product details ("Product Card Id" is listed once only)
    "Product Card Id",
    "Product Name",
    "Product Price",
    "Product Category Id",
    "Product Status",
    "Category Name",
    "Department Name",

    # Customer info (excluding private fields)
    # NOTE(review): first/last name are still selected here; confirm they are
    # acceptable to keep given the "excluding private fields" intent.
    "Customer Id",
    "Customer Fname",
    "Customer Lname",
    "Customer Segment",
    "Customer City",
    "Customer State",
    "Customer Country",
    "Customer Zipcode",

    # Order region info
    "Order City",
    "Order State",
    "Order Country",
    "Order Region",
    "Order Zipcode",

    # Geolocation
    "Latitude",
    "Longitude",

    # Market context
    "Market"
)

* Check for missing values

In [None]:
from pyspark.sql.functions import col, isnan, when, count

# One aggregate per column: count the rows where that column is NULL.
# `when` yields a non-null literal only for NULL rows, and `count` ignores
# everything else, so each output cell is the number of missing values.
null_count_exprs = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df_selected.columns
]
df_selected.select(null_count_exprs).show()

* Value counts for categorical features

In [None]:
# Categorical columns we would like to inspect. The list may name columns
# that were not kept in df_selected; those are reported and skipped below.
categorical_cols = [
    "Type",
    "Delivery Status",
    "Order Status",
    "Shipping Mode",
    "Customer Segment",
    "Market",
    "Department Name",
    "Category Name",
    "Order Region",
    "Order Country",
    "Order City",
]

# Split the wish-list into columns that exist in df_selected and columns that
# do not, so grouping never references an unknown column.
available_columns = set(df_selected.columns)
present_categorical_cols = []
missing = []
for candidate in categorical_cols:
    if candidate in available_columns:
        present_categorical_cols.append(candidate)
    else:
        missing.append(candidate)

if missing:
    missing_names = ', '.join(missing)
    print(f"Warning: these categorical columns are not present in df_selected: {missing_names}")

# Show the five most frequent values of each available categorical column.
for col_name in present_categorical_cols:
    value_counts = df_selected.groupBy(col_name).count()
    value_counts.sort("count", ascending=False).show(5)

* Stats for numeric columns

In [None]:
# Numeric columns, grouped by theme. The groups are concatenated in a fixed
# order because later cells use the position of each name in numeric_cols.
_shipping_day_cols = [
    "Days for shipping (real)",
    "Days for shipment (scheduled)",
]
_order_item_cols = [
    "Sales per customer",
    "Order Item Quantity",
    "Order Item Discount",
    "Order Item Discount Rate",
    "Order Item Product Price",
    "Order Item Total",
    "Order Item Profit Ratio",
]
_order_value_cols = [
    "Sales",
    "Benefit per order",
    "Order Profit Per Order",
    "Product Price",
]
# NOTE(review): zip codes are treated as numeric here; confirm that is intended.
_location_cols = [
    "Latitude",
    "Longitude",
    "Customer Zipcode",
    "Order Zipcode",
]

numeric_cols = [
    *_shipping_day_cols,
    *_order_item_cols,
    *_order_value_cols,
    *_location_cols,
]

# Summary statistics restricted to the numeric columns.
numeric_summary = df_selected.describe(numeric_cols)
numeric_summary.show()

* Histogram for numeric columns

In [None]:
import matplotlib.pyplot as plt

# Collect the numeric columns to the driver as a pandas DataFrame, then draw
# one 20-bin histogram per column on a shared 15x10 figure.
numeric_pdf = df_selected.select(numeric_cols).toPandas()
numeric_pdf.hist(bins=20, figsize=(15, 10))
plt.tight_layout()
plt.show()

* Explore correlations

In [None]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import DoubleType
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Cast every numeric column to double so the assembler receives one uniform
# input type regardless of what schema inference produced.
numeric_double_df = df_selected.select([col(c).cast(DoubleType()) for c in numeric_cols])

# Pack the numeric columns into a single vector column for Correlation.corr.
# handleInvalid="skip" drops rows that contain a null/NaN in any input column;
# the default ("error") aborts the whole job on the first null it meets.
# Rows without missing values are assembled exactly as before.
assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="features",
    handleInvalid="skip",
)
df_vector = assembler.transform(numeric_double_df)

# Correlation.corr returns a one-row DataFrame holding the correlation matrix
# (Pearson by default); pull it out as a dense NumPy array.
corr_matrix = Correlation.corr(df_vector, "features").head()[0].toArray()

# Label rows and columns with the feature names for plotting.
corr_df = pd.DataFrame(corr_matrix, index=numeric_cols, columns=numeric_cols)

plt.figure(figsize=(8, 6))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix Heatmap')
plt.show()