In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
import os
import json

In [None]:
datafile_path = "data.csv"
df = pd.read_csv(datafile_path)

In [20]:
# 1. Basic previews
head = df.head()
print(head)

       Order_ID Customer_ID        Date  Age Gender      City  \
0  ORD_000001-1  CUST_00001  2023-05-29   40   Male    Ankara   
1  ORD_000001-2  CUST_00001  2023-10-12   40   Male    Ankara   
2  ORD_000001-3  CUST_00001  2023-12-05   40   Male    Ankara   
3  ORD_000002-1  CUST_00002  2023-05-11   33   Male  Istanbul   
4  ORD_000002-2  CUST_00002  2023-06-16   33   Male  Istanbul   

  Product_Category  Unit_Price  Quantity  Discount_Amount  Total_Amount  \
0            Books       29.18         1             0.00         29.18   
1    Home & Garden      644.40         1           138.05        506.35   
2           Sports      332.82         5             0.00       1664.10   
3             Food       69.30         5            71.05        275.45   
4           Beauty      178.15         3             0.00        534.45   

   Payment_Method Device_Type  Session_Duration_Minutes  Pages_Viewed  \
0  Digital Wallet      Mobile                        14             9   
1     Credit

In [None]:
# 2. Schema & datatypes

print("SHAPE\n", f"rows: {df.shape[0]}, columns: {df.shape[1]}")
print("DTYPES\n", df.dtypes)
# print("COLUMN LIST\n", list(df.columns))

SHAPE
 rows: 17049, columns: 18
DTYPES
 Order_ID                     object
Customer_ID                  object
Date                         object
Age                           int64
Gender                       object
City                         object
Product_Category             object
Unit_Price                  float64
Quantity                      int64
Discount_Amount             float64
Total_Amount                float64
Payment_Method               object
Device_Type                  object
Session_Duration_Minutes      int64
Pages_Viewed                  int64
Is_Returning_Customer          bool
Delivery_Time_Days            int64
Customer_Rating               int64
dtype: object
COLUMN LIST
 ['Order_ID', 'Customer_ID', 'Date', 'Age', 'Gender', 'City', 'Product_Category', 'Unit_Price', 'Quantity', 'Discount_Amount', 'Total_Amount', 'Payment_Method', 'Device_Type', 'Session_Duration_Minutes', 'Pages_Viewed', 'Is_Returning_Customer', 'Delivery_Time_Days', 'Customer_Rating']


In [22]:
# 3) MISSING DATA (counts + percent) and "hidden" missing (empty strings)
missing_count = df.isna().sum()
missing_pct = (df.isna().mean() * 100).round(2)
missing_df = pd.concat([missing_count.rename("missing_count"), missing_pct.rename("missing_pct")], axis=1).sort_values("missing_pct", ascending=False)
empty_string_counts = (df == "").sum()
missing_df["empty_string_count"] = empty_string_counts
print("MISSING DATA SUMMARY\n", missing_df)


MISSING DATA SUMMARY
                           missing_count  missing_pct  empty_string_count
Order_ID                              0          0.0                   0
Customer_ID                           0          0.0                   0
Delivery_Time_Days                    0          0.0                   0
Is_Returning_Customer                 0          0.0                   0
Pages_Viewed                          0          0.0                   0
Session_Duration_Minutes              0          0.0                   0
Device_Type                           0          0.0                   0
Payment_Method                        0          0.0                   0
Total_Amount                          0          0.0                   0
Discount_Amount                       0          0.0                   0
Quantity                              0          0.0                   0
Unit_Price                            0          0.0                   0
Product_Category             

In [23]:
# 7) NUMERICAL COLUMNS: summary statistics and anomaly checks
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
print("NUMERIC COLUMNS\n", num_cols or "<none>")
if num_cols:
    print("\nNUMERIC SUMMARY (describe)\n", df[num_cols].describe().T)
    anomalies = []
    for c in num_cols:
        nneg = int((df[c] < 0).sum())
        nzero = int((df[c] == 0).sum())
        anomalies.append((c, nneg, nzero))
    print("\nNEGATIVE / ZERO COUNTS\n", pd.DataFrame(anomalies, columns=["column","n_negatives","n_zeros"]).set_index("column"))


NUMERIC COLUMNS
 ['Age', 'Unit_Price', 'Quantity', 'Discount_Amount', 'Total_Amount', 'Session_Duration_Minutes', 'Pages_Viewed', 'Delivery_Time_Days', 'Customer_Rating']

NUMERIC SUMMARY (describe)
                             count         mean          std    min     25%  \
Age                       17049.0    34.945745    11.046855  18.00   26.00   
Unit_Price                17049.0   447.901689   722.319705   5.05   73.26   
Quantity                  17049.0     3.011379     1.417027   1.00    2.00   
Discount_Amount           17049.0    69.788135   240.704662   0.00    0.00   
Total_Amount              17049.0  1277.438711  2358.436375   6.21  172.97   
Session_Duration_Minutes  17049.0    14.535633     2.925524   4.00   13.00   
Pages_Viewed              17049.0     9.003109     2.259954   1.00    7.00   
Delivery_Time_Days        17049.0     6.503607     3.488787   1.00    4.00   
Customer_Rating           17049.0     3.899408     1.128803   1.00    3.00   

                   

In [24]:
# 8) CATEGORICAL COLUMNS: cardinality & top values
cat_cols = df.select_dtypes(include=["object","category"]).columns.tolist()
print("CATEGORICAL COLUMNS\n", cat_cols or "<none>")
cat_overview = {}
for c in cat_cols:
    vc = df[c].fillna("<<NA>>").value_counts(dropna=False)
    cat_overview[c] = {"n_unique": int(df[c].nunique(dropna=True)), "top_5": vc.head(5).to_dict()}
print("\nCATEGORICAL OVERVIEW (cardinality + top_5)\n", pd.DataFrame.from_dict(cat_overview, orient="index"))


CATEGORICAL COLUMNS
 ['Order_ID', 'Customer_ID', 'Date', 'Gender', 'City', 'Product_Category', 'Payment_Method', 'Device_Type']

CATEGORICAL OVERVIEW (cardinality + top_5)
                   n_unique                                              top_5
Order_ID             17049  {'ORD_000001-1': 1, 'ORD_003337-3': 1, 'ORD_00...
Customer_ID           5000  {'CUST_04224': 10, 'CUST_03833': 10, 'CUST_043...
Date                   450  {'2023-07-29': 59, '2023-02-14': 56, '2024-01-...
Gender                   3       {'Female': 8613, 'Male': 8176, 'Other': 260}
City                    10  {'Istanbul': 4402, 'Ankara': 2422, 'Izmir': 20...
Product_Category         8  {'Sports': 2248, 'Beauty': 2212, 'Books': 2206...
Payment_Method           5  {'Credit Card': 6801, 'Debit Card': 4321, 'Dig...
Device_Type              3  {'Mobile': 9543, 'Desktop': 5845, 'Tablet': 1661}


In [26]:
# 10) QUICK SUMMARY PRINT (human-friendly)
summary = {
    "rows": df.shape[0],
    "columns": df.shape[1],
    "duplicated_rows": int(df.duplicated().sum()),
    "numeric_columns": num_cols,
    "categorical_columns": cat_cols
}
print("QUICK SUMMARY\n", summary)


QUICK SUMMARY
 {'rows': 17049, 'columns': 18, 'duplicated_rows': 0, 'numeric_columns': ['Age', 'Unit_Price', 'Quantity', 'Discount_Amount', 'Total_Amount', 'Session_Duration_Minutes', 'Pages_Viewed', 'Delivery_Time_Days', 'Customer_Rating'], 'categorical_columns': ['Order_ID', 'Customer_ID', 'Date', 'Gender', 'City', 'Product_Category', 'Payment_Method', 'Device_Type']}
