In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp  # Kolmogorov–Smirnov test

# Data-drift check: split one numeric column into two time periods, compare
# them with descriptive statistics and a two-sample Kolmogorov–Smirnov test,
# then overlay their histograms.

# Load dataset
df = pd.read_csv('/mnt/data/file-QwcBCUNP3LXzYF4vj41uQq')

# Replace with actual column names
numeric_column = 'your_numeric_column'  # e.g., 'sales'
time_column = 'your_time_column'        # e.g., 'date'

# Boundary between the two periods (modify as needed). Kept in one place so
# the two filters below can never disagree about where the split is.
split_date = '2023-01-01'

# Fail early with an actionable message if the placeholders above were not
# replaced (or are misspelled) instead of a bare KeyError further down.
missing_columns = [c for c in (numeric_column, time_column) if c not in df.columns]
if missing_columns:
    raise KeyError(
        f"Column(s) {missing_columns} not found in dataset; "
        f"available columns: {list(df.columns)}"
    )

# Convert time column to datetime
df[time_column] = pd.to_datetime(df[time_column])

# Split into two time periods. Both comparisons are evaluated explicitly
# (rather than negating one mask) so rows with a missing timestamp, which
# compare False either way, stay excluded from both periods.
period1 = df.loc[df[time_column] < split_date, numeric_column].dropna()
period2 = df.loc[df[time_column] >= split_date, numeric_column].dropna()

# Descriptive statistics
print("Period 1 Stats:\n", period1.describe())
print("Period 2 Stats:\n", period2.describe())

# The KS test and the shared histogram range below both need data on each
# side of the split.
if period1.empty or period2.empty:
    raise ValueError(
        f"Cannot compare periods: period 1 has {len(period1)} and period 2 has "
        f"{len(period2)} non-null '{numeric_column}' values around split date "
        f"{split_date}"
    )

# Kolmogorov–Smirnov Test (tests if two samples come from the same distribution)
ks_stat, ks_p = ks_2samp(period1, period2)
# '.4g' keeps very small p-values visible (e.g. 1.2e-09) instead of rounding
# them to 0.0000.
print(f"KS Test Statistic: {ks_stat:.4f}, P-value: {ks_p:.4g}")

# Plot histograms on identical bin edges. Without a common range each call
# derives its own 20 bins from its own sample, so the overlaid bars would have
# different widths and positions and could not be compared.
value_range = (
    min(period1.min(), period2.min()),
    max(period1.max(), period2.max()),
)
plt.figure(figsize=(10, 5))
plt.hist(period1, bins=20, range=value_range, alpha=0.5, label='Period 1',
         color='blue', edgecolor='black')
plt.hist(period2, bins=20, range=value_range, alpha=0.5, label='Period 2',
         color='red', edgecolor='black')
plt.title(f'Data Drift Comparison for {numeric_column}')
plt.xlabel(numeric_column)
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()