In [None]:
import pandas as pd
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt

# Location of the CSV file to analyse
DATA_PATH = '/mnt/data/file-554GcGGZqFY7nhgpXEWuPj'
df = pd.read_csv(DATA_PATH)

# Show the first rows so the column layout can be checked by eye
print(df.head())

# The rest of the script relies on two columns being present:
# 'timestamp' and 'numeric_attribute'.
# Replace the raw 'timestamp' column with its parsed datetime form.
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

# Boundary between the two comparison windows.
# Adjust this date based on your data.
CUTOFF_DATE = '2023-01-01'

# Rows strictly before the cutoff form period 1; rows on or after it form
# period 2.
timestamps = df['timestamp']
period_1 = df.loc[timestamps < CUTOFF_DATE]
period_2 = df.loc[timestamps >= CUTOFF_DATE]

# Keep only the non-missing values of the attribute under test
attr_1 = period_1['numeric_attribute'].dropna()
attr_2 = period_2['numeric_attribute'].dropna()

# Perform the two-sample Kolmogorov-Smirnov test on the two periods and
# report whether their distributions differ at the chosen significance level.
ALPHA = 0.05  # p-values below this threshold are reported as drift

if len(attr_1) == 0 or len(attr_2) == 0:
    # The test is not meaningful when one side has no observations. Guard
    # explicitly: a NaN p-value would fail the `p_value < ALPHA` comparison
    # and be reported as "no drift" even though no test was actually run.
    ks_stat = p_value = float('nan')
    print(
        "Cannot run KS test: at least one period has no non-missing values "
        f"(period 1: {len(attr_1)}, period 2: {len(attr_2)}). "
        "Adjust the period boundaries."
    )
else:
    ks_stat, p_value = ks_2samp(attr_1, attr_2)

    # Output the result
    print(f"KS Statistic: {ks_stat}")
    print(f"P-value: {p_value}")

    if p_value < ALPHA:
        print("Significant data drift detected.")
    else:
        print("No significant data drift detected.")

# Optional: Plot both distributions as overlaid histograms.
# Both calls receive the same `range`, so matplotlib builds identical bin
# edges and the overlaid bars line up and can be compared directly. Without
# it, each call would bin over its own sample's min/max.
combined = pd.concat([attr_1, attr_2])
# With no data at all there is no min/max; fall back to matplotlib's default.
shared_range = (combined.min(), combined.max()) if len(combined) else None

plt.hist(attr_1, bins=30, range=shared_range, alpha=0.5, label='Period 1')
plt.hist(attr_2, bins=30, range=shared_range, alpha=0.5, label='Period 2')
plt.legend()
plt.title('Distribution Comparison')
plt.show()