In [5]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, precision_recall_curve, roc_curve
from synthetic_generator import SyntheticRareEventGenerator

# Set a consistent look for every figure drawn below.
sns.set(style="whitegrid")

# Experiment parameters, named once so that the generator calls and the
# exported file name are always built from the same values. Previously the
# file name repeated these numbers by hand and could silently go stale.
OUTPUT_DIR = "./synthetic_datasets"
SEED = 42
NOISE_LEVEL = 0.1
DRIFT_STRENGTH = 0.3

# Create export folder if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize generator.
# NOTE(review): the keyword names mirror sklearn's make_classification
# (weights=[0.95, 0.05] presumably means ~5% positives) -- confirm against
# SyntheticRareEventGenerator, which is defined outside this file.
gen = SyntheticRareEventGenerator(
    n_samples=5000,
    n_features=10,
    n_informative=3,
    n_redundant=2,
    class_sep=1.0,
    weights=[0.95, 0.05],
    flip_y=0.01,
    random_state=SEED,
)

# Generate dataset, then run it through the generator's noise step.
df = gen.generate()
df = gen.add_noise(df, noise_level=NOISE_LEVEL)

# Optional: inject drift for experiment
df_drifted = gen.inject_drift(df, drift_strength=DRIFT_STRENGTH)

# Save CSV. The name encodes the drift, noise and seed actually used above;
# with the current values it is "synthetic_drift0.3_noise0.1_seed42.csv".
file_name = f"synthetic_drift{DRIFT_STRENGTH}_noise{NOISE_LEVEL}_seed{SEED}.csv"
output_path = f"{OUTPUT_DIR}/{file_name}"
df_drifted.to_csv(output_path, index=False)
print(f"Saved dataset to {output_path}")

# Bar chart counting the rows for each value of the 'target' column.
# countplot returns the Axes it drew on, so the title is set on it directly.
sns.countplot(x='target', data=df_drifted).set_title("Class Distribution")
plt.show()

# Optional PCA visualization: project every non-'target' column onto two
# principal components and scatter them, coloured by 'target'.
# This step is best-effort: any failure is reported and the script carries on.
try:
    # Imported here so that a missing/broken PCA import is also treated as
    # "skip the optional plot" rather than aborting the whole run.
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    components = pca.fit_transform(df_drifted.drop('target', axis=1))
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=components[:, 0], y=components[:, 1], hue=df_drifted['target'], alpha=0.6)
    plt.title("PCA Visualization of Feature Space")
    plt.show()
except Exception as exc:
    # Catch `Exception`, not a bare `except:`, so KeyboardInterrupt and
    # SystemExit still propagate. Report the real cause instead of guessing.
    print(f"PCA failed ({type(exc).__name__}: {exc})")

# Train/Test Split: every column except 'target' is a feature; 'target' is
# the label.
X = df_drifted.drop(columns='target')
y = df_drifted['target']
# Hold out 30% of the rows, stratified on y, with a fixed seed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Train baseline model: logistic regression with balanced class weights.
# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(class_weight="balanced", max_iter=1000).fit(X_train, y_train)

# Predict & evaluate: hard labels for the report and confusion matrix, and
# the second column of predict_proba as the score for the ranking metrics.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Header and body are printed as two newline-separated items, which emits
# the same text as two consecutive print() calls.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Threshold-free summaries computed from the predicted scores.
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.3f}")
print(f"PR AUC: {average_precision_score(y_test, y_prob):.3f}")

# Plot PR Curve
precision, recall, _ = precision_recall_curve(y_test, y_prob)
# Draw on a dedicated figure instead of pyplot's implicit "current" axes.
# Where plt.show() does not clear the previous figure (e.g. a headless run
# of this script), the implicit form would paint this curve over the
# preceding plot.
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(recall, precision, label="PR Curve")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve")
pr_ax.legend()
plt.show()

# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
# Use a dedicated figure so this curve can never be overlaid on whatever
# axes were current before (e.g. the previous plot when plt.show() does not
# clear figures, as in a headless script run).
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label="ROC Curve")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], '--', color='gray')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.show()


ImportError: cannot import name 'SyntheticRareEventGenerator' from partially initialized module 'SyntheticDataGenerator' (most likely due to a circular import) (/Users/sophiaboettcher/Param_IndianMutualFunds/A3_FeatureEngineering/training_data/SyntheticDataGenerator.py)