In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from collections import Counter

# Build a synthetic, heavily imbalanced binary classification dataset.
# The class weights request 99% of the samples for class 0 and 1% for class 1.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.99, 0.01],
    n_features=10,
    n_informative=5,
    n_redundant=0,
    flip_y=0,  # no label noise
    random_state=42,  # fixed seed so the dataset is reproducible
)

# Summarize the dataset shape and the class distribution.
# `y` is a NumPy array, so counting it directly produces NumPy scalar keys,
# which NumPy >= 2 prints as `np.int64(0)`. `.tolist()` converts the labels
# to plain Python ints so the printed Counter reads the same on every
# NumPy version.
print(f"Original dataset shape: {X.shape}")
print(f"Class distribution: {Counter(y.tolist())}")
# Expected output: Class distribution: Counter({0: 990, 1: 10})

# Optional: wrap the feature matrix in a pandas DataFrame for easier viewing.
# One 'feature_<k>' column is created per column of X; the labels are then
# appended as the 'target' column.
df = pd.DataFrame(
    X,
    columns=[f'feature_{col}' for col in range(X.shape[1])],
)
df['target'] = y

# Preview the first five rows: a blank line, the heading, then the table.
print("\nFirst 5 rows of the generated DataFrame:", df.head(), sep="\n")

Original dataset shape: (1000, 10)
Class distribution: Counter({np.int64(0): 990, np.int64(1): 10})

First 5 rows of the generated DataFrame:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0  -0.439693  -1.908048   1.545238   1.691835   2.839496   1.316750   
1  -1.113708   1.786755  -0.633453   0.684842   1.018074   1.576137   
2   1.090499   1.345019   1.486396   0.774657   2.246350   0.868652   
3  -1.979928   0.246730  -1.101155  -0.616849   0.601870   0.514434   
4   1.670760  -0.054336  -0.737556  -1.213370   0.659128  -0.671084   

   feature_6  feature_7  feature_8  feature_9  target  
0  -2.791435   1.193769   0.024971  -1.600737       0  
1  -1.069809  -1.607914  -0.262907   1.196298       0  
2  -0.442904   0.361173   1.131458  -1.113746       0  
3  -1.387988  -1.187894   0.396968   1.556104       0  
4   0.100017   2.323275  -0.610868  -0.257409       1  
