In [5]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [6]:
# Seed NumPy's global RNG so every random draw below is repeatable.
np.random.seed(42)

# Number of synthetic customers (rows) to generate.
n_samples = 1000

# Fixed anchor for the registration dates. Anchoring on datetime.now() would
# make that column differ on every run even though the day offsets are seeded.
reference_date = datetime(2024, 1, 1)

# Generate synthetic data: one column per kind of variable.
# NOTE: every entry consumes draws from the global RNG in the order written,
# so reordering entries changes all values generated after the moved entry.
data = {
    # Numerical - Integer
    'customer_id': np.arange(1, n_samples + 1),
    # randint's upper bound is exclusive, so ages fall in 18..89.
    'age': np.random.randint(18, 90, n_samples),

    # Numerical - Float
    'average_purchase': np.random.normal(100, 25, n_samples).round(2),
    'loyalty_score': np.random.uniform(0, 10, n_samples).round(1),

    # Categorical - Nominal
    'preferred_color': np.random.choice(['Red', 'Blue', 'Green', 'Yellow', 'Black'], n_samples),

    # Categorical - Ordinal (Bronze < Silver < Gold < Platinum)
    'membership_level': pd.Categorical(
        np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], n_samples),
        categories=['Bronze', 'Silver', 'Gold', 'Platinum'],
        ordered=True
    ),

    # Binary - Logical
    'is_active': np.random.choice([True, False], n_samples),

    # Binary - Numeric
    'has_subscription': np.random.choice([0, 1], n_samples),

    # Binary - String
    'marketing_consent': np.random.choice(['Yes', 'No'], n_samples),

    # Text - Raw Text
    # str() turns the NumPy string scalar returned by choice() into a plain str.
    'last_feedback': [
        f"Customer feedback {i}: " + str(np.random.choice([
            "Great service!",
            "Could be better",
            "Excellent products",
            "Need improvement"
        ])) for i in range(n_samples)
    ],

    # Time and Date - Timestamp (0..999 whole days before reference_date)
    'registration_date': [
        reference_date - timedelta(days=np.random.randint(0, 1000))
        for _ in range(n_samples)
    ],

    # Time and Date - Duration
    # Drawn independently of registration_date; the two columns are not linked.
    'membership_duration_days': np.random.uniform(0, 1000, n_samples).round(1),

    # Annotation - Metadata
    # Scalars are converted to plain str so the dicts hold only built-in types;
    # otherwise NumPy 2 renders them as np.str_('...') when the dict is printed
    # or written to CSV.
    # NOTE(review): 'interests' is sampled with replacement, so both entries
    # can be the same value -- confirm whether that is intended.
    'tags': [
        {
            'interests': np.random.choice(['Sports', 'Technology', 'Fashion', 'Food'], 2).tolist(),
            'source': str(np.random.choice(['Web', 'Mobile', 'Store'])),
            'priority': str(np.random.choice(['High', 'Medium', 'Low']))
        }
        for _ in range(n_samples)
    ]
}

# Create DataFrame
df = pd.DataFrame(data)

# Set appropriate data types in a single pass.
# membership_level is left alone: it was built as an ordered pd.Categorical
# above, so it is already a category column.
df = df.astype({
    'preferred_color': 'category',
    'has_subscription': 'int8',
    'marketing_consent': 'string',
    'last_feedback': 'string',
})
df['registration_date'] = pd.to_datetime(df['registration_date'])

# Display the data info and the first few rows.
print("\nDataFrame Info:")
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
df.info()
print("\nFirst few rows:")
print(df.head())


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   customer_id               1000 non-null   int64         
 1   age                       1000 non-null   int64         
 2   average_purchase          1000 non-null   float64       
 3   loyalty_score             1000 non-null   float64       
 4   preferred_color           1000 non-null   category      
 5   membership_level          1000 non-null   category      
 6   is_active                 1000 non-null   bool          
 7   has_subscription          1000 non-null   int8          
 8   marketing_consent         1000 non-null   string        
 9   last_feedback             1000 non-null   string        
 10  registration_date         1000 non-null   datetime64[ns]
 11  membership_duration_days  1000 non-null   float64       
 12  tags

In [7]:
# Export the synthetic dataset to data.csv in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
# CSV is plain text, so the dtypes set on df are not stored in the file.
df.to_csv(path_or_buf="data.csv", index=False)