# Synthetic Data Generation (use historical data if available)
Generate mock data for customers, products, transactions, and competitor prices.

In [3]:
!pip install pandas numpy faker

Collecting faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.0.0


In [6]:
import pandas as pd
import numpy as np
from faker import Faker

# Seed both random sources so a fresh "Restart & Run All" reproduces the same
# synthetic data: NumPy's global RNG drives every np.random.* call in the
# notebook, and Faker keeps its own shared RNG for uuid4().
SEED = 42
np.random.seed(SEED)
Faker.seed(SEED)

# Generate customer data
N_CUSTOMERS = 1000  # number of synthetic customers; change here to resize the table
fake = Faker()
customers = pd.DataFrame({
    'customer_id': [fake.uuid4() for _ in range(N_CUSTOMERS)],
    'age': np.random.randint(18, 70, N_CUSTOMERS),  # upper bound is exclusive: ages 18-69
    'total_purchases': np.random.poisson(5, N_CUSTOMERS),  # Purchases in last year
    'avg_order_value': np.random.normal(100, 20, N_CUSTOMERS).clip(50, 200),  # clipped to [50, 200]
    'last_purchase_days_ago': np.random.exponential(30, N_CUSTOMERS).astype(int)  # truncated to whole days
})

# Build the product catalogue: 20 products labelled P000-P019, each with a
# uniformly drawn cost price and an independently drawn competitor price.
products = pd.DataFrame(
    {'product_id': [f'P{idx:03d}' for idx in range(20)]}
).assign(
    # Keyword order matters: cost_price is drawn before competitor_price.
    cost_price=lambda catalogue: np.random.uniform(low=20, high=150, size=len(catalogue)),
    competitor_price=lambda catalogue: np.random.uniform(low=30, high=200, size=len(catalogue)),
)

# Generate transaction history (6 months)
# Every transaction gets a date sampled (with replacement) from a fixed
# six-month window. Building the column with `date_range(periods=5000)` would
# instead place exactly one sale on each of 5000 consecutive days (13+ years).
N_TRANSACTIONS = 5000
history_days = pd.date_range('2023-01-01', '2023-06-30', freq='D')
transactions = pd.DataFrame({
    'transaction_id': [fake.uuid4() for _ in range(N_TRANSACTIONS)],
    'customer_id': np.random.choice(customers['customer_id'], N_TRANSACTIONS),
    'product_id': np.random.choice(products['product_id'], N_TRANSACTIONS),
    'price': np.random.normal(100, 30, N_TRANSACTIONS).clip(50, 200),  # clipped to [50, 200]
    'quantity': np.random.randint(1, 5, N_TRANSACTIONS),  # upper bound is exclusive: 1-4 units
    'date': np.random.choice(history_days, N_TRANSACTIONS)
})

# Customer Segmentation (RFM Analysis)
Segment customers into Price-Sensitive, Loyal, Premium, and Deal-Seekers using RFM (Recency, Frequency, Monetary).

In [7]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Calculate RFM scores
# The customer table already holds one column per RFM dimension, so the scores
# are those columns copied under their conventional names.
rfm = customers.assign(
    recency=customers['last_purchase_days_ago'],
    frequency=customers['total_purchases'],
    monetary=customers['avg_order_value']
)

# Standardize and cluster
rfm_features = ['recency', 'frequency', 'monetary']
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[rfm_features])
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_ids = kmeans.fit_predict(rfm_scaled)

# Map clusters to segments
# K-means cluster numbers carry no meaning, so a fixed {0: ..., 1: ...} mapping
# would attach names unrelated to what each cluster looks like. Instead, name
# each cluster from its centroid (in standardized units) using a simple
# heuristic, applied in order to the clusters that are still unnamed:
#   Premium         -> highest monetary value
#   Loyal           -> highest purchase frequency
#   Price-Sensitive -> lowest monetary value
#   Deal-Seekers    -> the remaining cluster
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=rfm_features)
naming_rules = [
    ('Premium', 'monetary', True),
    ('Loyal', 'frequency', True),
    ('Price-Sensitive', 'monetary', False),
]
segment_map = {}
unnamed_centroids = centroids.copy()
for segment_name, feature, take_highest in naming_rules:
    feature_values = unnamed_centroids[feature]
    cluster_id = int(feature_values.idxmax() if take_highest else feature_values.idxmin())
    segment_map[cluster_id] = segment_name
    unnamed_centroids = unnamed_centroids.drop(index=cluster_id)
segment_map[int(unnamed_centroids.index[0])] = 'Deal-Seekers'

rfm['segment'] = pd.Series(cluster_ids, index=rfm.index).map(segment_map)

# Demand Forecasting & Price Elasticity Modeling
Build a model to predict demand based on price and customer segment.

In [8]:
# Feature Engineering

# Enrich every transaction with its customer's segment and its product's
# attributes, then derive the model features in a single chained expression:
#   price_ratio - transaction price relative to the competitor's price
#   day_of_week - weekday number of the transaction date
transactions_enriched = (
    transactions
    .merge(rfm[['customer_id', 'segment']], on='customer_id')
    .merge(products, on='product_id')
    .assign(
        price_ratio=lambda txn: txn['price'] / txn['competitor_price'],
        day_of_week=lambda txn: txn['date'].dt.dayofweek,
    )
)

In [9]:
# Train Elasticity Model
#Use XGBoost Regressor to predict quantity sold based on price and segment.

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Prepare data
# One-hot encode the two categorical features; price and price_ratio stay numeric.
X = transactions_enriched[['price', 'price_ratio', 'segment', 'day_of_week']]
X = pd.get_dummies(X, columns=['segment', 'day_of_week'])
y = transactions_enriched['quantity']

# Split and train
# Fixed random_state (same value as the KMeans step) so the split and the fitted
# model are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBRegressor(random_state=42)
model.fit(X_train, y_train)

# Price Optimization
Maximize revenue (price * predicted_quantity) using the trained model.

In [10]:
# Define Optimization Function

def optimize_price(product_id, segment, current_price, competitor_price):
    """Return the candidate price with the highest predicted revenue.

    Fifty evenly spaced prices between 80% and 120% of ``current_price`` are
    scored with the trained ``model``; revenue is price * predicted quantity.

    NOTE: a later cell defines ``optimize_price`` again. When the cells run in
    order, that later definition replaces this one.

    Args:
        product_id: Not used by the calculation; kept for the call signature.
        segment: Customer segment name, one-hot encoded as ``segment_<name>``.
        current_price: Price around which candidates are generated.
        competitor_price: Divisor for the ``price_ratio`` feature.

    Returns:
        The element of the candidate price grid that maximizes predicted revenue.

    Raises:
        ValueError: If ``segment`` has no matching column in the training features.
    """
    # Predict quantity for different price points
    price_range = np.linspace(current_price * 0.8, current_price * 1.2, 50)
    data = pd.DataFrame({
        'price': price_range,
        'price_ratio': price_range / competitor_price,
        'segment': segment
    })
    data = pd.get_dummies(data, columns=['segment'])
    if not set(data.columns) <= set(X_train.columns) | {'day_of_week_1'}:
        raise ValueError(f"Unknown segment {segment!r}; the model was not trained on it")
    data['day_of_week_1'] = 1  # Assume weekday=1 for simplicity

    # The model expects exactly the training columns, in the training order and
    # with the training dtypes; dummy columns not set above are filled with 0.
    data = data.reindex(columns=X_train.columns, fill_value=0)
    data = data.astype(X_train.dtypes.to_dict())

    predicted_quantity = model.predict(data)
    revenue = price_range * predicted_quantity
    optimal_price = price_range[np.argmax(revenue)]
    return optimal_price

In [15]:
#  Apply Dynamic Pricing by Segment

def optimize_price(product_id, segment, current_price, competitor_price):
    """Return the candidate price with the highest predicted revenue for a segment.

    Fifty evenly spaced prices between 80% and 120% of ``current_price`` are
    scored with the trained ``model``. Each candidate is one feature row whose
    ``price_ratio`` is that same price divided by ``competitor_price`` and
    whose ``segment_<name>`` dummy is set for the requested segment. Predicted
    demand is averaged over every ``day_of_week_*`` column seen in training, so
    the result does not depend on an arbitrary weekday.

    Args:
        product_id: Not used by the calculation; kept for the call signature.
        segment: Customer segment name; must match a ``segment_<name>`` column
            of the training features.
        current_price: Price around which candidates are generated.
        competitor_price: Divisor for the ``price_ratio`` feature.

    Returns:
        The element of the candidate price grid that maximizes
        price * predicted quantity.

    Raises:
        ValueError: If ``segment`` has no matching column in the training features.
    """
    # Predict quantity for different price points
    price_range = np.linspace(current_price * 0.8, current_price * 1.2, 50)

    feature_columns = X_train.columns
    segment_column = f'segment_{segment}'
    if segment_column not in feature_columns:
        raise ValueError(f"Unknown segment {segment!r}; the model was not trained on it")
    day_columns = [col for col in feature_columns if col.startswith('day_of_week_')]

    # One row per candidate price, already in the training column order.
    # All dummy columns start at 0; only the requested segment is switched on.
    base = pd.DataFrame(0.0, index=range(len(price_range)), columns=feature_columns)
    base['price'] = price_range
    base['price_ratio'] = price_range / competitor_price
    base[segment_column] = 1.0

    training_dtypes = X_train.dtypes.to_dict()

    def predict_demand(features):
        # Cast to the dtypes used during fitting before handing rows to the model.
        return model.predict(features.astype(training_dtypes))

    if day_columns:
        # Score the same price grid once per weekday and average the demand.
        demand_per_day = [
            predict_demand(base.assign(**{day_column: 1.0}))
            for day_column in day_columns
        ]
        predicted_quantity = np.mean(demand_per_day, axis=0)
    else:
        predicted_quantity = predict_demand(base)

    revenue = price_range * predicted_quantity
    optimal_price = price_range[np.argmax(revenue)]
    return optimal_price

In [None]:
# A/B Testing Simulation
# Simulate an A/B test comparing dynamic vs. static pricing.

In [16]:
# Assign Test/Control Groups

# Randomly label each customer 'test' (dynamic pricing) or 'control' (static
# pricing) with equal probability.
customers['group'] = np.random.choice(
    ['test', 'control'],
    size=customers.shape[0],
    p=[0.5, 0.5],
)

In [17]:
# Simulate Outcomes

# Simulate revenue for each group
def simulate_revenue(group, test_mean=120, control_mean=100, std_dev=20):
    """Draw one simulated revenue figure for a customer in the given group.

    The uplift for the test group is an assumption of the simulation: revenue
    is sampled from a normal distribution whose mean depends only on the group
    label. No pricing model or customer segment is consulted here.

    Args:
        group: ``'test'`` selects ``test_mean``; any other value selects
            ``control_mean``.
        test_mean: Mean revenue assumed under dynamic pricing.
        control_mean: Mean revenue assumed under static pricing.
        std_dev: Standard deviation shared by both groups.

    Returns:
        A single draw from ``np.random.normal`` (uses NumPy's global RNG).
    """
    mean_revenue = test_mean if group == 'test' else control_mean
    return np.random.normal(mean_revenue, std_dev)

# One independent revenue draw per customer, keyed on the customer's group label.
customers['revenue'] = customers['group'].map(simulate_revenue)

In [18]:
# Analyze Results

from scipy import stats

# Pull each arm's revenue with a boolean mask, then compare the two samples
# with an independent two-sample t-test.
test_revenue = customers.loc[customers['group'] == 'test', 'revenue']
control_revenue = customers.loc[customers['group'] == 'control', 'revenue']
t_stat, p_value = stats.ttest_ind(test_revenue, control_revenue)

# Report the group means and the p-value.
print(f"Test Revenue Mean: {test_revenue.mean():.2f}")
print(f"Control Revenue Mean: {control_revenue.mean():.2f}")
print(f"P-value: {p_value:.4f}")  # Significant if p < 0.05

Test Revenue Mean: 120.80
Control Revenue Mean: 100.22
P-value: 0.0000


# Key Findings:

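✔ In the simulated A/B test, the dynamic-pricing group averaged higher revenue than the control group (120.80 vs. 100.22, p < 0.05). Note that this uplift is an assumption built into `simulate_revenue`, not a result produced by the pricing model, and customer satisfaction was not measured.

✔ Hypothesis (not measured in this notebook): price-sensitive segments see the highest conversion gains.

✔ Hypothesis (not measured in this notebook): premium shoppers tolerate price increases when competitor prices rise.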