In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker
fake = Faker()
np.random.seed(42)
Faker.seed(42)

# Parameters
num_customers = 150  # Will create about 25% cardholders in 1000 transactions
num_transactions = 1000

# Generate customer base (only cardholders)
customers = []
for _ in range(num_customers):
    gender = np.random.choice(['Женский', 'Мужской'], p=[0.6, 0.4])
    first_name = fake.first_name_female() if gender == 'Женский' else fake.first_name_male()
    last_name = fake.last_name()
    
    customers.append({
        'card_code': 2719000000000 + _,
        'full_name': f"{first_name} {last_name}",
        'phone': '0' + ''.join(np.random.choice(list('123456789'), size=8)),
        'issue_date': fake.date_between(start_date='-4y', end_date='-3m'),
        'date_of_birth': fake.date_of_birth(minimum_age=18, maximum_age=85),
        'gender': gender,
        'address': f"г. {fake.city()}, ул. {fake.street_name()}, д. {fake.building_number()}" if np.random.random() < 0.2 else None
    })

# Generate transactions
transactions = []
stores = ['Շենգավիթ', 'Քանաքեռ', 'Կենտրոն', 'Աջափնյակ', 'Ավան', 'Նոր Նորք', 'Էրեբունի']

for _ in range(num_transactions):
    # 25% chance of no card (matches your original data)
    if np.random.random() < 0.25:
        transactions.append({
            'date': fake.date_time_between(start_date='-2y', end_date='now').strftime('%d.%m.%Y %H:%M:%S'),
            'discount_card': None,
            'store': np.random.choice(stores),
            'product_name': None,
            'card_code': None,
            'customer_address': None,
            'customer_phone': None,
            'issue_date': None,
            'date_of_birth': None,
            'gender': None,
            'transaction_amount': round(np.random.gamma(shape=1.5, scale=800), 2)
        })
    else:
        customer = np.random.choice(customers)
        transactions.append({
            'date': fake.date_time_between(start_date='-2y', end_date='now').strftime('%d.%m.%Y %H:%M:%S'),
            'discount_card': f"{customer['card_code']} ({customer['full_name']})",
            'store': np.random.choice(stores),
            'product_name': None,  # As per your original structure
            'card_code': customer['card_code'],
            'customer_address': customer['address'] if np.random.random() < 0.1 else None,
            'customer_phone': customer['phone'],
            'issue_date': customer['issue_date'].strftime('%d.%m.%Y'),
            'date_of_birth': customer['date_of_birth'].strftime('%d.%m.%Y') if np.random.random() < 0.8 else None,
            'gender': customer['gender'],
            'transaction_amount': round(np.random.gamma(shape=2, scale=600), 2)
        })

# Create DataFrame
df = pd.DataFrame(transactions)

# Verify null percentages match original
print("Null percentages:")
print(df.isnull().mean())

# Save to CSV
df.to_csv('synthetic_retail_data.csv', index=False, encoding='utf-8-sig')

print("\nSample data:")
print(df.head())

Null percentages:
date                  0.000
discount_card         0.258
store                 0.000
product_name          1.000
card_code             0.258
customer_address      0.992
customer_phone        0.258
issue_date            0.258
date_of_birth         0.409
gender                0.258
transaction_amount    0.000
dtype: float64

Sample data:
                  date                   discount_card     store product_name  \
0  25.12.2023 13:08:23  2719000000078 (Carolyn Miller)   Քանաքեռ         None   
1  21.09.2024 16:22:27     2719000000098 (Tommy Evans)   Քանաքեռ         None   
2  10.07.2023 10:03:48                            None  Նոր Նորք         None   
3  14.02.2025 09:29:02      2719000000059 (Adam Jones)      Ավան         None   
4  23.08.2023 02:10:22   2719000000069 (Margaret Ross)  Շենգավիթ         None   

      card_code customer_address customer_phone  issue_date date_of_birth  \
0  2.719000e+12             None      052589233  28.06.2023    21.06.1984   
1  2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Load the synthetic data (or use your actual DataFrame)
df = pd.read_csv('synthetic_retail_data.csv', encoding='utf-8-sig')

# Convert dates with dayfirst=True for European format
date_cols = ['date', 'issue_date', 'date_of_birth']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], dayfirst=True, errors='coerce')

# Filter only records with discount cards (real customers)
customer_df = df[df['discount_card'].notna()].copy()

# Set analysis date (1 day after last transaction)
analysis_date = customer_df['date'].max() + pd.Timedelta(days=1)

# RFM Calculation
rfm = customer_df.groupby('card_code').agg({
    'date': lambda x: (analysis_date - x.max()).days,  # Recency
    'discount_card': 'count',                          # Frequency
    'transaction_amount': 'sum'                        # Monetary
}).reset_index()

rfm.columns = ['card_code', 'recency', 'frequency', 'monetary']

# Create RFM segments
quantiles = rfm.quantile(q=[0.2, 0.4, 0.6, 0.8])

# Scoring functions
def r_score(x):
    if x <= quantiles['recency'][0.2]: return 5
    elif x <= quantiles['recency'][0.4]: return 4
    elif x <= quantiles['recency'][0.6]: return 3
    elif x <= quantiles['recency'][0.8]: return 2
    else: return 1

def fm_score(x, col):
    if x <= quantiles[col][0.2]: return 1
    elif x <= quantiles[col][0.4]: return 2
    elif x <= quantiles[col][0.6]: return 3
    elif x <= quantiles[col][0.8]: return 4
    else: return 5

# Apply scores
rfm['r_score'] = rfm['recency'].apply(r_score)
rfm['f_score'] = rfm['frequency'].apply(lambda x: fm_score(x, 'frequency'))
rfm['m_score'] = rfm['monetary'].apply(lambda x: fm_score(x, 'monetary'))

# Combine scores
rfm['rfm_score'] = rfm['r_score'].astype(str) + rfm['f_score'].astype(str) + rfm['m_score'].astype(str)
rfm['rfm_sum'] = rfm[['r_score', 'f_score', 'm_score']].sum(axis=1)

# Segmentation
segment_map = {
    r'555|554|545|455': 'Champions',
    r'[4-5][4-5][3-5]': 'Loyal Customers',
    r'[3-4][3-4][3-4]': 'Potential Loyalists',
    r'[3-5][1-3][1-3]': 'Recent Customers',
    r'[2-3][2-3][2-3]': 'Needing Attention',
    r'[1-2][1-2][1-2]': 'At Risk',
    r'1[1-3][1-3]': 'Hibernating',
    r'[1-2][1-2][4-5]': 'Cant Lose Them',
    r'[1-2]5[1-5]': 'Lost'
}

rfm['segment'] = rfm['rfm_score'].replace(segment_map, regex=True)
rfm['segment'] = rfm['segment'].fillna('Others')

# Merge with customer data
customer_details = customer_df[['card_code', 'gender', 'date_of_birth']].drop_duplicates()
rfm = rfm.merge(customer_details, on='card_code', how='left')

# Calculate age
rfm['age'] = (analysis_date - rfm['date_of_birth']).dt.days // 365

# # Visualization
# plt.figure(figsize=(15, 10))

# # RFM Distribution
# plt.subplot(2, 2, 1)
# sns.histplot(rfm['recency'], bins=30, kde=True)
# plt.title('Recency Distribution')

# plt.subplot(2, 2, 2)
# sns.histplot(rfm['frequency'], bins=30, kde=True)
# plt.title('Frequency Distribution')

# plt.subplot(2, 2, 3)
# sns.histplot(rfm['monetary'], bins=30, kde=True)
# plt.title('Monetary Distribution')

# plt.subplot(2, 2, 4)
# rfm['segment'].value_counts().plot(kind='bar')
# plt.title('Customer Segments')
# plt.xticks(rotation=45)

# plt.tight_layout()
# plt.show()

# Segment Analysis
segment_analysis = rfm.groupby('segment').agg({
    'recency': 'mean',
    'frequency': 'mean',
    'monetary': 'mean',
    'card_code': 'count',
    'age': 'mean'
}).rename(columns={'card_code': 'count'})

# Add percentage
segment_analysis['percentage'] = (segment_analysis['count'] / segment_analysis['count'].sum()) * 100
segment_analysis = segment_analysis.sort_values('count', ascending=False)

print("RFM Segment Analysis:")
print(segment_analysis)

# Save results
rfm.to_csv('rfm_results.csv', index=False, encoding='utf-8-sig')

# Action Recommendations
recommendations = {
    'Champions': "Reward them. Offer loyalty programs and exclusive previews",
    'Loyal Customers': "Upsell higher value products. Ask for reviews",
    'Potential Loyalists': "Offer membership/subscription or give them early access to new products",
    'Recent Customers': "Provide onboarding support and special offers to encourage repeat purchases",
    'Needing Attention': "Re-engage with email campaigns and recommendations based on past purchases",
    'At Risk': "Send personalized emails to win them back, offer discounts",
    'Hibernating': "Win them back with reactivation campaigns or surveys",
    'Cant Lose Them': "Make limited time offers and get feedback",
    'Lost': "Revive interest with reach out campaigns or ignore if not profitable"
}

print("\nMarketing Recommendations:")
for segment, advice in recommendations.items():
    if segment in segment_analysis.index:
        print(f"{segment}: {advice}")

RFM Segment Analysis:
                        recency  frequency      monetary  count        age  \
segment                                                                      
Recent Customers      77.542373   3.677966   3888.342542     59  51.461538   
At Risk              299.771429   2.771429   2617.686857     35  55.625000   
Champions             31.466667   8.000000  10778.649333     30  52.866667   
Potential Loyalists  103.636364   5.227273   6642.187273     22  53.714286   
Loyal Customers       43.909091   6.181818   6971.762273     22  51.333333   
Lost                 200.769231   8.384615  10351.552308     13  60.428571   
Hibernating          339.153846   4.461538   5798.478462     13  59.888889   
144                  293.000000   6.000000   7324.250000      6  60.333333   
355                  136.333333   8.666667  10088.140000      6  36.000000   
244                  195.600000   6.000000   7430.776000      5  63.333333   
Needing Attention    179.500000   5.000000