In [None]:
# Synthetic Data Generation for Hotel Bookings and FinTech Transactions

import pandas as pd
import numpy as np
from faker import Faker
import random

# Shared Faker generator (created with the 'en_US' locale); the sections
# below use it for customer names, booking dates, unique ID numbers and
# IPv4 addresses.
fake = Faker('en_US')

# 1. Hotel Master Data (Dimension Table)
# One record per property; every record carries the same four fields, so the
# field names are listed once and paired with each row of values.
_HOTEL_FIELDS = ("Hotel_ID", "Municipality", "Type", "Price_Range")
hotels = [
    dict(zip(_HOTEL_FIELDS, values))
    for values in (
        ("SAN", "San Candido", "Luxury Hotel", "High"),
        ("GAL", "Gallipoli", "Beach Resort", "Medium"),
        ("LAZ", "Lazise", "Boutique Hotel", "Medium-High"),
        ("ORO", "Orosei", "Family Club", "Medium"),
        ("CAP", "Capoliveri", "Residence", "Medium-High"),
    )
]
# Tabular view of the same records (one row per hotel, one column per field).
df_master = pd.DataFrame(hotels)

# 2. Raw Bookings Data (1500 rows)
# Each generated row is
#   [Booking_ID, Hotel_ID, Customer_Name, Booking_Date, Nights, Total_Amount].
# A small share of rows is deliberately corrupted ("dirty" data): negative
# stay durations, missing amounts and an unknown hotel ID.
N_BOOKINGS = 1500

# Base price per night for each price range. Built once here instead of on
# every loop iteration.
BASE_PRICE_BY_RANGE = {"High": 450, "Medium-High": 280, "Medium": 160}

# Customer names are written with inconsistent casing on purpose: upper case,
# lower case, or unchanged (str() returns the name as-is).
NAME_CASINGS = (str.upper, str.lower, str)

data_bookings = []
for _ in range(N_BOOKINGS):
    h = random.choice(hotels)
    base_p = BASE_PRICE_BY_RANGE[h["Price_Range"]]

    duration = random.randint(1, 12)
    # Dirty data (~2% of rows): negative stay duration. The amount below is
    # derived from the duration, so these rows also get a negative amount.
    if random.random() < 0.02:
        duration = -1

    # Nightly price times nights, with a +/-15% random variation. Rounding
    # happens before the missing-value injection so no NaN check is needed.
    amount = round(base_p * duration * random.uniform(0.85, 1.15), 2)
    # Dirty data (~4% of rows): missing amount.
    if random.random() < 0.04:
        amount = np.nan

    # Dirty data (~2% of rows): hotel ID that is not in the master table.
    hotel_id_val = h["Hotel_ID"]
    if random.random() < 0.02:
        hotel_id_val = "H_UNK"

    booking_id = f"BK_{fake.unique.random_int(10000, 99999)}"
    # Generate ONE name and apply a randomly chosen casing. Building three
    # names and discarding two gave the same distribution at triple the cost.
    customer_name = random.choice(NAME_CASINGS)(fake.name())

    data_bookings.append([
        booking_id,
        hotel_id_val,
        customer_name,
        fake.date_between(start_date='-1y', end_date='today'),
        duration,
        amount,
    ])

df_bookings = pd.DataFrame(
    data_bookings,
    columns=['Booking_ID', 'Hotel_ID', 'Customer_Name', 'Booking_Date', 'Nights', 'Total_Amount'],
)

# 3. FinTech Transactions Data
# Exactly one transaction is produced for every booking row. Only the booking
# ID and hotel ID are read from the bookings table, so those two columns are
# walked in lockstep.
data_trans = []
for booking_id, booking_hotel in zip(df_bookings['Booking_ID'], df_bookings['Hotel_ID']):
    # Weighted draw of the payer's country (IT is by far the most likely).
    (country,) = random.choices(["IT", "DE", "US", "RU", "UK", "FR"], weights=[65, 15, 8, 4, 4, 4])
    method = random.choice(["Credit Card", "PayPal", "Bank Transfer"])

    # A single uniform draw in [0, 1) is compared against each rule's
    # threshold; the first matching rule decides the status.
    risk_trigger = random.random()

    if booking_hotel == "SAN" and country in ("US", "RU") and risk_trigger > 0.75:
        # RULE 1: chargebacks on San Candido bookings paid from US/RU.
        status = "Chargeback"
    elif booking_hotel == "GAL" and method == "Credit Card" and risk_trigger > 0.90:
        # RULE 2: card-testing pattern, i.e. failed credit-card payments on
        # Gallipoli bookings (the amount itself is not inspected).
        status = "Failed"
    elif risk_trigger > 0.97:
        # RULE 3: general friction, a low failure chance for everyone.
        status = "Failed"
    else:
        status = "Success"

    transaction_id = f"TR_{fake.unique.random_int(100000, 999999)}"
    data_trans.append([transaction_id, booking_id, method, country, status, fake.ipv4()])

df_trans = pd.DataFrame(
    data_trans,
    columns=['Transaction_ID', 'Booking_ID', 'Payment_Method', 'Country', 'Status', 'IP_Address'],
)

# 4. Save to .csv
# Each table is written to its own file in the current working directory,
# without the DataFrame index column.
for csv_name, frame in (
    ("Hotel_Master.csv", df_master),
    ("Bookings_Raw.csv", df_bookings),
    ("Transactions_Fintech.csv", df_trans),
):
    frame.to_csv(csv_name, index=False)