In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import os
import warnings
warnings.filterwarnings('ignore')

# Configuration: how many records to generate for each table
CONFIG = dict(
    LIBRARIES_COUNT=10,
    USERS_COUNT=500,
    BOOKS_COUNT=1000,
    LOANS_COUNT=300,
    HOLDS_COUNT=100,
)

# Initialize the fake-data generator and the directory the CSV files are written to
fake = Faker()
output_dir = 'data/'
# exist_ok=True replaces the separate exists() check: it is a no-op when the
# directory is already there and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Generate Categories first: the book generator below assigns each book a category_id
# taken from this table. Ids are assigned 1..N in the order listed here.
category_rows = [
    ("Science Fiction", "Futuristic and science-based fiction"),
    ("Mystery", "Crime and detective stories"),
    ("Romance", "Love stories and relationships"),
    ("Technology", "Books about technology and computing"),
    ("Biography", "Life stories and autobiographies"),
    ("History", "Historical events and periods"),
    ("Self Help", "Personal development and improvement"),
    ("Business", "Business and economics topics"),
    ("Literature", "Classic and contemporary literature"),
    ("Children", "Books for children"),
]
categories = pd.DataFrame([
    {'category_id': number, 'category_name': name, 'description': blurb}
    for number, (name, blurb) in enumerate(category_rows, start=1)
])
categories.to_csv(f"{output_dir}categories.csv", index=False)

# Generate Libraries: one row per library, named "<random city> <random library type>"
library_types = ["Public Library", "Municipal Library", "State Library", "Research Library"]

library_rows = []
for library_number in range(1, CONFIG['LIBRARIES_COUNT'] + 1):
    city = fake.city()
    library_type = random.choice(library_types)
    # Faker addresses span several lines; flatten them so each fits one CSV cell
    street_address = fake.address().replace('\n', ', ')
    library_rows.append({
        'library_id': library_number,
        'library_name': f"{city} {library_type}",
        'address': street_address,
        'phone': fake.phone_number(),
        'email': fake.email()
    })

libraries = pd.DataFrame(library_rows)
libraries.to_csv(f"{output_dir}libraries.csv", index=False)

# Generate Books
# Each book gets a uniformly random category. The ids are listed once up front and
# drawn from directly; previously every iteration rebuilt the list of category names
# and then filtered the categories table to map the chosen name back to its id.
category_ids = categories['category_id'].tolist()

books = []
for book_id in range(1, CONFIG['BOOKS_COUNT'] + 1):
    category_id = random.choice(category_ids)

    books.append({
        'book_id': book_id,
        'title': fake.catch_phrase(),
        'author': fake.name(),
        'isbn': fake.isbn13(),
        'publication_year': random.randint(1990, 2023),
        'category_id': category_id,
        'description': fake.text(max_nb_chars=200)
    })

books_df = pd.DataFrame(books)
books_df.to_csv(f"{output_dir}books.csv", index=False)

# Generate Users: every user gets a username and an email that are unique in the file
users = []
used_usernames = set()
used_emails = set()

while len(users) < CONFIG['USERS_COUNT']:
    # Derive the base handle from the full name: lower-case, spaces -> underscores
    full_name = fake.name()
    base_username = '_'.join(full_name.lower().split(' '))

    # On a username clash append _1, _2, ... until the name is free
    username = base_username
    username_suffix = 0
    while username in used_usernames:
        username_suffix += 1
        username = f"{base_username}_{username_suffix}"

    # On an email clash retry with a numeric suffix and a freshly drawn domain
    email = f"{base_username}@{fake.free_email_domain()}"
    email_suffix = 0
    while email in used_emails:
        email_suffix += 1
        email = f"{base_username}{email_suffix}@{fake.free_email_domain()}"

    # Remember both values so later users cannot reuse them
    used_usernames.add(username)
    used_emails.add(email)

    # Registration falls somewhere within the last year
    registered_at = fake.date_time_between(start_date='-1y', end_date='now')

    # user_id is the 1-based position of the record in the list
    users.append({
        'user_id': len(users) + 1,
        'username': username,
        'email': email,
        'full_name': full_name,
        'registration_date': registered_at.strftime('%Y-%m-%d %H:%M:%S')
    })

users_df = pd.DataFrame(users)
users_df.to_csv(f"{output_dir}users.csv", index=False)

# Generate Library Books: the per-library inventory (which titles, how many copies)
library_books = []

# Each library stocks between 60% and 80% of the full catalogue
min_titles = int(0.6 * len(books_df))
max_titles = int(0.8 * len(books_df))

for _, library in libraries.iterrows():
    stocked_titles = books_df.sample(n=random.randint(min_titles, max_titles))

    for _, book in stocked_titles.iterrows():
        # 1-5 copies owned, of which 0..quantity are currently on the shelf
        quantity = random.randint(1, 5)
        library_books.append({
            # ids are the 1-based position of the row across all libraries
            'library_book_id': len(library_books) + 1,
            'library_id': library['library_id'],
            'book_id': book['book_id'],
            'quantity': quantity,
            'available_quantity': random.randint(0, quantity)
        })

library_books_df = pd.DataFrame(library_books)
library_books_df.to_csv(f"{output_dir}library_books.csv", index=False)

# Generate Loans
# Each loan starts within the last 60 days and is due 14 days later. About 70% of
# the loans that could already have been returned are marked RETURNED; the rest are
# OVERDUE or ACTIVE depending on whether the due date has passed.
loans = []
loan_id = 1

# Neither the user list nor the inventory changes while loans are generated, so
# build the candidate ids once instead of re-filtering on every iteration.
loan_user_ids = [user['user_id'] for user in users]
loanable_library_book_ids = library_books_df.loc[
    library_books_df['available_quantity'] > 0, 'library_book_id'
].tolist()
now = datetime.now()

for _ in range(CONFIG['LOANS_COUNT']):
    loan_date = fake.date_time_between(
        start_date='-60d',
        end_date='now'
    )
    due_date = loan_date + timedelta(days=14)

    # A return can only have happened on a day that is already in the past.
    # Previously loan_date + 1..14 days could land after "now", which produced
    # RETURNED loans with a future return_date.
    max_return_days = min(14, (now - loan_date).days)
    is_returned = max_return_days >= 1 and random.random() < 0.7  # 70% chance
    if is_returned:
        return_date = loan_date + timedelta(days=random.randint(1, max_return_days))
        status = 'RETURNED'
    else:
        return_date = None
        status = 'OVERDUE' if now > due_date else 'ACTIVE'

    # Only library books with at least one available copy can be loaned;
    # if there are none, no loan is recorded for this iteration.
    if loanable_library_book_ids:
        loans.append({
            'loan_id': loan_id,
            'user_id': random.choice(loan_user_ids),
            'library_book_id': random.choice(loanable_library_book_ids),
            'loan_date': loan_date.strftime('%Y-%m-%d %H:%M:%S'),
            'due_date': due_date.strftime('%Y-%m-%d %H:%M:%S'),
            'return_date': return_date.strftime('%Y-%m-%d %H:%M:%S') if return_date else None,
            'status': status
        })
        loan_id += 1

# Explicit columns keep the CSV header even when no loan could be generated
loans_df = pd.DataFrame(loans, columns=[
    'loan_id', 'user_id', 'library_book_id',
    'loan_date', 'due_date', 'return_date', 'status'
])
loans_df.to_csv(f"{output_dir}loans.csv", index=False)

# Generate Holds
# A hold is a unique (user, library book) pair placed on a library book with at
# most one available copy. queue_position counts holds per library book in the
# order they are generated here.
holds = []
hold_id = 1
used_combinations = set()  # To track unique user-book combinations

# Neither input changes during generation, so compute the candidates once.
hold_user_ids = [user['user_id'] for user in users]
low_availability_ids = library_books_df.loc[
    library_books_df['available_quantity'] <= 1, 'library_book_id'
].tolist()

# The loop below keeps drawing until it has enough unique pairs. Cap the target at
# the number of pairs that actually exist; otherwise it would never terminate when
# there are no candidates or when HOLDS_COUNT exceeds the possible combinations.
max_unique_holds = len(set(hold_user_ids)) * len(set(low_availability_ids))
target_holds = min(CONFIG['HOLDS_COUNT'], max_unique_holds)
if target_holds < CONFIG['HOLDS_COUNT']:
    print(
        f"Warning: only {target_holds} unique holds are possible "
        f"(requested {CONFIG['HOLDS_COUNT']})"
    )

# Running hold count per library book; replaces rescanning `holds` for every new hold
holds_per_library_book = {}

while len(holds) < target_holds:
    user_id = random.choice(hold_user_ids)
    held_library_book_id = random.choice(low_availability_ids)

    # Check for unique user-book combination
    combination = (user_id, held_library_book_id)
    if combination in used_combinations:
        continue
    used_combinations.add(combination)

    hold_date = fake.date_time_between(
        start_date='-14d',
        end_date='now'
    )
    expiry_date = hold_date + timedelta(days=7)

    # Next position in this library book's queue
    queue_position = holds_per_library_book.get(held_library_book_id, 0) + 1
    holds_per_library_book[held_library_book_id] = queue_position

    holds.append({
        'hold_id': hold_id,
        'user_id': user_id,
        'library_book_id': held_library_book_id,
        'hold_date': hold_date.strftime('%Y-%m-%d %H:%M:%S'),
        'expiry_date': expiry_date.strftime('%Y-%m-%d %H:%M:%S'),
        'status': random.choice(['PENDING', 'FULFILLED', 'EXPIRED']),
        'queue_position': queue_position
    })
    hold_id += 1

# Explicit columns keep the CSV header even when no hold could be generated
holds_df = pd.DataFrame(holds, columns=[
    'hold_id', 'user_id', 'library_book_id',
    'hold_date', 'expiry_date', 'status', 'queue_position'
])
holds_df.to_csv(f"{output_dir}holds.csv", index=False)

# Print summary of generated data: one "<table>: <n> records" line per table
print("Data Generation Summary:")
summary_counts = [
    ("Libraries", len(libraries)),
    ("Categories", len(categories)),
    ("Books", len(books_df)),
    ("Users", len(users)),
    ("Library Books", len(library_books_df)),
    ("Loans", len(loans_df)),
    ("Holds", len(holds_df)),
]
for table_label, record_count in summary_counts:
    print(f"{table_label}: {record_count} records")

Data Generation Summary:
Libraries: 10 records
Categories: 10 records
Books: 1000 records
Users: 500 records
Library Books: 7127 records
Loans: 300 records
Holds: 100 records
