<a href="https://colab.research.google.com/github/AnshuKamath/DB-Analytics-Assignment/blob/main/Section2_Python_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Section 2: Bookstore Management Information System
# Optimizing Operations and Customer Engagement through Data Analytics

In [1]:
# Part 1: Importing and Combining Data

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime

# Configure pandas console output: no cap on displayed columns, a wide
# 1000-character line, and floats rendered with two decimal places.
for display_option, display_value in {
    'display.max_columns': None,
    'display.width': 1000,
    'display.float_format': '{:.2f}'.format,
}.items():
    pd.set_option(display_option, display_value)

# Plot appearance: load matplotlib's 'ggplot' style sheet first, then apply
# seaborn's "whitegrid" style on top of it.
plt.style.use('ggplot')
sns.set(style="whitegrid")

# Step 1: Import the "bookstore_transactions.csv" data
# Sources are tried in order: the local CSV, the raw GitHub copy, and finally
# a randomly generated 100-row sample so the rest of the notebook can run.
print("Step 1: Importing the dataset...")
try:
    # Try to import from local file
    bookstore_data = pd.read_csv("bookstore_transactions.csv")
    print("Dataset loaded successfully from local file.")
except FileNotFoundError:
    # If file not found, try GitHub URL
    print("Local file not found. Attempting to load from GitHub...")
    github_url = "https://raw.githubusercontent.com/AnshuKamath/DB-Analytics-Assignment/main/bookstore_transactions.csv"
    try:
        bookstore_data = pd.read_csv(github_url)
        print("Dataset loaded successfully from GitHub.")
    except Exception as load_error:
        # `except Exception` (not a bare `except:`) so KeyboardInterrupt and
        # SystemExit still stop the run instead of silently producing
        # sample data. The cause is reported so a network or parsing
        # problem can be diagnosed.
        print("Error: Could not load dataset from GitHub either.")
        print(f"Reason: {type(load_error).__name__}: {load_error}")
        # Create a small sample dataset for demonstration purposes
        # NOTE(review): the recorded output of this cell shows the GitHub CSV
        # using headers such as 'Customer ID'; this sample uses snake_case
        # names instead - confirm which schema later cells expect.
        print("Creating sample dataset for demonstration...")
        bookstore_data = pd.DataFrame({
            'transaction_id': range(1, 101),
            'customer_id': np.random.randint(1, 501, 100),
            'book_id': np.random.randint(1, 1001, 100),
            'purchase_date': pd.date_range(start='2023-01-01', periods=100),
            'purchase_location': np.random.choice(['In-Store', 'Online'], 100),
            'quantity': np.random.randint(1, 6, 100),
            'unit_price': np.random.uniform(10, 50, 100),
            'payment_method': np.random.choice(['Cash', 'Card', 'Online'], 100),
            'loyalty_points_earned': np.random.randint(0, 500, 100),
            'promotional_offer': np.random.choice(['None', 'Buy One Get One Free', '50% Discount', 'Double Points'], 100)
        })
        print("Sample dataset created.")

# Quick overview of the freshly loaded transactions: overall dimensions first
print("\nBasic information about the dataset:")
print(f"Number of rows: {bookstore_data.shape[0]}")
print(f"Number of columns: {bookstore_data.shape[1]}")

# The remaining checks share one "heading, then value" pattern:
# a preview of the first rows, the column names, the dtypes, and the
# per-column count of missing values.
for overview_heading, overview_value in (
    ("\nFirst 5 rows of the dataset:", bookstore_data.head()),
    ("\nColumn names in the dataset:", bookstore_data.columns.tolist()),
    ("\nData types of each column:", bookstore_data.dtypes),
    ("\nMissing values in each column:", bookstore_data.isnull().sum()),
):
    print(overview_heading)
    print(overview_value)

# Step 2: Combine multiple data files (if available)
# Note: In a real scenario, multiple related files would be imported and combined
# For demonstration, missing files are replaced by generated sample dataframes

print("\nStep 2: Combining multiple data files...")


def _distinct_keys(transactions, key):
    """Return the sorted distinct non-null values of ``transactions[key]``.

    Numeric keys are cast to int, so IDs stored as floats (e.g. because the
    column contains NaN) yield integer labels. Non-numeric keys such as
    string IDs are kept as they are instead of raising on the int cast.
    """
    present = transactions[key].dropna()
    if pd.api.types.is_numeric_dtype(present):
        present = present.astype(int)
    return np.unique(present.to_numpy())


def _sample_customer_data(transactions):
    """Build one placeholder customer record per distinct 'customer_id'.

    Returns an empty DataFrame when ``transactions`` has no 'customer_id'
    column, so the merge step reports the missing key instead of the whole
    combination being aborted by a KeyError.
    """
    if 'customer_id' not in transactions.columns:
        return pd.DataFrame()
    customer_ids = _distinct_keys(transactions, 'customer_id')
    return pd.DataFrame({
        'customer_id': customer_ids,
        'customer_name': [f'Customer_{cust_id}' for cust_id in customer_ids],
        'email': [f'customer{cust_id}@example.com' for cust_id in customer_ids],
        'registration_date': pd.date_range(start='2020-01-01', periods=len(customer_ids)),
        'is_member': np.random.choice([True, False], len(customer_ids)),
    })


def _sample_book_data(transactions):
    """Build one placeholder inventory record per distinct 'book_id'.

    Returns an empty DataFrame when ``transactions`` has no 'book_id'
    column, for the same reason as ``_sample_customer_data``.
    """
    if 'book_id' not in transactions.columns:
        return pd.DataFrame()
    book_ids = _distinct_keys(transactions, 'book_id')
    return pd.DataFrame({
        'book_id': book_ids,
        'title': [f'Book_{bk_id}' for bk_id in book_ids],
        'author': [f'Author_{np.random.randint(1, 100)}' for _ in book_ids],
        'genre': np.random.choice(['Fiction', 'Non-Fiction', 'Science', 'History', 'Biography'], len(book_ids)),
        'price': np.random.uniform(10, 50, len(book_ids)),
        'stock': np.random.randint(0, 100, len(book_ids)),
    })


# Check if we have already combined data. At module/cell level locals() is
# the global namespace, so re-running this code skips the recombination.
if 'combined_data' not in locals():
    try:
        # Try to load additional related files (if they exist);
        # otherwise fall back to generated sample data.
        try:
            customer_data = pd.read_csv("customer_data.csv")
            print("Customer data loaded successfully.")
        except FileNotFoundError:
            print("Customer data file not found. Creating sample customer data...")
            customer_data = _sample_customer_data(bookstore_data)

        try:
            book_data = pd.read_csv("book_inventory.csv")
            print("Book inventory data loaded successfully.")
        except FileNotFoundError:
            print("Book inventory file not found. Creating sample book data...")
            book_data = _sample_book_data(bookstore_data)

        # Combine the dataframes with left joins so every transaction row is
        # kept even when it has no match in the lookup table.
        # First, merge transaction data with customer data
        print("Combining transaction data with customer data...")
        if 'customer_id' in bookstore_data.columns and 'customer_id' in customer_data.columns:
            combined_data = pd.merge(
                bookstore_data,
                customer_data,
                on='customer_id',
                how='left'
            )
            print("Transaction and customer data combined successfully.")
        else:
            combined_data = bookstore_data.copy()
            print("Cannot merge customer data due to missing common key.")

        # Then, merge with book data
        print("Combining with book inventory data...")
        if 'book_id' in combined_data.columns and 'book_id' in book_data.columns:
            combined_data = pd.merge(
                combined_data,
                book_data,
                on='book_id',
                how='left'
            )
            print("Book inventory data combined successfully.")
        else:
            print("Cannot merge book data due to missing common key.")

        print("Data combination complete.")
    except Exception as e:
        # Best-effort step: on any failure, report it and continue with the
        # un-merged transactions so the following code still has data.
        print(f"Error in combining data: {e}")
        combined_data = bookstore_data.copy()
        print("Using original dataset as combined data due to errors.")
else:
    print("Combined data already exists.")

# Summarise the combined dataset: its dimensions first
print("\nCombined Dataset Information:")
print(f"Number of rows: {combined_data.shape[0]}")
print(f"Number of columns: {combined_data.shape[1]}")

# Then a preview of the first rows followed by the full list of column names
for combined_heading, combined_value in (
    ("\nFirst 5 rows of the combined dataset:", combined_data.head()),
    ("\nColumn names in the combined dataset:", combined_data.columns.tolist()),
):
    print(combined_heading)
    print(combined_value)

# Step 3: Verify data integrity after combination
print("\nStep 3: Verifying data integrity after combination...")

# Null counts per column, and the same counts as a percentage of all rows
missing_values = combined_data.isna().sum()
missing_percentage = missing_values.div(len(combined_data)).mul(100)

print("\nMissing values in each column of the combined dataset:")
print(missing_values)
print("\nPercentage of missing values in each column:")
print(missing_percentage)

# Count rows that are exact repeats of an earlier row
duplicates = combined_data.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Descriptive statistics for the numeric columns, to spot anomalies
print("\nStatistical summary of numeric columns:")
numeric_cols = combined_data.select_dtypes(include=[np.number]).columns
numeric_summary = combined_data.loc[:, numeric_cols].describe()
print(numeric_summary)

# Write the combined dataset to CSV (index column omitted) for use in
# subsequent parts, then report the save and the end of this part.
combined_data.to_csv(path_or_buf="combined_bookstore_data.csv", index=False)
print(
    "\nCombined dataset saved as 'combined_bookstore_data.csv'",
    "\nData import and combination process completed successfully.",
    sep="\n",
)

Step 1: Importing the dataset...
Local file not found. Attempting to load from GitHub...
Dataset loaded successfully from GitHub.

Basic information about the dataset:
Number of rows: 3500
Number of columns: 14

First 5 rows of the dataset:
  Transaction ID Customer ID            Book Title          Author      Genre Purchase Method  Payment Method  Stock Before  Stock After  Loyalty Points        Promotion Applied  Discount Applied Order Status Restock Triggered
0       59b278fe      914e0b                  1984   George Orwell  Dystopian          Online            Cash            25           22              30  Buy 3 Books, Get 1 Free                 0    Completed                No
1       8f90d6f6      c8d80d              Becoming  Michelle Obama  Biography          Online  Online Payment            14           12              20  Buy 3 Books, Get 1 Free                 0    Completed                No
2       a116662e      dba9d8  The Midnight Library       Matt Haig    Fantasy 