<a href="https://colab.research.google.com/github/Fahad-Blog/Data-Science-Portfolio/blob/main/ecommerce_data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pandas numpy faker

Collecting faker
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-38.2.0-py3-none-any.whl (2.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-38.2.0


The step below creates the dummy (synthetic) dimension tables.


In [2]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import date, timedelta
import random

fake = Faker()

# --- 1. Dim_Date (Required for all facts) ---
def generate_dim_date(start_date, end_date):
    """Build the date dimension with one row per calendar day.

    Args:
        start_date: First day of the range (any value ``pd.date_range`` accepts).
        end_date: Last day of the range, inclusive.

    Returns:
        pd.DataFrame with the columns ``date_key`` (int in YYYYMMDD form),
        ``full_date`` (``datetime.date``), ``year``, ``month``,
        ``day_of_week`` (pandas convention, Monday=0 ... Sunday=6) and
        ``is_weekend`` (True when ``day_of_week`` is 5 or 6).
    """
    def _describe_day(day):
        # `day` is a pandas Timestamp produced by date_range.
        weekday = day.day_of_week
        return {
            'date_key': int(day.strftime('%Y%m%d')),
            'full_date': day.date(),
            'year': day.year,
            'month': day.month,
            'day_of_week': weekday,
            'is_weekend': weekday >= 5,  # 5=Sat, 6=Sun
        }

    calendar_days = pd.date_range(start_date, end_date)
    return pd.DataFrame([_describe_day(day) for day in calendar_days])

# --- 2. Dim_Product (50 SKUs) ---
def generate_dim_product(num_products=50):
    """Build the product dimension with randomly assigned categories and costs.

    Args:
        num_products: Number of products (SKUs) to create. Keys run from 1 to
            ``num_products``.

    Returns:
        pd.DataFrame with the columns ``product_key``, ``product_id`` (UUID from
        the module-level Faker instance), ``product_name``, ``sku``,
        ``category`` and ``unit_cost``. A random 4% sample of rows has
        ``unit_cost`` replaced by NaN to simulate a data-quality issue.
    """
    category_names = ['Apparel', 'Accessories', 'Home Goods', 'Electronics']
    category_weights = [0.4, 0.3, 0.2, 0.1]

    def _make_product(number):
        # Category first, then UUID, then cost: same draw order as a plain loop.
        chosen = np.random.choice(category_names, p=category_weights)
        return {
            'product_key': number,
            'product_id': fake.uuid4(),
            'product_name': f"{chosen} Product {number}",
            'sku': f"SKU-{1000 + number}",
            'category': chosen,
            'unit_cost': round(random.uniform(5.0, 50.0), 2),
        }

    products = pd.DataFrame([_make_product(n) for n in range(1, num_products + 1)])

    # Simulate a data quality issue: missing cost for a few products
    rows_without_cost = products.sample(frac=0.04).index
    products.loc[rows_without_cost, 'unit_cost'] = np.nan
    return products

# --- 3. Dim_Customer (200 Customers) ---
def generate_dim_customer(num_customers=200):
    """Build the customer dimension, including a few deliberate duplicates.

    Args:
        num_customers: Number of distinct customers to create. Keys run from 1
            to ``num_customers``.

    Returns:
        pd.DataFrame with the columns ``customer_key``, ``customer_id``,
        ``first_name``, ``last_name``, ``email``, ``city``, ``country`` and
        ``segment``. The rows at positions 5, 10 and 15 (those that exist) are
        appended again under fresh keys that continue after ``num_customers``,
        simulating duplicate customer records. With the default of 200
        customers the duplicates get keys 201-203.
    """
    segments = ['New', 'Returning', 'VIP', 'Lapsed']
    data = []
    for i in range(1, num_customers + 1):
        data.append({
            'customer_key': i,
            'customer_id': fake.uuid4(),
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'email': fake.email(),
            'city': fake.city(),
            'country': 'USA', # Keep simple for now
            'segment': np.random.choice(segments, p=[0.3, 0.4, 0.2, 0.1])
        })
    df = pd.DataFrame(data)

    # Simulate data governance issue: duplicate customer records.
    # Only duplicate positions that exist, so small tables do not raise.
    duplicate_positions = [pos for pos in (5, 10, 15) if pos < len(df)]
    if not duplicate_positions:
        return df

    duplicate_rows = df.iloc[duplicate_positions].copy()
    # New keys continue after the last generated key so that customer_key
    # stays unique for any num_customers, not just the default 200.
    first_new_key = num_customers + 1
    duplicate_rows['customer_key'] = np.arange(
        first_new_key, first_new_key + len(duplicate_positions)
    )
    return pd.concat([df, duplicate_rows], ignore_index=True)

# --- 4. Dim_Channel (Marketing Channels) ---
def generate_dim_channel():
    """Return the fixed marketing-channel dimension (six channels).

    Returns:
        pd.DataFrame with the columns ``channel_key``, ``channel_source``,
        ``channel_medium`` and ``campaign_name``, one row per channel with keys
        1 through 6.
    """
    field_names = ('channel_key', 'channel_source', 'channel_medium', 'campaign_name')
    channel_rows = (
        (1, 'Google', 'cpc', 'Brand_Search'),
        (2, 'Facebook', 'cpc', 'Retargeting_Ads'),
        (3, 'Google', 'organic', 'SEO'),
        (4, 'Email', 'newsletter', 'Weekly_Promo'),
        (5, 'Direct', 'none', 'Direct_Entry'),
        (6, 'Affiliate', 'referral', 'Top_Bloggers'),
    )
    # Expand each tuple into a record dict so the frame is built from records.
    records = [dict(zip(field_names, row)) for row in channel_rows]
    return pd.DataFrame(records)

# --- Run Dimension Generation ---
# Inclusive bounds of the calendar covered by Dim_Date (2025-10-01 through 2025-12-07).
start_date = date(2025, 10, 1)
end_date = date(2025, 12, 7)

# Build the four dimension tables as notebook-level DataFrames; the fact
# generation and export cells further down read these names directly.
df_dim_date = generate_dim_date(start_date, end_date)
df_dim_product = generate_dim_product()
df_dim_customer = generate_dim_customer()
df_dim_channel = generate_dim_channel()

# Row-count summary. The customer count includes the duplicate records that
# generate_dim_customer appends on purpose.
print(f"Generated {len(df_dim_date)} dates, {len(df_dim_product)} products, {len(df_dim_customer)} customers.")

Generated 68 dates, 50 products, 203 customers.


In [3]:
# Import necessary libraries (ensure pandas, numpy, faker are installed)
import pandas as pd
import numpy as np
from faker import Faker
import random

# Reinitialize Faker
fake = Faker()

# --- 5. Fact_Order & Fact_Order_Item (3,000 Orders) ---
def generate_fact_orders(df_dim_date, df_dim_customer, df_dim_channel, df_dim_product, num_orders=3000):
    """Generate the order-header fact table and its line-item fact table.

    Args:
        df_dim_date: Date dimension; its ``date_key`` values are sampled uniformly.
        df_dim_customer: Customer dimension; its ``customer_key`` values are
            sampled uniformly.
        df_dim_channel: Channel dimension; its ``channel_key`` values are sampled
            with the fixed weights below, so it must contain exactly six rows.
        df_dim_product: Product dimension; supplies ``product_key`` and
            ``unit_cost`` (used as the item unit price, 25.0 when missing).
        num_orders: Number of order headers to create.

    Returns:
        Tuple ``(df_fact_order, df_fact_order_item)``. Each order has 1-3 items
        with quantity 1-4. ``tax_amount`` is 8% of the discounted subtotal and
        ``total_order_amount`` is discounted subtotal + tax + shipping.
        ``discount_amount`` is capped at the order subtotal so that tax and
        total can never become negative.

    Raises:
        ValueError: From ``np.random.choice`` if the number of channels does not
            match the six channel probabilities.
    """
    # --- 5a. Setup Valid Foreign Keys ---
    valid_dates = df_dim_date['date_key'].tolist()
    valid_customers = df_dim_customer['customer_key'].tolist()
    valid_channels = df_dim_channel['channel_key'].tolist()
    valid_products = df_dim_product['product_key'].tolist()

    # 6 Channel Keys require 6 probabilities that sum to 1.0:
    channel_probabilities = [0.2, 0.15, 0.1, 0.1, 0.35, 0.1]

    # --- 5b. Fact_Order generation (amounts are filled in during 5c) ---
    order_data = []
    for i in range(1, num_orders + 1):
        date_key = random.choice(valid_dates)
        customer_key = random.choice(valid_customers)
        channel_key = np.random.choice(valid_channels, p=channel_probabilities)

        order_data.append({
            'order_key': i,
            'order_id': fake.bothify(text='ORD-########'),
            'date_key': date_key,
            'customer_key': customer_key,
            'channel_key': channel_key,
            'total_order_amount': 0.0, # Will be updated later
            'tax_amount': 0.0, # Will be updated later
            'shipping_cost': np.random.choice([0.0, 5.0, 10.0], p=[0.7, 0.2, 0.1]),
            # 20% of orders carry a discount of up to 50.0
            'discount_amount': round(random.uniform(0.0, 50.0), 2) if random.random() < 0.2 else 0.0
        })

    df_fact_order = pd.DataFrame(order_data)
    if df_fact_order.empty:
        # Nothing to itemize; keep the (empty) return shape.
        return df_fact_order, pd.DataFrame([])

    # --- 5c. Fact_Order_Item generation ---
    # Dictionary for quick lookup of product costs (values may be NaN).
    product_costs = df_dim_product.set_index('product_key')['unit_cost'].to_dict()

    item_data = []
    order_item_key = 1
    # Per-order results are collected and written back as whole columns once,
    # instead of issuing two .loc writes per row.
    applied_discounts = []
    tax_amounts = []
    total_amounts = []

    for order in order_data:
        num_items = random.randint(1, 3)
        order_total_gross = 0.0

        for _ in range(num_items):
            product_key = random.choice(valid_products)
            quantity = random.randint(1, 4)

            # Fall back to 25.0 when the product has no cost (the simulated
            # data-quality issue in Dim_Product).
            unit_price = product_costs.get(product_key)
            if pd.isna(unit_price):
                unit_price = 25.0

            item_revenue = round(quantity * unit_price, 2)
            order_total_gross += item_revenue

            item_data.append({
                'order_item_key': order_item_key,
                'order_key': order['order_key'],
                'product_key': product_key,
                'quantity': quantity,
                'unit_price': unit_price,
                'gross_revenue': item_revenue
            })
            order_item_key += 1

        # A discount can never exceed what was actually ordered; without this
        # cap small orders would get a negative taxable amount and total.
        discount = round(min(order['discount_amount'], order_total_gross), 2)
        total_after_discount = max(order_total_gross - discount, 0.0)
        tax = round(total_after_discount * 0.08, 2)
        final_total = total_after_discount + tax + order['shipping_cost']

        applied_discounts.append(discount)
        tax_amounts.append(tax)
        total_amounts.append(round(final_total, 2))

    df_fact_order['discount_amount'] = applied_discounts
    df_fact_order['tax_amount'] = tax_amounts
    df_fact_order['total_order_amount'] = total_amounts

    df_fact_order_item = pd.DataFrame(item_data)

    return df_fact_order, df_fact_order_item

# --- 6. Fact_Inventory (Daily snapshot per product) ---
def generate_fact_inventory(df_dim_date, df_dim_product):
    """Generate a daily inventory snapshot for every (date, product) pair.

    Args:
        df_dim_date: Date dimension; only its ``date_key`` column is read.
        df_dim_product: Product dimension; only its ``product_key`` column is read.

    Returns:
        pd.DataFrame with one row per date/product combination (dates in the
        outer order, products in the inner order) and the columns
        ``inventory_key`` (1-based running number), ``date_key``,
        ``product_key``, ``quantity_on_hand`` (100 plus a random change between
        -20 and +10, floored at 0), ``quantity_in_transit`` (0-50) and
        ``reorder_point`` (always 50). Empty if either input is empty.
    """
    inventory_data = []
    if df_dim_date.empty or df_dim_product.empty:
        return pd.DataFrame(inventory_data)

    # Read the key columns directly. Iterating rows with iterrows() would build
    # a Series per row and can upcast integer date keys to float when the frame
    # also holds float columns.
    date_keys = df_dim_date['date_key'].tolist()
    product_keys = df_dim_product['product_key'].tolist()

    initial_stock = 100
    inventory_key = 1

    for date_key in date_keys:
        for product_key in product_keys:
            # Simple stock fluctuation simulation
            quantity_on_hand = max(0, initial_stock + random.randint(-20, 10))

            inventory_data.append({
                'inventory_key': inventory_key,
                'date_key': date_key,
                'product_key': product_key,
                'quantity_on_hand': quantity_on_hand,
                'quantity_in_transit': random.randint(0, 50),
                'reorder_point': 50
            })
            inventory_key += 1

    return pd.DataFrame(inventory_data)

# --- 7. Fact_Traffic (Simulated Google Analytics hits) ---
def generate_fact_traffic(df_dim_date, df_dim_customer, df_dim_channel, num_sessions=50000):
    """Generate simulated web-session (traffic) facts.

    Args:
        df_dim_date: Date dimension; its ``date_key`` values are sampled uniformly.
        df_dim_customer: Customer dimension; its ``customer_key`` values are
            sampled uniformly for the 90% of sessions that are identified.
        df_dim_channel: Channel dimension; its ``channel_key`` values are
            sampled uniformly.
        num_sessions: Number of session rows to create.

    Returns:
        pd.DataFrame with the columns ``traffic_key``, ``date_key``,
        ``customer_key``, ``channel_key``, ``session_id``, ``page_views``,
        ``session_duration_seconds`` and ``bounce_rate``. ``customer_key`` is
        numeric: NaN for roughly 10% of sessions (unidentified visitors), and a
        further random 1% of rows are overwritten with 0 to simulate an invalid
        key coming out of an ETL step.

    Raises:
        ValueError: If the customer probabilities do not sum to 1.0.
    """
    traffic_data = []

    valid_dates = df_dim_date['date_key'].tolist()
    valid_channels = df_dim_channel['channel_key'].tolist()
    valid_customers = df_dim_customer['customer_key'].tolist()
    num_customers = len(valid_customers)

    # Draw a *position* rather than a value: positions 0..N-1 map to customers
    # and position N means "unidentified". Passing a list that mixes integer
    # keys with a string placeholder to np.random.choice would coerce every
    # key to a string.
    if num_customers:
        probabilities = [0.9 / num_customers] * num_customers + [0.1]
    else:
        # No customers to pick from: every session is unidentified.
        probabilities = [1.0]

    # Sanity Check: Ensure probabilities sum to 1.0
    if not 0.999 < sum(probabilities) < 1.001:
        raise ValueError("Traffic probabilities do not sum to 1.0")

    for i in range(1, num_sessions + 1):
        channel_key = random.choice(valid_channels)
        date_key = random.choice(valid_dates)

        position = np.random.choice(num_customers + 1, p=probabilities)
        if position == num_customers:
            customer_key = np.nan
        else:
            customer_key = valid_customers[position]

        traffic_data.append({
            'traffic_key': i,
            'date_key': date_key,
            'customer_key': customer_key, # This contains NaNs, ready for cleaning!
            'channel_key': channel_key,
            'session_id': fake.uuid4(),
            'page_views': random.randint(1, 10),
            'session_duration_seconds': random.randint(5, 300),
            'bounce_rate': round(random.random(), 2)
        })

    df = pd.DataFrame(traffic_data)
    # Simulate a key ETL issue: some customer keys are explicitly set to 0/invalid (not NaN)
    df.loc[df.sample(frac=0.01).index, 'customer_key'] = 0
    return df

# --- Run Fact Generation ---
# Uses the dimension DataFrames (df_dim_date, df_dim_customer, df_dim_channel,
# df_dim_product) created by the dimension cell above, so that cell must have
# been run first in the same kernel.
# Orders and order items are produced together so item rows reference valid order keys.
df_fact_order, df_fact_order_item = generate_fact_orders(df_dim_date, df_dim_customer, df_dim_channel, df_dim_product)
df_fact_inventory = generate_fact_inventory(df_dim_date, df_dim_product)
df_fact_traffic = generate_fact_traffic(df_dim_date, df_dim_customer, df_dim_channel)

# Row-count summary of the four fact tables.
print(f"Generated {len(df_fact_order)} orders, {len(df_fact_order_item)} order items, {len(df_fact_inventory)} inventory records, and {len(df_fact_traffic)} traffic sessions.")

Generated 3000 orders, 6067 order items, 3400 inventory records, and 50000 traffic sessions.


Export the generated DataFrames to CSV files

In [4]:
# --- DEBUGGING VERSION ---
# Target location for the exported CSV files.
# NOTE(review): this is a Windows-style path; the upload cell later in this
# notebook reads from '/content/C:', which suggests that on the Colab runtime
# it is resolved relative to the working directory — confirm.
output_dir = 'C:/ecommerce_data'
import os

print(f"Attempting to create directory: {output_dir}")
os.makedirs(output_dir, exist_ok=True)
print("Directory creation check passed.")

# --- Export DataFrames to CSV ---
# NOTE: the f-strings below join output_dir and the file name WITHOUT a path
# separator, so the files are written as 'C:/ecommerce_datadim_date.csv' etc.
# (next to the directory created above, not inside it). The bulk-upload cell
# further down expects exactly these 'ecommerce_data<name>.csv' file names, so
# the two cells must be changed together if this is ever corrected.
df_dim_date.to_csv(f'{output_dir}dim_date.csv', index=False)
df_dim_product.to_csv(f'{output_dir}dim_product.csv', index=False)
df_dim_customer.to_csv(f'{output_dir}dim_customer.csv', index=False)
# The channel dimension is exported under the name 'dim_channel_n'.
df_dim_channel.to_csv(f'{output_dir}dim_channel_n.csv', index=False)

df_fact_order.to_csv(f'{output_dir}fact_order.csv', index=False)
df_fact_order_item.to_csv(f'{output_dir}fact_order_item.csv', index=False)
df_fact_inventory.to_csv(f'{output_dir}fact_inventory.csv', index=False)
df_fact_traffic.to_csv(f'{output_dir}fact_traffic.csv', index=False)

# These messages print output_dir itself; see the note above for where the
# files actually land.
print(f"\nAll raw data files exported successfully to: {output_dir}")

print(f"\nSUCCESS! Check this exact path for files: {output_dir}")

Attempting to create directory: C:/ecommerce_data
Directory creation check passed.

All raw data files exported successfully to: C:/ecommerce_data

SUCCESS! Check this exact path for files: C:/ecommerce_data


# Task
Upload the `df_dim_channel` DataFrame to a BigQuery table named `dim_channel`. Specify your Google Cloud `project_id` and the `dataset_id` where the table should be created.

## Authenticate to Google Cloud

### Subtask:
Authenticate your Google Cloud account to allow access to BigQuery.


**Reasoning**:
The subtask requires authentication to Google Cloud, which can be done by importing the `google.colab.auth` module and calling its `authenticate_user()` function.



In [5]:
# Colab-only step: authenticate the Google account for this runtime so that the
# BigQuery upload further down is allowed to access the project.
from google.colab import auth
auth.authenticate_user()

# Task
Install the `google-cloud-bigquery` library and then upload the `df_dim_channel` DataFrame to a BigQuery table named `dim_channel` within your specified "project_id" and "dataset_id".

## Install BigQuery Library

### Subtask:
Install the `google-cloud-bigquery` library to enable interaction with Google BigQuery.


**Reasoning**:
To interact with Google BigQuery, the `google-cloud-bigquery` library needs to be installed using pip.



In [6]:
# Install the BigQuery client library by running pip in a shell from the kernel.
# NOTE(review): the upload cell also imports pandas_gbq, which is not installed
# here — presumably it is preinstalled on the runtime; confirm.
get_ipython().system('pip install google-cloud-bigquery')



In [7]:
from google.cloud import bigquery
import pandas as pd
import pandas_gbq
import os

# --- 1. CONFIGURATION ---
project_id = 'ecommerce-data-480511'       # Your Google Cloud Project ID
dataset_id = 'ecommerce_data_17021995'     # Your BigQuery Dataset ID
# IMPORTANT: Update this path to the exact directory where you saved the CSVs
# ('/content/C:' matches where the export cell above wrote its files).
local_csv_path = '/content/C:'
# NOTE: Ensure you have the necessary authentication set up (e.g., gcloud auth application-default login)

# --- 2. LIST OF TABLES TO UPLOAD ---
# Despite its name this is a plain list, not a mapping: each entry is used both
# as the CSV base name ("<entry>.csv") and as the target BigQuery table name.
# The 'ecommerce_data' prefix comes from the file names produced by the export
# cell, so the BigQuery tables carry that prefix too.
csv_to_table_map = [
    'ecommerce_datadim_date',
    'ecommerce_datadim_product',
    'ecommerce_datadim_customer',
    'ecommerce_datadim_channel_n',
    'ecommerce_datafact_order',
    'ecommerce_datafact_order_item',
    'ecommerce_datafact_inventory',
    'ecommerce_datafact_traffic',
]

# Initialize a BigQuery client
# NOTE(review): `client` is not referenced again in this cell — the upload below
# goes through pandas_gbq. Check later cells before removing it.
client = bigquery.Client(project=project_id)

# --- 3. UPLOAD PROCESS ---
print(f"üöÄ Starting bulk upload to BigQuery Dataset: {dataset_id}")
print("-" * 50)

for table_name in csv_to_table_map:
    csv_file = f"{table_name}.csv"
    full_csv_path = os.path.join(local_csv_path, csv_file)
    # Fully qualified destination: <project>.<dataset>.<table>
    full_table_id = f"{project_id}.{dataset_id}.{table_name}"

    try:
        # Step A: Read the local CSV file into a pandas DataFrame
        # Column types are inferred by read_csv; no explicit schema is passed.
        df_to_upload = pd.read_csv(full_csv_path)

        # Step B: Upload the DataFrame to BigQuery
        # if_exists='replace' ensures a clean load for the raw data layer
        pandas_gbq.to_gbq(
            df_to_upload,
            full_table_id,
            project_id=project_id,
            if_exists='replace',
            progress_bar=True
        )

        print(f"‚úÖ Successfully uploaded {csv_file} ({len(df_to_upload):,} rows) to: {full_table_id}")

    except FileNotFoundError:
        # A missing CSV is reported and skipped; the loop continues.
        print(f"‚ùå Error: CSV file not found at {full_csv_path}. Skipping.")
    except Exception as e:
        # Best-effort bulk load: any other failure is printed and the loop
        # moves on, so the final "complete" message does not imply that every
        # table was uploaded.
        print(f"‚ùå Failed to upload {csv_file}. Error: {e}")

print("-" * 50)
print("‚úÖ Bulk upload process complete.")

üöÄ Starting bulk upload to BigQuery Dataset: ecommerce_data_17021995
--------------------------------------------------


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 10155.70it/s]


‚úÖ Successfully uploaded ecommerce_datadim_date.csv (68 rows) to: ecommerce-data-480511.ecommerce_data_17021995.ecommerce_datadim_date


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 9686.61it/s]


‚úÖ Successfully uploaded ecommerce_datadim_product.csv (50 rows) to: ecommerce-data-480511.ecommerce_data_17021995.ecommerce_datadim_product


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 11397.57it/s]


‚úÖ Successfully uploaded ecommerce_datadim_customer.csv (203 rows) to: ecommerce-data-480511.ecommerce_data_17021995.ecommerce_datadim_customer


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 13573.80it/s]


‚úÖ Successfully uploaded ecommerce_datadim_channel_n.csv (6 rows) to: ecommerce-data-480511.ecommerce_data_17021995.ecommerce_datadim_channel_n


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 8289.14it/s]


‚úÖ Successfully uploaded ecommerce_datafact_order.csv (3,000 rows) to: ecommerce-data-480511.ecommerce_data_17021995.ecommerce_datafact_order


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 12336.19it/s]


‚úÖ Successfully uploaded ecommerce_datafact_order_item.csv (6,067 rows) to: ecommerce-data-480511.ecommerce_data_17021995.ecommerce_datafact_order_item


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 13486.51it/s]


‚úÖ Successfully uploaded ecommerce_datafact_inventory.csv (3,400 rows) to: ecommerce-data-480511.ecommerce_data_17021995.ecommerce_datafact_inventory


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 13273.11it/s]

‚úÖ Successfully uploaded ecommerce_datafact_traffic.csv (50,000 rows) to: ecommerce-data-480511.ecommerce_data_17021995.ecommerce_datafact_traffic
--------------------------------------------------
‚úÖ Bulk upload process complete.





In [10]:
import os

# Settings shared by the two dbt configuration files written below.
PROJECT_ID = "ecommerce-data-480511"
PROFILE_NAME = "ecommerce_portfolio"
# Relative path: resolved against the kernel's current working directory.
PROJECT_DIR = "my_portfolio"


# Make sure the dbt profiles directory exists in the home directory.
!mkdir -p ~/.dbt

# profiles.yml: one BigQuery target ("dev") using the oauth method.
profiles_content = f"""
{PROFILE_NAME}:
  target: dev
  outputs:
    dev:
      type: bigquery
      method: oauth
      project: {PROJECT_ID}
      dataset: analytics_portfolio
      threads: 4
      timeout_seconds: 300
      location: US
"""

# Opened in 'w' mode, so any existing profiles.yml is overwritten.
with open(os.path.expanduser('~/.dbt/profiles.yml'), 'w') as f:
    f.write(profiles_content)
print("‚úÖ profiles.yml recreated successfully.")


# Create the project folder (and its models/ subfolder) if missing.
os.makedirs(f"{PROJECT_DIR}/models", exist_ok=True)

# dbt_project.yml: its `profile` entry is filled from PROFILE_NAME so it
# matches the profile written above.
dbt_project_content = f"""
name: 'my_portfolio'
version: '1.0.0'
config-version: 2

# This MUST match the profile name defined above
profile: '{PROFILE_NAME}'

model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

target-path: "target"
clean-targets:
  - "target"
  - "dbt_packages"

models:
  my_portfolio:
    staging:
      +materialized: view
    marts:
      +materialized: table
"""

with open(f"{PROJECT_DIR}/dbt_project.yml", 'w') as f:
    f.write(dbt_project_content)
print("‚úÖ dbt_project.yml recreated successfully.")


print("\nüîÑ Running dbt debug...")

# NOTE: %cd changes the kernel's working directory for all later cells. Because
# PROJECT_DIR is relative, running this cell again from inside the project
# creates a nested my_portfolio/my_portfolio (the recorded output of this cell
# shows exactly that path). Restart the kernel or cd back before re-running.
%cd {PROJECT_DIR}
# Validate the profile, project file and BigQuery connection.
!dbt debug

‚úÖ profiles.yml recreated successfully.
‚úÖ dbt_project.yml recreated successfully.

üîÑ Running dbt debug...
/content/my_portfolio/my_portfolio
[0m17:43:12  Running with dbt=1.11.0-rc2
[0m17:43:12  dbt version: 1.11.0-rc2
[0m17:43:12  python version: 3.12.12
[0m17:43:12  python path: /usr/bin/python3
[0m17:43:12  os info: Linux-6.6.105+-x86_64-with-glibc2.35
[0m17:43:18  Using profiles dir at /root/.dbt
[0m17:43:18  Using profiles.yml file at /root/.dbt/profiles.yml
[0m17:43:18  Using dbt_project.yml file at /content/my_portfolio/my_portfolio/dbt_project.yml
[0m17:43:18  adapter type: bigquery
[0m17:43:18  adapter version: 1.10.3
[0m17:43:19  Configuration:
[0m17:43:19    profiles.yml file [[32mOK found and valid[0m]
[0m17:43:19    dbt_project.yml file [[32mOK found and valid[0m]
[0m17:43:19  Required dependencies:
[0m17:43:19   - git [[32mOK found[0m]

[0m17:43:19  Connection:
[0m17:43:19    method: oauth
[0m17:43:19    database: ecommerce-data-480511
[0m17