In [3]:
pip install pandas numpy Faker


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting Faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Collecting numpy
  Using cached numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m36m0:00:01[0m:01[0m
[?25hUsing cached numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Do

In [1]:
import pandas as pd
import numpy as np
import json
from faker import Faker
import random
import os

fake = Faker()

# --- REPRODUCIBILITY ---
# The generators below draw from three independent random sources: numpy's
# global state, the stdlib `random` module, and Faker's shared generator.
# Seeding all three makes the random draws repeatable across kernel restarts.
# Note: transaction dates are generated relative to 'today', so that column
# still shifts with the day the notebook is run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
Faker.seed(SEED)

# --- PARAMETERS ---
transactions_rows = 220_000  # 1/10th of 2.2M
items_rows = 500             # reasonable number for items
cities_rows = 100            # cities metadata

# --- Generate items.csv ---
def generate_items(n):
    """Build a deliberately dirty item catalogue as a pandas DataFrame.

    Args:
        n: Number of base items to generate. Item ids run from 1000 to
            1000 + n - 1 before any nulls are injected.

    Returns:
        A DataFrame with the columns item_id, item_name, category, supplier,
        price_usd, weight_kg, color, warranty_years, release_year and rating.
        Roughly 5% of the values in *every* column (item_id included) are
        nulled out, and a 2% sample of the rows is appended again as
        duplicates, so the frame has more than ``n`` rows. The integer
        columns use the nullable ``Int64`` dtype.
    """
    categories = ['Stationery', 'Electronics', 'Home & Kitchen', 'Toys', 'Books', 'Clothing']
    suppliers = [fake.company() for _ in range(50)]
    data = {
        'item_id': list(range(1000, 1000 + n)),
        'item_name': [fake.word().capitalize() for _ in range(n)],
        'category': np.random.choice(categories, n),
        'supplier': np.random.choice(suppliers, n),
        'price_usd': np.round(np.random.uniform(1, 1000, n), 2),
        'weight_kg': np.round(np.random.uniform(0.01, 10, n), 3),
        'color': np.random.choice(['Red', 'Green', 'Blue', 'Black', 'White', 'Yellow'], n),
        'warranty_years': np.random.choice([0, 1, 2, 3, 5], n),
        'release_year': np.random.choice(range(2000, 2024), n),
        'rating': np.round(np.random.uniform(1, 5, n), 1)
    }

    # Type enforcement comes first: writing NaN into a plain int64 column only
    # works through pandas' silent upcast to float, which is deprecated. The
    # nullable Int64 dtype stores the missing values directly.
    df = pd.DataFrame(data).astype({
        'item_id': 'Int64',
        'release_year': 'Int64',
        'warranty_years': 'Int64',
    })

    # Introduce nulls randomly (an independent 5% sample of rows per column)
    for col in df.columns:
        df.loc[df.sample(frac=0.05).index, col] = np.nan

    # Introduce duplicates; ignore_index=True already yields a clean 0..N-1 index
    df = pd.concat([df, df.sample(frac=0.02)], ignore_index=True)

    return df

# --- Generate cities.json ---
def generate_cities(n):
    """Build a deliberately dirty list of city records ready for json.dump.

    Args:
        n: Number of base cities to generate. City ids run from 100 to
            100 + n - 1.

    Returns:
        A list of dicts with the keys city_id, city_name, state, country,
        population, area_sq_km, average_income_usd, founded_year, time_zone
        and climate. Up to 5 records get a null average_income_usd, up to 3
        get the invalid state code 'ZZ', and up to 3 records are appended a
        second time as duplicates (the same dict objects, so duplicate
        city_ids appear in the list). For n == 0 an empty list is returned.
    """
    # Sorted so the candidate order does not depend on per-process string hash
    # randomisation; otherwise seeded runs would still pick different states.
    states = sorted({fake.state_abbr() for _ in range(n * 2)})
    countries = ['USA', 'Canada', 'Mexico']
    cities = []

    for i in range(n):
        city = {
            'city_id': int(100 + i),
            'city_name': fake.city(),
            'state': np.random.choice(states),
            'country': np.random.choice(countries),
            # NOTE(review): the normal draw is not clipped, so population (and
            # average income below) can come out negative - confirm whether
            # that is an intended anomaly.
            'population': int(np.random.normal(500000, 300000)),
            'area_sq_km': round(np.random.uniform(50, 5000), 2),
            'average_income_usd': int(np.random.normal(50000, 15000)),
            'founded_year': random.randint(1700, 2020),
            'time_zone': fake.timezone(),
            'climate': np.random.choice(['Temperate', 'Tropical', 'Arid', 'Continental', 'Polar'])
        }
        cities.append(city)

    # random.randint(0, n - 1) raises for n == 0, so only corrupt non-empty lists
    if cities:
        # Introduce nulls
        for _ in range(5):
            cities[random.randint(0, n - 1)]['average_income_usd'] = None

        # Introduce anomalies (invalid states)
        for _ in range(3):
            idx = random.randint(0, n - 1)
            cities[idx]['state'] = 'ZZ'  # invalid state code

    # Introduce duplicates; cap the sample size so fewer than 3 cities does not raise
    cities.extend(random.sample(cities, min(3, len(cities))))

    return cities

# --- Generate transactions.csv ---
def generate_transactions(n, item_ids, city_ids):
    """Build a deliberately dirty transactions table as a pandas DataFrame.

    Args:
        n: Number of base transactions; transaction ids run from 1 to n.
        item_ids: Sequence of item ids to draw the item_id column from.
        city_ids: Sequence of city ids to draw the city_id column from.

    Returns:
        A DataFrame with the columns transaction_id, item_id, city_id,
        quantity, price_usd, discount_percent, payment_method, order_status,
        customer_age and transaction_date (ISO date strings within the last
        year). About 2% of quantity, price_usd and discount_percent values
        are null, a 2% sample of rows is appended again as duplicates, and
        about 1% of the resulting rows have their quantity and price_usd
        forced negative. Integer columns use the nullable ``Int64`` dtype and
        transaction_date uses the pandas ``string`` dtype.
    """
    payment_methods = ['Credit Card', 'Debit Card', 'Cash', 'Mobile Payment', 'Gift Card']
    order_statuses = ['Completed', 'Pending', 'Cancelled', 'Returned']

    data = {
        'transaction_id': list(range(1, n + 1)),
        'item_id': np.random.choice(item_ids, n),
        'city_id': np.random.choice(city_ids, n),
        'quantity': np.random.poisson(2, n),
        'price_usd': np.round(np.random.uniform(5, 500, n), 2),
        'discount_percent': np.round(np.random.uniform(0, 30, n), 1),
        'payment_method': np.random.choice(payment_methods, n),
        'order_status': np.random.choice(order_statuses, n),
        'customer_age': np.random.randint(18, 80, n),
        'transaction_date': [fake.date_between(start_date='-1y', end_date='today').isoformat() for _ in range(n)]
    }

    # Type enforcement comes first: 'quantity' receives NaN below, and writing
    # NaN into a plain int64 column relies on pandas' deprecated silent upcast
    # to float. With Int64 the missing values are stored natively.
    df = pd.DataFrame(data).astype({
        'transaction_id': 'Int64',
        'item_id': 'Int64',
        'city_id': 'Int64',
        'quantity': 'Int64',
        'customer_age': 'Int64',
        'transaction_date': 'string',
    })

    # Introduce nulls
    for col in ['quantity', 'price_usd', 'discount_percent']:
        df.loc[df.sample(frac=0.02).index, col] = np.nan

    # Introduce duplicates; ignore_index=True already yields a clean 0..N-1 index
    df = pd.concat([df, df.sample(frac=0.02)], ignore_index=True)

    # Introduce anomalies: negative prices and quantities (nulls stay null)
    anomaly_indices = df.sample(frac=0.01).index
    df.loc[anomaly_indices, 'quantity'] = -df.loc[anomaly_indices, 'quantity'].abs()
    df.loc[anomaly_indices, 'price_usd'] = -df.loc[anomaly_indices, 'price_usd'].abs()

    return df

# Generate datasets
# Items and cities are built first because the transactions draw their
# item_id / city_id values from them.
print("Generating items...")
items_df = generate_items(items_rows)

print("Generating cities...")
cities_list = generate_cities(cities_rows)
city_ids = list(map(lambda city: city['city_id'], cities_list))

print("Generating transactions...")
transactions_df = generate_transactions(
    transactions_rows,
    # item_id contains injected nulls; only the non-null ids are passed on
    items_df.loc[items_df['item_id'].notna(), 'item_id'].astype(int).tolist(),
    city_ids,
)

# Save to files (all three land in the current working directory)
print("Saving datasets...")
items_df.to_csv('items.csv', index=False)
with open('cities.json', 'w') as f:
    f.write(json.dumps(cities_list, indent=2))
transactions_df.to_csv('transactions.csv', index=False)

print(f"Files saved in {os.getcwd()}")
print("Datasets generated successfully.")


Generating items...
Generating cities...
Generating transactions...
Saving datasets...
Files saved in /home/user/Documents/retail inventory optimisation
Datasets generated successfully.


In [3]:
pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
    sys-platform (=="darwin") ; extra == 'objc'
                 ~^[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Create (or reuse) the Spark session with Hive support enabled; the tables
# saved further down are registered through this session.
spark = (
    SparkSession.builder
    .appName("Raw Ingestion - Sprint 1")
    .enableHiveSupport()
    .getOrCreate()
)

# Define schemas
# Each schema is written as (column name, Spark type) pairs in file order;
# every field is declared nullable.
transactions_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in [
        ("transaction_id", IntegerType()),
        ("item_id", IntegerType()),
        ("city_id", IntegerType()),
        ("quantity", IntegerType()),
        ("price_usd", DoubleType()),
        ("discount_percent", DoubleType()),
        ("payment_method", StringType()),
        ("order_status", StringType()),
        ("customer_age", IntegerType()),
        ("transaction_date", StringType()),
    ]
])

items_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in [
        ("item_id", IntegerType()),
        ("item_name", StringType()),
        ("category", StringType()),
        ("supplier", StringType()),
        ("price_usd", DoubleType()),
        ("weight_kg", DoubleType()),
        ("color", StringType()),
        ("warranty_years", IntegerType()),
        ("release_year", IntegerType()),
        ("rating", DoubleType()),
    ]
])

cities_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in [
        ("city_id", IntegerType()),
        ("city_name", StringType()),
        ("state", StringType()),
        ("country", StringType()),
        ("population", IntegerType()),
        ("area_sq_km", DoubleType()),
        ("average_income_usd", IntegerType()),
        ("founded_year", IntegerType()),
        ("time_zone", StringType()),
        ("climate", StringType()),
    ]
])

# Load the raw files
# NOTE(review): this is an absolute, machine-specific path; it matches the
# working directory the generator cell reported when it saved the files.
data_dir = "/home/user/Documents/retail inventory optimisation"

transactions_df = spark.read.csv(
    f"{data_dir}/transactions.csv",
    schema=transactions_schema,
    header=True,
    sep=","
)

items_df = spark.read.csv(
    f"{data_dir}/items.csv",
    schema=items_schema,
    header=True,
    sep=","
)

# cities.json is a single pretty-printed JSON array, hence multiLine=True
cities_df = spark.read.json(
    f"{data_dir}/cities.json",
    schema=cities_schema,
    multiLine=True
)

# Register Hive External Tables
# Each table must have its own location. Previously all three were written with
# mode("overwrite") to the same write_folder/ path, so every write replaced the
# files of the table saved before it and only the last table's data survived.
write_dir = f"{data_dir}/write_folder"

(
    transactions_df.write
    .mode("overwrite")
    .option("path", f"{write_dir}/transactions_raw")
    .saveAsTable("default.transactions_raw")
)

(
    items_df.write
    .mode("overwrite")
    .option("path", f"{write_dir}/items_raw")
    .saveAsTable("default.items_raw")
)

(
    cities_df.write
    .mode("overwrite")
    .option("path", f"{write_dir}/cities_raw")
    .saveAsTable("default.cities_raw")
)

# Schema Validation Report
# For each loaded frame: print a banner, the declared schema, then the
# describe() summary statistics.
for banner, frame in (
    ("===== Transactions Schema =====", transactions_df),
    ("===== Items Schema =====", items_df),
    ("===== Cities Schema =====", cities_df),
):
    print(banner)
    frame.printSchema()
    frame.describe().show()


25/07/19 12:09:50 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/07/19 12:09:50 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
25/07/19 12:09:50 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/07/19 12:09:50 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/07/19 12:09:50 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


===== Transactions Schema =====
root
 |-- transaction_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- city_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price_usd: double (nullable = true)
 |-- discount_percent: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- customer_age: integer (nullable = true)
 |-- transaction_date: string (nullable = true)



25/07/19 12:09:52 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+------------------+-------+------------------+--------+------------------+------------------+--------------+------------+------------------+----------------+
|summary|    transaction_id|item_id|           city_id|quantity|         price_usd|  discount_percent|payment_method|order_status|      customer_age|transaction_date|
+-------+------------------+-------+------------------+--------+------------------+------------------+--------------+------------+------------------+----------------+
|  count|            224400|      0|            224400|       0|            219927|            219903|        224400|      224400|            224400|          224400|
|   mean|110005.08828431372|   NULL|149.89722816399288|    NULL|  247.412380017004|15.027731772645248|          NULL|        NULL|  48.3853431372549|            NULL|
| stddev| 63506.86883253797|   NULL|29.189391851951843|    NULL|151.42722574957196| 8.656875055261963|          NULL|        NULL|17.881676825124217|            NULL