In [None]:
pip install pandas numpy Faker


In [None]:
import pandas as pd
import numpy as np
import json
from faker import Faker
import random
import os

fake = Faker()

# --- PARAMETERS ---
SEED = 42                    # fixed so the generated files are reproducible run to run
transactions_rows = 220_000  # 1/10th of 2.2M
items_rows = 500             # reasonable number for items
cities_rows = 100            # cities metadata

# Seed every random source the generators draw from. DataFrame.sample() is called
# without random_state below, so it follows NumPy's global RNG seeded here.
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# --- Generate items.csv ---
def generate_items(n):
    """Build a deliberately dirty item catalogue from ``n`` base rows.

    Every base row gets a sequential ``item_id`` starting at 1000 plus randomly
    drawn descriptive attributes. Afterwards roughly 5% of the values in every
    column (``item_id`` included) are blanked out, and a 2% sample of the rows
    is appended again as duplicates.

    Args:
        n: Number of base items generated before duplicates are appended.

    Returns:
        pandas.DataFrame holding the base rows followed by the duplicated rows
        on a fresh 0..N-1 index. Integer columns use the nullable ``Int64``
        dtype so that missing values do not turn them into floats.
    """
    categories = ['Stationery', 'Electronics', 'Home & Kitchen', 'Toys', 'Books', 'Clothing']
    suppliers = [fake.company() for _ in range(50)]
    data = {
        'item_id': range(1000, 1000 + n),
        'item_name': [fake.word().capitalize() for _ in range(n)],
        'category': np.random.choice(categories, n),
        'supplier': np.random.choice(suppliers, n),
        'price_usd': np.round(np.random.uniform(1, 1000, n), 2),
        'weight_kg': np.round(np.random.uniform(0.01, 10, n), 3),
        'color': np.random.choice(['Red', 'Green', 'Blue', 'Black', 'White', 'Yellow'], n),
        'warranty_years': np.random.choice([0,1,2,3,5], n),
        'release_year': np.random.choice(range(2000, 2024), n),
        'rating': np.round(np.random.uniform(1, 5, n), 1)
    }
    df = pd.DataFrame(data)
    # Move integer columns to nullable Int64 *before* injecting nulls. Assigning
    # NaN into a plain int64 column upcasts it to float64 (an implicit upcast
    # pandas has deprecated), and the CSV would then contain "1000.0"-style values.
    int_cols = df.select_dtypes(include=np.integer).columns
    df = df.astype({col: 'Int64' for col in int_cols})
    # Introduce nulls randomly: an independent 5% sample of rows per column.
    for col in df.columns:
        df.loc[df.sample(frac=0.05).index, col] = np.nan
    # Introduce duplicates; ignore_index=True already yields a clean 0..N-1 index.
    df = pd.concat([df, df.sample(frac=0.02)], ignore_index=True)
    return df

# --- Generate cities.json ---
def generate_cities(n):
    """Build ``n`` synthetic city records plus a few deliberate data-quality problems.

    After the base records are created the list is dirtied in place:
      * up to 5 records get ``average_income_usd = None`` (indices are drawn
        with replacement, so the same record can be picked more than once);
      * up to 3 records get the invalid state code ``'ZZ'``;
      * up to 3 records are appended again as duplicates.

    Works for any ``n >= 0``; with fewer than 3 base records the number of
    duplicates is reduced to what is available, and with ``n == 0`` an empty
    list is returned.

    Args:
        n: Number of base city records; ``city_id`` runs from 100 to 100 + n - 1.

    Returns:
        list[dict]: the base records followed by the duplicated ones. The
        duplicates are the same dict objects as their originals, not copies.
    """
    states = list(set([fake.state_abbr() for _ in range(n*2)]))
    countries = ['USA', 'Canada', 'Mexico']
    cities = []
    for i in range(n):
        city = {
            'city_id': 100 + i,
            'city_name': fake.city(),
            'state': np.random.choice(states),
            'country': np.random.choice(countries),
            # The normal draws are unbounded, so occasional negative values can occur.
            'population': int(np.random.normal(500000, 300000)),
            'area_sq_km': round(np.random.uniform(50, 5000), 2),
            'average_income_usd': int(np.random.normal(50000, 15000)),
            'founded_year': random.randint(1700, 2020),
            'time_zone': fake.timezone(),
            'climate': np.random.choice(['Temperate', 'Tropical', 'Arid', 'Continental', 'Polar'])
        }
        cities.append(city)
    # random.randint(0, n - 1) raises ValueError on an empty list, so skip the
    # in-place corruption steps when there is nothing to corrupt.
    if cities:
        # Introduce nulls
        for _ in range(5):
            cities[random.randint(0, n-1)]['average_income_usd'] = None
        # Introduce anomalies (location mismatch): change a city's state to wrong one randomly
        for _ in range(3):
            idx = random.randint(0, n-1)
            cities[idx]['state'] = 'ZZ'  # invalid state code
    # Introduce duplicates; clamp the sample size because random.sample raises
    # ValueError when asked for more items than the list holds.
    cities.extend(random.sample(cities, min(3, len(cities))))
    return cities

# --- Generate transactions.csv ---
def generate_transactions(n, item_ids, city_ids):
    """Build a deliberately dirty transactions table from ``n`` base rows.

    Foreign keys are sampled uniformly (with replacement) from ``item_ids`` and
    ``city_ids``. The frame is then dirtied in three steps:
      1. ~2% of ``quantity``, ``price_usd`` and ``discount_percent`` values are
         blanked out, each column sampled independently;
      2. a 2% sample of rows is appended again as duplicates;
      3. on a 1% sample of the resulting rows, ``quantity`` and ``price_usd``
         are both forced non-positive (missing values stay missing, a quantity
         of 0 stays 0).

    Args:
        n: Number of base transactions; ``transaction_id`` runs from 1 to n.
        item_ids: Sequence of item ids to sample ``item_id`` from.
        city_ids: Sequence of city ids to sample ``city_id`` from.

    Returns:
        pandas.DataFrame with the base rows followed by the duplicated rows on
        a fresh 0..N-1 index. ``quantity`` uses the nullable ``Int64`` dtype.
    """
    payment_methods = ['Credit Card', 'Debit Card', 'Cash', 'Mobile Payment', 'Gift Card']
    order_status = ['Completed', 'Pending', 'Cancelled', 'Returned']
    data = {
        'transaction_id': range(1, n + 1),
        'item_id': np.random.choice(item_ids, n),
        'city_id': np.random.choice(city_ids, n),
        'quantity': np.random.poisson(2, n),
        'price_usd': np.round(np.random.uniform(5, 500, n), 2),
        'discount_percent': np.round(np.random.uniform(0, 30, n), 1),
        'payment_method': np.random.choice(payment_methods, n),
        'order_status': np.random.choice(order_status, n),
        'customer_age': np.random.randint(18, 80, n),
        'transaction_date': [fake.date_between(start_date='-1y', end_date='today').isoformat() for _ in range(n)]
    }
    df = pd.DataFrame(data)
    # quantity is the only integer column that receives nulls. Make it nullable
    # first: assigning NaN into plain int64 upcasts the column to float64 (an
    # implicit upcast pandas has deprecated) and the CSV would then hold "2.0".
    df['quantity'] = df['quantity'].astype('Int64')
    # Introduce nulls
    for col in ['quantity', 'price_usd', 'discount_percent']:
        df.loc[df.sample(frac=0.02).index, col] = np.nan
    # Introduce duplicates (2% duplicates); ignore_index=True rebuilds the index.
    df = pd.concat([df, df.sample(frac=0.02)], ignore_index=True)
    # Introduce anomalies: negative quantity AND price on the same sampled rows.
    anomaly_indices = df.sample(frac=0.01).index
    df.loc[anomaly_indices, 'quantity'] = -df.loc[anomaly_indices, 'quantity'].abs()
    df.loc[anomaly_indices, 'price_usd'] = -df.loc[anomaly_indices, 'price_usd'].abs()
    return df

# Generate datasets
print("Generating items...")
items_df = generate_items(items_rows)

print("Generating cities...")
cities_list = generate_cities(cities_rows)

print("Generating transactions...")
# generate_items() blanks out ~5% of the item_id values. Feeding those missing
# ids to np.random.choice would turn transactions.item_id into a float column
# ("1234.0") with missing keys, so sample foreign keys only from the ids that
# are present, cast back to plain ints.
valid_item_ids = items_df['item_id'].dropna().astype(int).tolist()
city_ids = [city['city_id'] for city in cities_list]
transactions_df = generate_transactions(transactions_rows, valid_item_ids, city_ids)

# Save to files (relative paths: everything lands in the current working directory)
print("Saving datasets...")

items_df.to_csv('items.csv', index=False)
with open('cities.json', 'w', encoding='utf-8') as f:
    json.dump(cities_list, f, indent=2)
transactions_df.to_csv('transactions.csv', index=False)

print(f"Files saved in {os.getcwd()}")
print("Datasets generated successfully.")


In [None]:
pip install pyspark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Build (or reuse) the Hive-enabled Spark session for the raw ingestion step.
spark = (
    SparkSession.builder
    .appName("Raw Ingestion - Sprint 1")
    .enableHiveSupport()
    .getOrCreate()
)

# Define schemas
# Each schema is listed as (column name, Spark type) pairs in file order;
# every field is declared nullable.
transactions_schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in [
        ("transaction_id", IntegerType()),
        ("item_id", IntegerType()),
        ("city_id", IntegerType()),
        ("quantity", IntegerType()),
        ("price_usd", DoubleType()),
        ("discount_percent", DoubleType()),
        ("payment_method", StringType()),
        ("order_status", StringType()),
        ("customer_age", IntegerType()),
        ("transaction_date", StringType()),
    ]
])

items_schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in [
        ("item_id", IntegerType()),
        ("item_name", StringType()),
        ("category", StringType()),
        ("supplier", StringType()),
        ("price_usd", DoubleType()),
        ("weight_kg", DoubleType()),
        ("color", StringType()),
        ("warranty_years", IntegerType()),
        ("release_year", IntegerType()),
        ("rating", DoubleType()),
    ]
])

cities_schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in [
        ("city_id", IntegerType()),
        ("city_name", StringType()),
        ("state", StringType()),
        ("country", StringType()),
        ("population", IntegerType()),
        ("area_sq_km", DoubleType()),
        ("average_income_usd", IntegerType()),
        ("founded_year", IntegerType()),
        ("time_zone", StringType()),
        ("climate", StringType()),
    ]
])

# Load CSVs
# Both CSV files have a header row and are parsed against an explicit schema
# rather than an inferred one.
transactions_df = (
    spark.read
    .schema(transactions_schema)
    .option("header", True)
    .option("sep", ",")
    .csv("/home/user/Documents/retail-inventory/retail-inventory-optimisation/transactions.csv")
)

items_df = (
    spark.read
    .schema(items_schema)
    .option("header", True)
    .option("sep", ",")
    .csv("/home/user/Documents/retail-inventory/retail-inventory-optimisation/items.csv")
)

# assuming JSON for cities; multiLine because the file is read as one multi-line JSON document
cities_df = (
    spark.read
    .schema(cities_schema)
    .option("multiLine", True)
    .json("/home/user/Documents/retail-inventory/retail-inventory-optimisation/cities.json")
)

# Register Hive External Tables
# Every table gets its own sub-directory. With mode("overwrite"), pointing all
# three writes at one shared folder lets each save replace the files written by
# the previous one, so transactions_raw and items_raw would end up registered
# on a location that only holds the cities data.
raw_tables_dir = "/home/user/Documents/retail inventory optimisation/write_folder"

transactions_df.write.mode("overwrite").option("path", f"{raw_tables_dir}/transactions_raw") \
    .saveAsTable("default.transactions_raw")

items_df.write.mode("overwrite").option("path", f"{raw_tables_dir}/items_raw") \
    .saveAsTable("default.items_raw")

cities_df.write.mode("overwrite").option("path", f"{raw_tables_dir}/cities_raw") \
    .saveAsTable("default.cities_raw")

# Schema Validation Report
# Same two checks for each raw frame: the applied schema, then summary statistics.
for banner, frame in (
    ("===== Transactions Schema =====", transactions_df),
    ("===== Items Schema =====", items_df),
    ("===== Cities Schema =====", cities_df),
):
    print(banner)
    frame.printSchema()
    frame.describe().show()


25/07/21 12:46:29 WARN Utils: Your hostname, user-virtual-machine resolves to a loopback address: 127.0.1.1; using 10.33.58.47 instead (on interface ens160)
25/07/21 12:46:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/21 12:46:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/21 12:46:35 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/07/21 12:46:35 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/07/21 12:46:39 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/07/21 12:46:39 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set b

===== Transactions Schema =====
root
 |-- transaction_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- city_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price_usd: double (nullable = true)
 |-- discount_percent: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- customer_age: integer (nullable = true)
 |-- transaction_date: string (nullable = true)



25/07/21 12:46:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+------------------+------------------+------------------+------------------+------------------+------------------+--------------+------------+------------------+----------------+
|summary|    transaction_id|           item_id|           city_id|          quantity|         price_usd|  discount_percent|payment_method|order_status|      customer_age|transaction_date|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+--------------+------------+------------------+----------------+
|  count|            224400|            224400|            224400|            219902|            219924|            219927|        224400|      224400|            224400|          224400|
|   mean|109971.21411319073|1251.3774286987523|149.53861408199643|1.9613191330683668|247.33349166075595|14.988835840983576|          NULL|        NULL| 48.49874331550802|            NULL|
| stddev| 63512.13953268465|144.89527851151345|29.0034356720

In [2]:
from pyspark.sql import SparkSession

# getOrCreate() hands back a session that is already running if there is one.
spark = (
    SparkSession.builder
    .appName("CitiesCleaning")
    .getOrCreate()
)

# Read cities.json from the working directory as one multi-line JSON document;
# no schema is supplied, so column types are inferred.
df = spark.read.json("cities.json", multiLine=True)

25/07/21 12:51:12 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Inspect the inferred schema, then print rows with full, untruncated cell values.
df.printSchema()
df.show(truncate=False)

root
 |-- area_sq_km: double (nullable = true)
 |-- average_income_usd: long (nullable = true)
 |-- city_id: long (nullable = true)
 |-- city_name: string (nullable = true)
 |-- climate: string (nullable = true)
 |-- country: string (nullable = true)
 |-- founded_year: long (nullable = true)
 |-- population: long (nullable = true)
 |-- state: string (nullable = true)
 |-- time_zone: string (nullable = true)

+----------+------------------+-------+------------------+-----------+-------+------------+----------+-----+------------------+
|area_sq_km|average_income_usd|city_id|city_name         |climate    |country|founded_year|population|state|time_zone         |
+----------+------------------+-------+------------------+-----------+-------+------------+----------+-----+------------------+
|148.55    |38107             |100    |East Anthonyside  |Polar      |Canada |1996        |777286    |AK   |Africa/Bissau     |
|830.61    |41816             |101    |Laurenburgh       |Arid       |Mexico

In [5]:
# Row count of the raw cities frame, before any cleaning.
df.count()

103

In [8]:
from pyspark.sql.functions import *

# Step 1: collapse repeated city records so each city_id appears once.
cleaned_df = df.dropDuplicates(['city_id'])

# Step 2: mean income over the de-duplicated rows (nulls excluded), used below
# as the replacement for missing incomes.
mean_income = cleaned_df.agg(mean('average_income_usd')).first()[0]

# Step 3: the remaining rules as one pipeline, in the order they are applied.
cleaned_df = (
    cleaned_df
    # Impute missing 'average_income_usd' with the mean computed above.
    .fillna({'average_income_usd': mean_income})
    # Keep only rows with a non-negative population.
    .where(col('population') >= 0)
    # Strip leading/trailing spaces from city names.
    .withColumn('city_name', trim(col('city_name')))
    # Keep only plausible founding years, 1000 through 2025 inclusive.
    .where(col('founded_year').between(1000, 2025))
    # Require both a state and a country.
    .where(col('state').isNotNull() & col('country').isNotNull())
)

In [9]:
# Re-check the raw frame: the cleaning steps were assigned to cleaned_df, not back to df.
df.count()

103

In [10]:
# Rows that remain after de-duplication and the validity filters.
cleaned_df.count()

97