In [1]:
# Read the raw Amazon JSON-lines file and the M5 CSV files, then convert them to the lightweight CSVs used by the project.

In [2]:
#Imports and paths 
import json, os
import pandas as pd
from pathlib import Path

# Project root. Defaults to the original local checkout, but can be redirected
# by setting the CAPSTONE_BASE environment variable, so the notebook is not tied
# to one machine's drive layout. An unset or empty variable falls back to the default.
BASE = Path(os.environ.get("CAPSTONE_BASE") or r"D:\CAPSTONE_FINAL")
RAW_AMAZON = BASE / "amazon_reviews" / "reviews_Electronics_5.json"  # raw Amazon reviews file
M5_DIR = BASE / "m5_forecasting_accuracy"  # directory holding the M5 input files
OUT_DIR = BASE / "data"  # destination for the converted CSVs
# Create the output directory up front; this is a no-op if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Paths set:")
print("RAW_AMAZON:", RAW_AMAZON)
print("M5_DIR:", M5_DIR)
print("OUT_DIR:", OUT_DIR)


Paths set:
RAW_AMAZON: D:\CAPSTONE_FINAL\amazon_reviews\reviews_Electronics_5.json
M5_DIR: D:\CAPSTONE_FINAL\m5_forecasting_accuracy
OUT_DIR: D:\CAPSTONE_FINAL\data


In [3]:
#read Amazon JSON lines & make CSV subset

In [6]:
# Load a manageable Amazon subset: parse at most `max_lines` lines of the raw
# review file (one JSON object per line) and keep only the fields listed below.
max_lines = 200000  # cap on how many lines of the raw file are read
review_columns = ["user_id", "item_id", "rating", "review_text", "summary", "timestamp"]
rows = []
with open(RAW_AMAZON, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= max_lines:
            break
        if not line.strip():
            # A blank line is not a JSON record; json.loads would raise on it
            # and abort the whole load, so skip it instead.
            continue
        data = json.loads(line)
        # .get() yields None for a field that is absent from a record.
        rows.append({
            "user_id": data.get("reviewerID"),
            "item_id": data.get("asin"),
            "rating": data.get("overall"),
            "review_text": data.get("reviewText"),
            "summary": data.get("summary"),
            "timestamp": data.get("unixReviewTime")
        })
# Number of records actually parsed (blank lines are not counted).
count = len(rows)
print(f"Loaded {count} amazon review lines")
# Passing the column list keeps the schema stable even when no rows were read.
amazon_df = pd.DataFrame(rows, columns=review_columns)
amazon_df.head(3)


Loaded 200000 amazon review lines


Unnamed: 0,user_id,item_id,rating,review_text,summary,timestamp
0,AO94DHGC771SJ,528881469,5.0,We got this GPS for my husband who is an (OTR)...,Gotta have GPS!,1370131200
1,AMO214LNFCEI4,528881469,1.0,"I'm a professional OTR truck driver, and I bou...",Very Disappointed,1290643200
2,A3N7T0DY83Y4IG,528881469,3.0,"Well, what can I say. I've had this unit in m...",1st impression,1283990400


In [7]:
# Persist the Amazon review subset as CSV; the DataFrame index is not written.
out_amazon = OUT_DIR.joinpath("reviews.csv")
amazon_df.to_csv(path_or_buf=out_amazon, index=False)
amazon_shape = amazon_df.shape
print(f"Saved Amazon subset -> {out_amazon} ; shape = {amazon_shape}")


Saved Amazon subset -> D:\CAPSTONE_FINAL\data\reviews.csv ; shape = (200000, 6)


In [9]:
# Read M5 files (sales_train_validation, calendar, sell_prices).
# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not imported a second time here.
sales_path = M5_DIR / "sales_train_validation.csv"
calendar_path = M5_DIR / "calendar.csv"
sell_prices_path = M5_DIR / "sell_prices.csv"

print("Reading M5 CSV files (this may take a few seconds)...")
sales = pd.read_csv(sales_path)
calendar = pd.read_csv(calendar_path)
sell_prices = pd.read_csv(sell_prices_path)
print("Loaded shapes:", sales.shape, calendar.shape, sell_prices.shape)
sales.head(3)



Reading M5 CSV files (this may take a few seconds)...
Loaded shapes: (30490, 1919) (1969, 14) (6841121, 4)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1


In [10]:
# Convert wide sales -> long and attach dates (melt sales to long + merge calendar).
id_cols = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
# Every column of `sales` that is not an id column is unpivoted: its name goes
# into `d` and its value into `units_sold`.
sales_long = sales.melt(id_vars=id_cols, var_name="d", value_name="units_sold")
calendar_small = calendar[["d", "date"]]
# validate="many_to_one" makes pandas raise if `d` is duplicated in the calendar,
# which would otherwise silently multiply sales rows during the join.
sales_long = sales_long.merge(calendar_small, on="d", how="left", validate="many_to_one")
# A left join leaves `date` null for any `d` label missing from the calendar;
# fail loudly here instead of carrying incomplete rows forward.
missing_dates = int(sales_long["date"].isna().sum())
if missing_dates:
    raise ValueError(f"{missing_dates} sales rows have no matching calendar date")
print("sales_long shape:", sales_long.shape)
sales_long.head(3)

sales_long shape: (58327370, 9)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,units_sold,date
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29


In [11]:
# Write the long-format M5 sales table to CSV (inventory history); the
# DataFrame index is not written.
out_m5 = OUT_DIR.joinpath("inventory_history.csv")
sales_long.to_csv(path_or_buf=out_m5, index=False)
m5_shape = sales_long.shape
print(f"Saved M5 long -> {out_m5} ; shape = {m5_shape}")


Saved M5 long -> D:\CAPSTONE_FINAL\data\inventory_history.csv ; shape = (58327370, 9)


In [12]:
# Quick sanity checks on saved files: re-read only the first few rows of each
# CSV rather than loading the full files again.
reviews_preview = pd.read_csv(OUT_DIR / "reviews.csv", nrows=2)
print("reviews.csv:", reviews_preview.shape)
print("inventory_history.csv (sample):")
inventory_preview = pd.read_csv(OUT_DIR / "inventory_history.csv", nrows=5)
display(inventory_preview)


reviews.csv: (2, 6)
inventory_history.csv (sample):


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,units_sold,date
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
