In [4]:
import pandas as pd
import numpy as np
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import random

# Simulate 60 days of sales data and persist it as the CSV "source system"
# that the extraction cells below read from.
#
# Reproducibility: every random draw goes through a privately seeded generator,
# so a fresh-kernel "Restart & Run All" regenerates the exact same dataset
# (and therefore the same sales_data.csv) without touching the global
# `random` state.
SEED = 42
N_DAYS = 60
MIN_TX_PER_DAY, MAX_TX_PER_DAY = 3, 6  # 3 to 6 transactions per day

rng = random.Random(SEED)

customers = ["Amazon", "Walmart", "Target", "Best Buy", "Costco"]
data = []

# Transaction ids are drawn WITHOUT replacement from the 4-digit range so they
# are guaranteed unique. Independent randint(1000, 9999) draws collide with
# near certainty once a few hundred rows are generated. The pool is sized for
# the worst case (every day producing the maximum number of transactions).
id_pool = iter(rng.sample(range(1000, 10000), N_DAYS * MAX_TX_PER_DAY))

# Generate the transactions day by day.
start_date = datetime.date(2025, 4, 1)
for i in range(N_DAYS):
    date = start_date + datetime.timedelta(days=i)
    for j in range(rng.randint(MIN_TX_PER_DAY, MAX_TX_PER_DAY)):
        # Midnight of the business date plus a random time-of-day offset, so
        # "last updated" always falls on the same calendar day as "date".
        transaction_time = (
            datetime.datetime.combine(date, datetime.time())
            + datetime.timedelta(
                hours=rng.randint(0, 23),
                minutes=rng.randint(0, 59),
                seconds=rng.randint(0, 59),
            )
        )
        data.append({
            "id": next(id_pool),
            "customer": rng.choice(customers),
            "date": date.isoformat(),
            "amount": round(rng.uniform(100, 2000), 2),
            "last updated": transaction_time.isoformat()
        })

# Build the frame once from the list of records, then write it to disk.
df = pd.DataFrame(data)
df.to_csv("sales_data.csv", index=False)
df.head()

Unnamed: 0,id,customer,date,amount,last updated
0,2062,Walmart,2025-04-01,1012.29,2025-04-01T05:09:53
1,4322,Costco,2025-04-01,1726.03,2025-04-01T13:11:25
2,4489,Target,2025-04-01,674.67,2025-04-01T22:48:08
3,7652,Amazon,2025-04-01,1838.12,2025-04-01T07:15:41
4,2767,Costco,2025-04-02,315.87,2025-04-02T10:50:55


In [6]:
# Full extraction: read every row of the simulated sales CSV in one pass,
# asking pandas to parse the two temporal columns as datetimes on load.
datetime_columns = ["date", "last updated"]
df_full = pd.read_csv("sales_data.csv", parse_dates=datetime_columns)
print("Full Data Extracted:")
df_full.head()

Full Data Extracted:


Unnamed: 0,id,customer,date,amount,last updated
0,2062,Walmart,2025-04-01,1012.29,2025-04-01 05:09:53
1,4322,Costco,2025-04-01,1726.03,2025-04-01 13:11:25
2,4489,Target,2025-04-01,674.67,2025-04-01 22:48:08
3,7652,Amazon,2025-04-01,1838.12,2025-04-01 07:15:41
4,2767,Costco,2025-04-02,315.87,2025-04-02 10:50:55


In [7]:
# Incremental extraction: keep only the rows modified since the previous run.
# The previous run's timestamp is simulated here as 15 April 2025, 12:00:00.
last_extraction_date = datetime.datetime(
    year=2025, month=4, day=15, hour=12, minute=0, second=0
)

# Strictly-greater comparison: a row stamped exactly at the cutoff is treated
# as already extracted and is left out.
updated_since_last_run = df_full["last updated"] > last_extraction_date
df_incremental = df_full[updated_since_last_run]

print(f"Incremental Data Extracted after {last_extraction_date}:")
df_incremental.head()

Incremental Data Extracted after 2025-04-15 12:00:00:


Unnamed: 0,id,customer,date,amount,last updated
69,2492,Target,2025-04-15,1848.73,2025-04-15 23:19:04
70,6620,Walmart,2025-04-15,607.16,2025-04-15 17:54:42
71,5959,Costco,2025-04-15,770.24,2025-04-15 14:25:57
72,4191,Amazon,2025-04-15,1671.19,2025-04-15 16:01:47
75,4698,Target,2025-04-16,207.27,2025-04-16 08:03:42
