In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [2]:
# Simulate 60 days of sales data (2025-04-02 through 2025-05-31).
# A dedicated, seeded generator makes the dataset identical on every
# Restart & Run All, without touching the global `random` state.
SEED = 42
rng = random.Random(SEED)

customers = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay']
data = []
start_date = datetime(2025, 4, 1)
for i in range(1, 61):
    # Offsets start at 1, so the first simulated day is the day after start_date.
    date = start_date + timedelta(days=i)
    for _ in range(rng.randint(3, 6)):  # 3 to 6 sales per day
        data.append({
            'id': rng.randint(1000, 9999),  # random 4-digit id; not guaranteed unique
            'customer': rng.choice(customers),
            'date': date.date().isoformat(),
            'amount': rng.randint(100, 2000),
            # Same calendar day as `date`, at a random hour and minute.
            'last_updated': (date + timedelta(hours=rng.randint(0, 23),
                                              minutes=rng.randint(0, 59))).isoformat()
        })

In [3]:
# Turn the simulated records into a DataFrame and persist them to CSV
# (without the index column) so later cells can read them back from disk.
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf='sales_data_large.csv', index=False)
df.head(n=5)

Unnamed: 0,id,customer,date,amount,last_updated
0,3353,Amazon,2025-04-02,945,2025-04-02T22:16:00
1,4869,Amazon,2025-04-02,1311,2025-04-02T14:25:00
2,6722,eBay,2025-04-02,632,2025-04-02T18:09:00
3,9443,Walmart,2025-04-03,587,2025-04-03T23:53:00
4,8837,BestBuy,2025-04-03,1919,2025-04-03T08:40:00


In [4]:
# FULL EXTRACTION
# Load the entire CSV in one pass; `last_updated` is parsed into datetimes.
df_full = pd.read_csv(
    "sales_data_large.csv",
    parse_dates=["last_updated"],
)
print(f"Pulled {df_full.shape[0]} rows via full extraction.")
df_full.head(n=5)

Pulled 270 rows via full extraction.


Unnamed: 0,id,customer,date,amount,last_updated
0,3353,Amazon,2025-04-02,945,2025-04-02 22:16:00
1,4869,Amazon,2025-04-02,1311,2025-04-02 14:25:00
2,6722,eBay,2025-04-02,632,2025-04-02 18:09:00
3,9443,Walmart,2025-04-03,587,2025-04-03 23:53:00
4,8837,BestBuy,2025-04-03,1919,2025-04-03 08:40:00


In [5]:
# Seed the checkpoint file with an initial "last extraction" timestamp.
# The chosen instant lies inside the simulated date range, so the
# incremental pull that reads it returns only part of the data.
with open("last_extraction.txt", mode="w") as checkpoint_file:
    checkpoint_file.write("2025-04-20 12:00:00")

In [6]:
# INCREMENTAL EXTRACTION
# Read the stored checkpoint, then keep only the rows whose `last_updated`
# is strictly later than it.
with open("last_extraction.txt", mode="r") as checkpoint_file:
    last_extraction = checkpoint_file.read().strip()
df = pd.read_csv("sales_data_large.csv", parse_dates=["last_updated"])
last_extraction_time = pd.to_datetime(last_extraction)
df_incremental = df.loc[df['last_updated'].gt(last_extraction_time)]
print(f"Pulled {df_incremental.shape[0]} new/updated rows since {last_extraction}.")
df_incremental.head(n=5)

Pulled 190 new/updated rows since 2025-04-20 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
77,5065,Amazon,2025-04-20,292,2025-04-20 20:15:00
78,7686,Amazon,2025-04-20,1713,2025-04-20 12:10:00
81,2322,Target,2025-04-20,1737,2025-04-20 22:23:00
83,6527,eBay,2025-04-21,1259,2025-04-21 20:52:00
84,5201,Walmart,2025-04-21,767,2025-04-21 21:29:00


In [7]:
# Get the most recent update to use as the next checkpoint.
# Guard rails:
#   * an empty extract makes .max() return NaT, which would be written to the
#     checkpoint file as "NaT" and make every later incremental run match
#     zero rows, so fall back to the checkpoint that was just read;
#   * the checkpoint must never move backwards, so keep the previous value
#     when the data contains nothing newer.
latest_update = df['last_updated'].max()
new_checkpoint = (
    last_extraction_time
    if pd.isna(latest_update)
    else max(latest_update, last_extraction_time)
)

In [8]:
# Persist the new checkpoint as ISO 8601 text; the incremental extraction
# cell reads this file to decide which rows count as new.
with open("last_extraction.txt", mode="w") as checkpoint_file:
    checkpoint_text = new_checkpoint.isoformat()
    checkpoint_file.write(checkpoint_text)

In [9]:
# Confirm which timestamp was stored as the new checkpoint.
checkpoint_message = f"Updated last_extraction.txt to {new_checkpoint}"
print(checkpoint_message)

Updated last_extraction.txt to 2025-05-31 18:35:00
