In [5]:
# 1. Generating a dataset
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Configuration for the synthetic sales dataset
customers = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay']
start_date = datetime(2025, 4, 1)
N_DAYS = 60  # number of consecutive days to generate, starting at start_date
SEED = 42    # fixed so that Restart & Run All regenerates the exact same CSV

# A dedicated, seeded generator keeps the dataset reproducible without
# touching the global `random` state.
rng = random.Random(SEED)

# Generate N_DAYS days of sales data, 3 to 6 sales per day
data = []
record_id = 1
for day_offset in range(N_DAYS):
    date = start_date + timedelta(days=day_offset)
    for _ in range(rng.randint(3, 6)):
        # Draw order (customer, amount, hours, minutes) is spelled out so a
        # given seed always maps to the same rows.
        customer = rng.choice(customers)
        amount = rng.randint(100, 2000)
        # last_updated falls at a random minute within the sale's own day
        updated_at = date + timedelta(hours=rng.randint(0, 23),
                                      minutes=rng.randint(0, 59))
        data.append({
            'id': record_id,
            'customer': customer,
            'date': date.date().isoformat(),
            'amount': amount,
            'last_updated': updated_at.isoformat(),
        })
        record_id += 1

# Create DataFrame and save to CSV (index=False: 'id' is already the row key)
df = pd.DataFrame(data)
df.to_csv('custom_data.csv', index=False)

# Display first few rows
df.head()



Unnamed: 0,id,customer,date,amount,last_updated
0,1,Walmart,2025-04-01,988,2025-04-01T11:14:00
1,2,eBay,2025-04-01,1509,2025-04-01T02:10:00
2,3,Walmart,2025-04-01,776,2025-04-01T06:19:00
3,4,Target,2025-04-01,1385,2025-04-01T11:26:00
4,5,eBay,2025-04-02,1010,2025-04-02T14:50:00


In [6]:
# FULL EXTRACTION: load every row of the source file, whatever its age.
source_csv = "custom_data.csv"
df_full = pd.read_csv(
    source_csv,
    parse_dates=["last_updated"],  # parse the timestamp column into datetimes
)
print(f"Pulled {len(df_full)} rows via full extraction.")
df_full.head()


Pulled 282 rows via full extraction.


Unnamed: 0,id,customer,date,amount,last_updated
0,1,Walmart,2025-04-01,988,2025-04-01 11:14:00
1,2,eBay,2025-04-01,1509,2025-04-01 02:10:00
2,3,Walmart,2025-04-01,776,2025-04-01 06:19:00
3,4,Target,2025-04-01,1385,2025-04-01 11:26:00
4,5,eBay,2025-04-02,1010,2025-04-02 14:50:00


In [9]:
# Seed the checkpoint file with a starting extraction time that falls
# partway through the generated date range.
initial_checkpoint = "2025-04-20 12:00:00"
with open("last_extraction.txt", "w") as checkpoint_file:
    checkpoint_file.write(initial_checkpoint)


In [10]:
# INCREMENTAL EXTRACTION: keep only rows modified after the stored checkpoint.
with open("last_extraction.txt", "r") as checkpoint_file:
    last_extraction = checkpoint_file.read().strip()
last_extraction_time = pd.to_datetime(last_extraction)

df = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Strict '>' so a row stamped exactly at the checkpoint is not selected.
is_newer = df['last_updated'] > last_extraction_time
df_incremental = df.loc[is_newer]

print(f"Pulled {len(df_incremental)} new/updated rows since {last_extraction}.")
df_incremental.head()


Pulled 197 new/updated rows since 2025-04-20 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
85,86,eBay,2025-04-21,1126,2025-04-21 04:50:00
86,87,Costco,2025-04-21,1630,2025-04-21 17:07:00
87,88,BestBuy,2025-04-21,740,2025-04-21 02:40:00
88,89,eBay,2025-04-21,935,2025-04-21 02:14:00
89,90,Walmart,2025-04-21,1844,2025-04-21 20:01:00


In [None]:
# Saving new timestamp
# Take the most recent update among the rows that were actually extracted.
# Every row in df_incremental is newer than the previous checkpoint, so the
# stored value can only move forward.
new_checkpoint = df_incremental['last_updated'].max()

if pd.isna(new_checkpoint):
    # max() of an empty selection is NaT; writing it would store the literal
    # text "NaT" and make every later incremental run select zero rows.
    print("No new rows extracted; last_extraction.txt left unchanged.")
else:
    # sep=" " keeps the file in the same "YYYY-MM-DD HH:MM:SS" form as the
    # initial checkpoint and as the message printed below.
    with open("last_extraction.txt", "w") as f:
        f.write(new_checkpoint.isoformat(sep=" "))
    print(f"Updated last_extraction.txt to {new_checkpoint}")

Updated last_extraction.txt to 2025-05-30 23:52:00
