In [None]:
import pandas as pd
import random
import string
from datetime import datetime, timedelta

# Size of the synthetic dataset (one row per generated record)
n = 100000

# Sequential ids: 1000, 1001, ..., 1000 + n - 1
ids = [1000 + offset for offset in range(n)]

# One random integer per row, drawn uniformly from 1..n (inclusive)
cids = []
for _ in range(n):
    cids.append(random.randint(1, n))

# Keep drawing 10-character alphanumeric strings into a set until it holds
# n distinct values; the set silently drops any duplicate draw.
alphabet = string.ascii_letters + string.digits
uuids = set()
while n > len(uuids):
    candidate = ''.join([random.choice(alphabet) for _ in range(10)])
    uuids.add(candidate)
uuids = list(uuids)

# Dates cycle through 1-7 September 2023, advancing one day per row.
dates = [datetime(2023, 9, 1 + row % 7) for row in range(n)]

# Times advance one second per row and wrap around after a full day.
midnight = datetime.min
times = [
    (midnight + timedelta(seconds=row % (24 * 60 * 60))).time()
    for row in range(n)
]

# Assemble the columns into the dataframe (column order as listed)
columns = {
    'id': ids,
    'cid': cids,
    'uuid': uuids,
    'date': dates,
    'time': times,
}
df = pd.DataFrame(columns)
df.head()

In [None]:
# Rebuild the date and time columns so that the time advances one second per
# row and the calendar day only moves forward once a full day of seconds has
# been used up.

# Seconds in one calendar day
seconds_in_day = 24*60*60

# Day of month: rows 0..seconds_in_day-1 fall on the 1st, the next block on
# the 2nd, and so on, wrapping back to the 1st after the 7th.
dates = [
    datetime(2023, 9, (row // seconds_in_day) % 7 + 1)
    for row in range(n)
]

# Time of day as a 12-hour clock string with an AM/PM marker.
times = [
    (datetime.min + timedelta(seconds=row % seconds_in_day)).time().strftime('%I:%M:%S %p')
    for row in range(n)
]

# Overwrite the existing columns on the dataframe built in the first cell.
df['date'] = dates
df['time'] = times

# Persist the adjusted dataframe as CSV (without the index column)
file_path_adjusted = "/mnt/data/dataset_adjusted.csv"
df.to_csv(file_path_adjusted, index=False)

df.head()


In [None]:
# Generate the correct dataset with the desired date range: one row per
# second for `days` consecutive calendar days beginning at `start_date`.

# Number of days
days = 8

# First calendar day of the range. This name was previously used without ever
# being defined, so the cell raised NameError on a fresh kernel.
# NOTE(review): 1 September 2023 is chosen to match the dates used by the
# earlier cells -- confirm this is the intended start of the range.
start_date = datetime(2023, 9, 1)

# Total rows needed (`seconds_in_day` comes from the date/time regeneration
# cell earlier in the notebook).
total_rows = days * seconds_in_day

# Generate id column: sequential integers starting at 1000
ids = list(range(1000, 1000 + total_rows))

# Generate cid column (random integers in 1..total_rows, inclusive)
cids = [random.randint(1, total_rows) for _ in range(total_rows)]

# Generate unique uuids: draw 10-character alphanumeric strings into a set
# until it holds total_rows distinct values. The alphabet is built once
# instead of on every character draw.
alphabet = string.ascii_letters + string.digits
uuids_set = set()
while len(uuids_set) < total_rows:
    uuids_set.add(''.join(random.choices(alphabet, k=10)))
uuids = list(uuids_set)

# Generate structured dates: the date moves forward one day after every
# seconds_in_day rows.
dates = [start_date + timedelta(days=(i // seconds_in_day)) for i in range(total_rows)]

# Generate structured times: there are only seconds_in_day distinct
# time-of-day strings, so format each once and look them up per row.
time_labels = [
    (datetime.min + timedelta(seconds=second)).time().strftime('%I:%M:%S %p')
    for second in range(seconds_in_day)
]
times = [time_labels[i % seconds_in_day] for i in range(total_rows)]

# Create the corrected dataframe
df_corrected = pd.DataFrame({
    'id': ids,
    'cid': cids,
    'uuid': uuids,
    'date': dates,
    'time': times
})

# Adjusting the date format to MM/DD/YYYY
df_corrected['date'] = df_corrected['date'].dt.strftime('%m/%d/%Y')

# Save the corrected dataframe (index column omitted)
file_path_corrected_final = "/mnt/data/dataset_corrected_final.csv"
df_corrected.to_csv(file_path_corrected_final, index=False)

df_corrected['date'].unique()  # Displaying unique dates to verify the range
