In [1]:
using DataFrames, Random, Distributions, CSV

# Fix the seed so every random draw below is reproducible.
# NOTE: the order of the rand/randn calls determines the generated values — do not reorder.
Random.seed!(42)

# One row per day: 360 rows (4 summers).
n = 360
df = DataFrame(ID = 1:n)

# Temperature: normally distributed around 25 degrees Celsius with a standard deviation of 2.
df.Temperature = 25 .+ 2 .* randn(n)

# Is_Weekend: in every 7-row cycle, rows 6 and 7 are weekend days.
# The pattern is built for a whole number of weeks and then trimmed to n rows.
is_weekend_long = [mod1(day, 7) > 5 for day in 1:(7 * cld(n, 7))]
df.Is_Weekend = is_weekend_long[1:n]

# Hours_Open depends on Is_Weekend: 10-11 hours on weekends, 8-10 hours on weekdays.
# Both candidate vectors are drawn in full; each row then picks the one that applies.
weekend_hours = rand(10:11, n)
weekday_hours = rand(8:10, n)
df.Hours_Open = ifelse.(df.Is_Weekend, weekend_hours, weekday_hours)

# Electricity usage and ice cream sales are both driven by the columns above
# (temperature and hours open; sales additionally by the weekend flag), so they end up
# correlated with each other without either one being computed from the other.

# Electricity_Usage: base load of 10, plus noisy per-degree and per-hour coefficients.
usage_per_degree = 2 .+ 0.2 .* randn(n)
usage_per_hour = 3 .+ 0.5 .* randn(n)
df.Electricity_Usage = (
    10 .+ usage_per_degree .* df.Temperature .+ usage_per_hour .* df.Hours_Open
)

# Ice_Cream_Sales: base of 20, a fixed 30 per degree, a noisy per-hour coefficient and a
# noisy weekend bonus; rounded to whole units.
sales_per_hour = 25 .+ 2 .* randn(n)
weekend_bonus = 100 .+ 10 .* randn(n)
expected_sales = 20 .+ 30 .* df.Temperature .+ sales_per_hour .* df.Hours_Open .+
    weekend_bonus .* df.Is_Weekend
df.Ice_Cream_Sales = round.(Int, expected_sales);

In [2]:
# Write the DataFrame to a CSV file.
# The output file cannot be opened if its parent directory is missing, so create "data/"
# first; mkpath does nothing when the directory already exists.
output_path = "data/ice_cream_sales.csv"
mkpath(dirname(output_path))
CSV.write(output_path, df);