In [5]:
import pandas as pd

df = pd.read_csv("C:/Users/arpit/OneDrive/Desktop/cloud-cost-leakage/data/raw/cost_interpreter.csv")
df.head()


Unnamed: 0,UsageDate,ServiceName,CostUSD,Cost,Currency
0,01-05-2023,Automation,0.001,0.081771,INR
1,01-05-2023,Azure DNS,0.500348,40.914076,INR
2,01-05-2023,Bandwidth,127.770719,10447.97139,INR
3,01-05-2023,Storage,134.504502,10998.60124,INR
4,01-05-2023,Virtual Machines,13.987309,1143.759767,INR


In [8]:
df = df.rename(columns={
    "UsageDate": "date",
    "ServiceName": "service_name",
    "Cost": "cost"
})

df["date"] = pd.to_datetime(df["date"], dayfirst=True)

df = df[["date", "service_name", "cost"]]
df.head()

Unnamed: 0,date,service_name,cost
0,2023-01-05,Automation,0.081771
1,2023-01-05,Azure DNS,40.914076
2,2023-01-05,Bandwidth,10447.97139
3,2023-01-05,Storage,10998.60124
4,2023-01-05,Virtual Machines,1143.759767


In [9]:
import numpy as np
np.random.seed(42)

df["resource_id"] = (
    df["service_name"]
    .str.lower()
    .str.replace(" ", "-") + "-" +
    np.random.randint(1000, 9999, size=len(df)).astype(str)
)


In [10]:
df["region"] = np.random.choice(
    ["us-east-1", "asia-south-1", "eu-west-1"],
    size=len(df)
)
df.head()

Unnamed: 0,date,service_name,cost,resource_id,region
0,2023-01-05,Automation,0.081771,automation-8270,asia-south-1
1,2023-01-05,Azure DNS,40.914076,azure-dns-1860,eu-west-1
2,2023-01-05,Bandwidth,10447.97139,bandwidth-6390,asia-south-1
3,2023-01-05,Storage,10998.60124,storage-6191,us-east-1
4,2023-01-05,Virtual Machines,1143.759767,virtual-machines-6734,eu-west-1


In [11]:
usage_scale = {
    "Virtual Machines": (10, 100),   # CPU hours
    "Storage": (50, 800),            # GB
    "Bandwidth": (20, 500),           # GB transferred
    "Azure DNS": (1, 50),
    "Automation": (5, 60)
}

def gen_usage(service):
    return np.random.uniform(*usage_scale.get(service, (10, 100)))

df["usage_amount"] = df["service_name"].apply(gen_usage)
df.head()

Unnamed: 0,date,service_name,cost,resource_id,region,usage_amount
0,2023-01-05,Automation,0.081771,automation-8270,asia-south-1,22.776161
1,2023-01-05,Azure DNS,40.914076,azure-dns-1860,eu-west-1,26.42074
2,2023-01-05,Bandwidth,10447.97139,bandwidth-6390,asia-south-1,357.4491
3,2023-01-05,Storage,10998.60124,storage-6191,us-east-1,322.722202
4,2023-01-05,Virtual Machines,1143.759767,virtual-machines-6734,eu-west-1,97.460387


In [12]:
spike_idx = df.sample(frac=0.03).index
df.loc[spike_idx, "cost"] *= np.random.uniform(2.5, 4.0)
df.head()

Unnamed: 0,date,service_name,cost,resource_id,region,usage_amount
0,2023-01-05,Automation,0.081771,automation-8270,asia-south-1,22.776161
1,2023-01-05,Azure DNS,40.914076,azure-dns-1860,eu-west-1,26.42074
2,2023-01-05,Bandwidth,10447.97139,bandwidth-6390,asia-south-1,357.4491
3,2023-01-05,Storage,10998.60124,storage-6191,us-east-1,322.722202
4,2023-01-05,Virtual Machines,4543.884576,virtual-machines-6734,eu-west-1,97.460387


In [13]:
idle_idx = df.sample(frac=0.05).index
df.loc[idle_idx, "usage_amount"] *= 0.1
df.head()

Unnamed: 0,date,service_name,cost,resource_id,region,usage_amount
0,2023-01-05,Automation,0.081771,automation-8270,asia-south-1,22.776161
1,2023-01-05,Azure DNS,40.914076,azure-dns-1860,eu-west-1,26.42074
2,2023-01-05,Bandwidth,10447.97139,bandwidth-6390,asia-south-1,357.4491
3,2023-01-05,Storage,10998.60124,storage-6191,us-east-1,322.722202
4,2023-01-05,Virtual Machines,4543.884576,virtual-machines-6734,eu-west-1,97.460387


In [16]:
df.to_csv("C:/Users/arpit/OneDrive/Desktop/cloud-cost-leakage/data/processed/cloud_billing_synthetic.csv", index=False)
print("Cloud billing synthetic dataset created!")
df.head()

Cloud billing synthetic dataset created!


Unnamed: 0,date,service_name,cost,resource_id,region,usage_amount
0,2023-01-05,Automation,0.081771,automation-8270,asia-south-1,22.776161
1,2023-01-05,Azure DNS,40.914076,azure-dns-1860,eu-west-1,26.42074
2,2023-01-05,Bandwidth,10447.97139,bandwidth-6390,asia-south-1,357.4491
3,2023-01-05,Storage,10998.60124,storage-6191,us-east-1,322.722202
4,2023-01-05,Virtual Machines,4543.884576,virtual-machines-6734,eu-west-1,97.460387


In [None]:
df.groupby("date")["cost"].sum().plot(title="Daily Cloud Cost")
