In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
from pathlib import Path

# --------- config ---------
INPUT_PATH  = Path("/content/drive/Shareddrives/CS224W/data/machine_meta.csv")
OUTPUT_PATH = Path("/content/drive/Shareddrives/CS224W/data_clean/machine_meta_cleaned.csv")
# --------------------------

OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

colnames = [
    "machine_id","time_stamp","failure_domain_1","failure_domain_2",
    "cpu_num","mem_size","status"
]

df_raw = pd.read_csv(INPUT_PATH, sep=",", header=None, names=colnames)

# Keep only required columns
keep = ["machine_id", "cpu_num", "mem_size"]
df = df_raw.loc[:, keep].copy()

# Clean types and drop bad rows
df["cpu_num"] = pd.to_numeric(df["cpu_num"], errors="coerce")
df["mem_size"] = pd.to_numeric(df["mem_size"], errors="coerce")
df = df.dropna(subset=["machine_id", "cpu_num", "mem_size"])

df = df.drop_duplicates()

# Natural sort by numeric part of machine_id (e.g., m_1, m_99, m_100)
# If IDs are like "m_123", this works; if formats vary, you can tweak the regex.
num = pd.to_numeric(df["machine_id"].str.extract(r"(\d+)")[0], errors="coerce")
prefix = df["machine_id"].str.extract(r"^([^\d]+)")[0]

df_sorted = df.assign(_prefix=prefix, _num=num).sort_values(
    by=["_prefix", "_num"], kind="mergesort"
).drop(columns=["_prefix", "_num"]).reset_index(drop=True)

# Save ONLY the three requested columns
df_sorted[keep].to_csv(OUTPUT_PATH, index=False)

print("Saved:", OUTPUT_PATH)
print("Rows:", len(df_sorted))
print(df_sorted[keep].head())

Saved: /content/drive/Shareddrives/CS224W/data_clean/machine_meta_cleaned.csv
Rows: 4034
  machine_id  cpu_num  mem_size
0        m_1       96       100
1        m_2       96       100
2        m_3       96       100
3        m_4       96       100
4        m_5       96       100


In [6]:
colnames = [
    "task_name ","instance_num","job_name","task_type",
    "status","start_time","end_time" ,"plan_cpu", "plan_mem"
]
df_raw = pd.read_csv("/content/drive/Shareddrives/CS224W/data/batch_task.csv", sep=",", names=colnames)
df_raw.head()

job_num = pd.to_numeric(
    df_raw["job_name"].str.extract(r'^j_(\d+)$', expand=False),
    errors="coerce"
)
filtered = df_raw.loc[job_num.le(200000).fillna(False)].copy()
print(filtered.head())
print("Kept rows:", len(filtered))

all_term_by_job = filtered.groupby("job_name")["status"].transform(lambda s: (s == "Terminated").all())
df = filtered[all_term_by_job].copy()

# Finally, keep only rows that are "Terminated" (redundant but explicit)
df = df[df["status"] == "Terminated"].copy()

print(df.head())
print("Remaining jobs:", df["job_name"].nunique(), "  Rows:", len(df))

OUTPUT_PATH = Path("/content/drive/Shareddrives/CS224W/data_clean/batch_task_cleaned.csv")
df.to_csv(OUTPUT_PATH, index=False)

  task_name   instance_num job_name  task_type      status  start_time  \
0         M1           1.0      j_1          1  Terminated      419912   
1       R2_1           1.0      j_2          1  Terminated       87076   
2         M1           1.0      j_2          1  Terminated       87076   
3       R6_3         371.0      j_3          1  Terminated      157297   
4     J4_2_3        1111.0      j_3          1  Terminated      157329   

   end_time  plan_cpu  plan_mem  
0    419912     100.0      0.20  
1     87086      50.0      0.20  
2     87083      50.0      0.20  
3    157325     100.0      0.49  
4    157376     100.0      0.59  
Kept rows: 681020
  task_name   instance_num job_name  task_type      status  start_time  \
0         M1           1.0      j_1          1  Terminated      419912   
1       R2_1           1.0      j_2          1  Terminated       87076   
2         M1           1.0      j_2          1  Terminated       87076   
3       R6_3         371.0      j_3  

In [7]:
filtered = df_raw.loc[job_num.le(1000).fillna(False)].copy()
print(filtered.head())
print("Kept rows:", len(filtered))

all_term_by_job = filtered.groupby("job_name")["status"].transform(lambda s: (s == "Terminated").all())
df = filtered[all_term_by_job].copy()

# Finally, keep only rows that are "Terminated" (redundant but explicit)
df = df[df["status"] == "Terminated"].copy()

print(df.head())
print("Remaining jobs:", df["job_name"].nunique(), "  Rows:", len(df))

OUTPUT_PATH = Path("/content/drive/Shareddrives/CS224W/data_clean/batch_task_mini.csv")
df.to_csv(OUTPUT_PATH, index=False)

  task_name   instance_num job_name  task_type      status  start_time  \
0         M1           1.0      j_1          1  Terminated      419912   
1       R2_1           1.0      j_2          1  Terminated       87076   
2         M1           1.0      j_2          1  Terminated       87076   
3       R6_3         371.0      j_3          1  Terminated      157297   
4     J4_2_3        1111.0      j_3          1  Terminated      157329   

   end_time  plan_cpu  plan_mem  
0    419912     100.0      0.20  
1     87086      50.0      0.20  
2     87083      50.0      0.20  
3    157325     100.0      0.49  
4    157376     100.0      0.59  
Kept rows: 3492
  task_name   instance_num job_name  task_type      status  start_time  \
0         M1           1.0      j_1          1  Terminated      419912   
1       R2_1           1.0      j_2          1  Terminated       87076   
2         M1           1.0      j_2          1  Terminated       87076   
3       R6_3         371.0      j_3    

In [19]:
!grep -E ",j_($(seq -s '|' 1 1000))," /content/drive/Shareddrives/CS224W/data_clean/batch_instance_cleaned.csv > /content/drive/Shareddrives/CS224W/data_clean/batch_instance_mini.csv

In [None]:
# auth once on Colab
!gcloud auth login




Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=fEFSFUhVq9rjvbCErr5kfArAp63pQo&prompt=consent&token_usage=remote&access_type=offline&code_challenge=lq8ambXU87sJChzcyWJuIz6Is2xhYCmI73jlIVfio4s&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: 4/0Ab32j92--DeFoM573KflRu7bWrLLO3HW4nzn50tce7MaMy1tULnX0Yq6mVW0pMWqZ2Lyag

You are now logged in as [q1wu@stanford.edu].
Your current project is [

In [None]:
!gcloud config set project cs224w-gnn

Updated property [core/project].


In [None]:
!gsutil -m cp gs://cs224w-gnn-qi-20251103/alibaba2018/batch_instance_cleaned.csv \
  /content/drive/Shareddrives/CS224W/data_clean

Copying gs://cs224w-gnn-qi-20251103/alibaba2018/batch_instance_cleaned.csv...
/ [0/1 files][    0.0 B/  5.0 GiB]   0% Done                                    ==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

| [1/1 files][  5.0 GiB/  5.0 GiB] 100% Done  56.3 MiB/s ETA 00:00:00           
Operation completed over 1 objects/5.0 GiB.                                      


In [11]:
import pandas as pd
# Path to the input CSV
input_path = "/content/drive/Shareddrives/CS224W/data_clean/batch_instance_mini.csv"

# Load CSV (no header)
df = pd.read_csv(input_path)

df.head(100000)

Unnamed: 0,ins_466123429,M1,j_42,1,Terminated,444527,444529,m_2212,1.1,1.2,60,100,0.04,0.06
0,ins_634167276,MergeTask,j_432,1,Terminated,204048,204049,m_1230,1,1,46,46,0.02,0.02
1,ins_151472804,J4_2_3,j_3,1,Terminated,157366,157369,m_3392,1,1,102,118,0.07,0.07
2,ins_516457701,M3,j_3,1,Terminated,157280,157284,m_1269,1,1,44,74,0.06,0.06
3,ins_572265193,M3,j_3,1,Terminated,157217,157219,m_3036,1,1,61,77,0.03,0.03
4,ins_1306913861,M3,j_3,1,Terminated,157257,157261,m_2479,1,1,37,77,0.05,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64964,ins_576795986,R16_15,j_262,1,Terminated,388187,388188,m_2628,1,1,76,76,0.02,0.02
64965,ins_1338238823,M3,j_262,1,Terminated,388144,388144,m_139,1,1,70,70,0.02,0.02
64966,ins_455484693,J15_10_13_14,j_262,1,Terminated,388181,388182,m_2748,1,1,75,80,0.03,0.03
64967,ins_119183788,R12_11,j_262,1,Terminated,388149,388150,m_143,1,1,84,84,0.02,0.02
