In [8]:
!pip install -q kagglehub

import kagglehub
import os
import pandas as pd

# Directory that kagglehub is told to use as its download cache.
# NOTE(review): this is an absolute, Colab-style path while the processed
# output later in the notebook is written relative to the working directory
# ("../data/processed/...") - confirm which layout is intended.
raw_data_path = "/content/data/raw"
os.makedirs(raw_data_path, exist_ok=True)
# Must be set before dataset_download() so the files land under raw_data_path.
os.environ["KAGGLEHUB_CACHE"] = raw_data_path


dataset_path = kagglehub.dataset_download("govindaramsriram/energy-consumption-dataset-linear-regression")

# Gather every CSV below the downloaded dataset directory. os.walk yields
# entries in a filesystem-dependent order, so the result is sorted to keep the
# listing (and anything indexing into it) deterministic across machines.
csv_files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(dataset_path)
    for name in names
    if name.endswith(".csv")
)

print("Archivos encontrados:")
for f in csv_files:
    print(f)


Archivos encontrados:
/content/data/raw\datasets\govindaramsriram\energy-consumption-dataset-linear-regression\versions\1\test_energy_data.csv
/content/data/raw\datasets\govindaramsriram\energy-consumption-dataset-linear-regression\versions\1\train_energy_data.csv


In [9]:
# Locate the training split among the discovered CSVs (substring match on the
# full path) and load it; the first match is used.
train_csv = [path for path in csv_files if "train_energy_data.csv" in path][0]
df = pd.read_csv(train_csv)
print(f"Shape: {df.shape}")
# Last expression of the cell: rendered as a preview of the first rows.
df.head()

Shape: (1000, 7)


Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,7063,76,10,29.84,Weekday,2713.95
1,Commercial,44372,66,45,16.72,Weekday,5744.99
2,Industrial,19255,37,17,14.3,Weekend,4101.24
3,Residential,13265,14,41,32.82,Weekday,3009.14
4,Commercial,13375,26,18,11.92,Weekday,3279.17


In [10]:
# Names of the columns whose dtype is object or categorical; these are the
# candidates for encoding.
cat_cols = list(df.select_dtypes(include=["object", "category"]).columns)

print("Columnas categóricas detectadas:", cat_cols, sep="\n")


Columnas categóricas detectadas:
['Building Type', 'Day of Week']


In [11]:
# Normalize the categorical labels: trim surrounding whitespace and apply
# title case so that variants such as " weekday" and "Weekday" collapse into
# a single level.
for c in ['Building Type', 'Day of Week']:
    # astype(str) would turn a missing value into the literal string "nan"
    # (then "Nan" after title-casing), silently hiding it and later producing
    # a bogus dummy column. Mask those positions back to missing instead.
    df[c] = df[c].astype(str).str.strip().str.title().where(df[c].notna())

# Missing values are dropped before sorting: NaN cannot be ordered against str.
print("Building Type:", sorted(df['Building Type'].dropna().unique()))
print("Day of Week:", sorted(df['Day of Week'].dropna().unique()))

Building Type: ['Commercial', 'Industrial', 'Residential']
Day of Week: ['Weekday', 'Weekend']


In [12]:
# Columns to expand into one-hot indicator columns.
cat_cols = ['Building Type', 'Day of Week']

# drop_first=False keeps an indicator for every level; dummy_na=False adds no
# indicator column for missing values.
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=False, dummy_na=False)

print("Shape después de One-Hot:", df_encoded.shape)
# Last expression of the cell: the names of the generated indicator columns.
[name for name in df_encoded.columns if name.startswith(('Building Type_', 'Day of Week_'))]

Shape después de One-Hot: (1000, 10)


['Building Type_Commercial',
 'Building Type_Industrial',
 'Building Type_Residential',
 'Day of Week_Weekday',
 'Day of Week_Weekend']

In [13]:
# Recap of the encoded frame: dimensions, column count and a 10-row preview.
print("Shape:", df_encoded.shape)

cols = df_encoded.columns.tolist()
print("Total columnas:", len(cols))

# Only the final expression of a cell is rendered, so the preview goes last;
# a bare `cols` placed before it would be evaluated and silently discarded.
df_encoded.head(10)


Shape: (1000, 10)
Total columnas: 10


Unnamed: 0,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Energy Consumption,Building Type_Commercial,Building Type_Industrial,Building Type_Residential,Day of Week_Weekday,Day of Week_Weekend
0,7063,76,10,29.84,2713.95,False,False,True,True,False
1,44372,66,45,16.72,5744.99,True,False,False,True,False
2,19255,37,17,14.3,4101.24,False,True,False,False,True
3,13265,14,41,32.82,3009.14,False,False,True,True,False
4,13375,26,18,11.92,3279.17,True,False,False,True,False
5,37377,26,32,16.24,4687.67,True,False,False,False,True
6,38638,92,14,21.01,5526.83,False,True,False,False,True
7,34950,60,18,28.24,4116.32,False,False,True,True,False
8,29741,99,44,13.08,5841.65,False,True,False,True,False
9,17467,42,36,28.84,3419.13,False,False,True,True,False


In [14]:
# Destination of the one-hot encoded frame, relative to the working directory.
# NOTE(review): df was loaded from train_energy_data.csv, yet this file name
# ends in "_test" - confirm the intended name before relying on it downstream.
output_path = "../data/processed/energy_data_processed_test.csv"
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False: the row index is not written as a column.
df_encoded.to_csv(output_path, index=False)
print(f"CSV guardado en: {output_path}")

CSV guardado en: ../data/processed/energy_data_processed_test.csv
