In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import re 
import json
import xgboost as xgb
from sklearn.model_selection import KFold




In [2]:
def preprocess(df):
    # Replace single quotes with double quotes in "random load mesures" column
    df["random load mesures"] = df["random load mesures"].str.replace("'",'"')
    
    # Remove prefix from building and file columns and convert to int
    df['building'] = df['building'].str.replace("Building_","").astype(int)
    df['File'] = df['File'].str.replace("Shape0_","").astype(int)
    
    # Extract WWR feature using regular expression
    pattern = r'\d+\.\d+'
    wwr = df['WWR'].apply(lambda x: max([float(m) for m in re.findall(pattern, x)]))
    df['WWR'] = wwr
    
    # Extract Cooling and Lights features using JSON
    features = df['random load mesures'].apply(lambda x: json.loads(x))
    cooling = features.apply(lambda x: float(x['Cooling'].replace(":C","")))
    lights = features.apply(lambda x: float(x['Lights'].replace(":C","")))
    df['Coolings'] = cooling
    df['Lights'] = lights
    # Load the dataset into a pandas DataFrame

    # Create a new feature: Overall Energy Consumption
    df['Overall Energy Consumption'] = df['Operating Hours'] * df['EUI']

    # Create a new feature: Impact of Lighting on Heat Load
    df['Lighting Impact on Heat Load'] = df['Lights'] * df['Light Heat Gain']

    # Create a new feature: Energy Efficiency in relation to Building Size
    df['Energy Efficiency'] = df['EUI'] / df['Number of Floors']

    # Create a new feature: Rate of Heat Addition through Lighting
    df['Lighting Heat Addition Rate'] = df['Light Heat Gain'] / df['Operating Hours']

    # Create a new feature: Logarithm of EUI
    df['Logarithm EUI'] = np.log(df['EUI'])

    # Create a new feature: Building Size Categories
    bins = [0, 5, 10, np.inf]
    #labels = ['Small', 'Medium', 'Large']
    #%df['Building Size Category'] = pd.cut(df['Number of Floors'], bins=bins, labels=labels)

    # Create interaction features
    interaction_features = ['Overall Energy Consumption', 'Lighting Impact on Heat Load', 'Energy Efficiency']
    for feature1 in interaction_features:
        for feature2 in interaction_features:
            if feature1 != feature2:
                interaction_feature_name = f'{feature1} x {feature2}'
                df[interaction_feature_name] = df[feature1] * df[feature2]

    # Save the updated dataset
    #df.to_csv('improved_dataset.csv', index=False)

    # Drop unnecessary columns
    df = df.drop(['random load mesures', 'Permeability','File','building'], axis=1)
    return df


In [3]:
train = pd.read_csv("/kaggle/input/traintrain/Train.csv")
test = pd.read_csv("/kaggle/input/testing/Test (1).csv")

# Preprocess train and test data
train = preprocess(train)
test = preprocess(test)



In [4]:
from catboost import CatBoostRegressor


catboost_model = CatBoostRegressor()

X = train.drop(['Operational Energy'], axis=1)
y = train['Operational Energy']
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the list to store the mean squared errors (MSEs)
mse_list = []

In [5]:
for train_index, val_index in kf.split(X):
    
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_val_fold_scaled = scaler.transform(X_val_fold)
    
    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=10)
    X_train_fold_pca = pca.fit_transform(X_train_fold_scaled)
    X_val_fold_pca = pca.transform(X_val_fold_scaled)
    
    # Train the model
    catboost_model.fit(X_train_fold_pca, y_train_fold)
    
    # Make predictions on the validation set and calculate MSE
    y_pred = catboost_model.predict(X_val_fold_pca)
    mse = np.mean((y_val_fold - y_pred)**2)
    
    # Append the MSE to the list
    mse_list.append(mse)
    
# Print the mean of the MSEs
print("Mean MSE:", np.mean(mse_list))

Learning rate set to 0.074361
0:	learn: 62147.9911531	total: 66.2ms	remaining: 1m 6s
1:	learn: 59625.2454629	total: 75.4ms	remaining: 37.6s
2:	learn: 57250.4641204	total: 83.9ms	remaining: 27.9s
3:	learn: 55115.7712321	total: 92.5ms	remaining: 23s
4:	learn: 53171.6829221	total: 101ms	remaining: 20.1s
5:	learn: 51428.7285718	total: 110ms	remaining: 18.2s
6:	learn: 49825.1913798	total: 118ms	remaining: 16.7s
7:	learn: 48317.3792351	total: 126ms	remaining: 15.7s
8:	learn: 46999.7535935	total: 135ms	remaining: 14.8s
9:	learn: 45771.0651652	total: 143ms	remaining: 14.2s
10:	learn: 44656.1165786	total: 151ms	remaining: 13.6s
11:	learn: 43597.1526465	total: 159ms	remaining: 13.1s
12:	learn: 42677.4989858	total: 168ms	remaining: 12.7s
13:	learn: 41818.4157297	total: 175ms	remaining: 12.4s
14:	learn: 41000.0674392	total: 186ms	remaining: 12.2s
15:	learn: 40230.4324437	total: 194ms	remaining: 11.9s
16:	learn: 39530.8383919	total: 203ms	remaining: 11.7s
17:	learn: 38919.3407733	total: 211ms	remai

In [6]:
y_4b=scaler.fit_transform(test)
y_4b=pca.transform(y_4b)
y_4b = catboost_model.predict(y_4b)
y_4b

array([108783.56616806,  96520.03519213,  92682.47132072, ...,
       220632.02597474,  79918.50483423, 184911.55708581])

In [7]:
csv_file=pd.read_csv("/kaggle/input/simple/SampleSubmission.csv")
csv_file['Operational Energy']=y_4b
csv_file.to_csv("simplesimple1235.csv",index=False)

In [8]:
importances = catboost_model.feature_importances_

# Sort feature importances in descending order
sorted_importances = sorted(zip(importances, X.columns), reverse=True)

# Print the feature importances
for importance, feature in sorted_importances:
    print(f"{feature}: {importance}")


EUI: 28.842105291269156
Cooling Setpoint: 25.621316955350615
Cooling COP: 18.952872572291497
Operating Hours: 13.779914626381217
WWR: 6.142445086391116
Equipment Heat Gain: 1.9495244382294148
Internal Floor Rt: 1.5409266229003302
Ground Floor Rt: 1.2692944449596957
Internal Wall Rt: 1.1702081095327295
Infiltration: 0.7313918526942295


In [9]:
importance_scores = catboost_model.feature_importances_
feature_importances = dict(zip(X.columns, importance_scores))
top_10_features = sorted(feature_importances, key=feature_importances.get, reverse=True)[:10]

# Select only the top 5 features
X = X[top_10_features]
test = test[top_10_features]

# Set up the KFold object
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the list to store the mean squared errors (MSEs)
mse_list = []

# Loop over the folds
for train_index, val_index in kf.split(X):
    
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Scale the features
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)
    
    # Reduce dimensionality using PCA
    pca = PCA(n_components=2)
    X_train_fold = pca.fit_transform(X_train_fold)
    X_val_fold = pca.transform(X_val_fold)

    # Train the model and predict on the validation set
    catboost_model.fit(X_train_fold, y_train_fold)
    y_pred = catboost_model.predict(X_val_fold)

    # Compute the mean squared error
    mse = np.mean((y_val_fold - y_pred)**2)
    mse_list.append(mse)

# Compute the average mean squared error over the folds
avg_mse = np.mean(mse_list)
print("Average MSE:", avg_mse)

Learning rate set to 0.074361
0:	learn: 64795.2872848	total: 5.85ms	remaining: 5.84s
1:	learn: 64643.4874670	total: 10.9ms	remaining: 5.42s
2:	learn: 64509.3835971	total: 15.7ms	remaining: 5.22s
3:	learn: 64392.7777998	total: 20.8ms	remaining: 5.18s
4:	learn: 64293.9404953	total: 25.7ms	remaining: 5.12s
5:	learn: 64203.3332614	total: 30.6ms	remaining: 5.07s
6:	learn: 64125.7181876	total: 35.5ms	remaining: 5.04s
7:	learn: 64044.4004639	total: 40.4ms	remaining: 5.01s
8:	learn: 63978.2419603	total: 45.3ms	remaining: 4.98s
9:	learn: 63923.8944328	total: 50.9ms	remaining: 5.04s
10:	learn: 63872.9056110	total: 55.9ms	remaining: 5.03s
11:	learn: 63827.5413140	total: 60.7ms	remaining: 5s
12:	learn: 63788.0809831	total: 65.6ms	remaining: 4.98s
13:	learn: 63753.2939794	total: 70.6ms	remaining: 4.97s
14:	learn: 63722.1991724	total: 75.5ms	remaining: 4.96s
15:	learn: 63691.8352947	total: 80.1ms	remaining: 4.93s
16:	learn: 63669.1394891	total: 85ms	remaining: 4.91s
17:	learn: 63646.2894093	total: 8

In [10]:
y_4bb=scaler.fit_transform(test)
y_4bb=pca.transform(y_4bb)
y_4bb = catboost_model.predict(y_4bb)
y_4bb

array([161067.70143134, 153519.22157301, 196212.38260852, ...,
       150230.56540086, 165699.08288227, 148809.89690354])

In [11]:
csv_file=pd.read_csv("/kaggle/input/simple/SampleSubmission.csv")
csv_file['Operational Energy']=y_4bb
csv_file.to_csv("simplesimple123.csv",index=False)