In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import re 
import json
import xgboost as xgb
from sklearn.model_selection import KFold


In [26]:
def preprocess(df):
    # Replace single quotes with double quotes in "random load mesures" column
    df["random load mesures"] = df["random load mesures"].str.replace("'",'"')
    
    # Remove prefix from building and file columns and convert to int
    df['building'] = df['building'].str.replace("Building_","").astype(int)
    df['File'] = df['File'].str.replace("Shape0_","").astype(int)
    
    # Extract WWR feature using regular expression
    pattern = r'\d+\.\d+'
    wwr = df['WWR'].apply(lambda x: max([float(m) for m in re.findall(pattern, x)]))
    df['WWR'] = wwr
    
    # Extract Cooling and Lights features using JSON
    features = df['random load mesures'].apply(lambda x: json.loads(x))
    cooling = features.apply(lambda x: float(x['Cooling'].replace(":C","")))
    lights = features.apply(lambda x: float(x['Lights'].replace(":C","")))
    df['Coolings'] = cooling
    df['Lights'] = lights
    # Load the dataset into a pandas DataFrame

    # Create a new feature: Overall Energy Consumption
    df['Overall Energy Consumption'] = df['Operating Hours'] * df['EUI']

    # Create a new feature: Impact of Lighting on Heat Load
    df['Lighting Impact on Heat Load'] = df['Lights'] * df['Light Heat Gain']

    # Create a new feature: Energy Efficiency in relation to Building Size
    df['Energy Efficiency'] = df['EUI'] / df['Number of Floors']

    # Create a new feature: Rate of Heat Addition through Lighting
    df['Lighting Heat Addition Rate'] = df['Light Heat Gain'] / df['Operating Hours']

    # Create a new feature: Logarithm of EUI
    df['Logarithm EUI'] = np.log(df['EUI'])

    # Create a new feature: Building Size Categories
    bins = [0, 5, 10, np.inf]
    #labels = ['Small', 'Medium', 'Large']
    #%df['Building Size Category'] = pd.cut(df['Number of Floors'], bins=bins, labels=labels)

    # Create interaction features
    interaction_features = ['Overall Energy Consumption', 'Lighting Impact on Heat Load', 'Energy Efficiency']
    for feature1 in interaction_features:
        for feature2 in interaction_features:
            if feature1 != feature2:
                interaction_feature_name = f'{feature1} x {feature2}'
                df[interaction_feature_name] = df[feature1] * df[feature2]

    # Save the updated dataset
    #df.to_csv('improved_dataset.csv', index=False)

    # Drop unnecessary columns
    df = df.drop(['random load mesures', 'Permeability','File','building'], axis=1)
    return df


In [27]:
train = pd.read_csv("/kaggle/input/traintrain/Train.csv")
test = pd.read_csv("/kaggle/input/testing/Test (1).csv")

# Preprocess train and test data
train = preprocess(train)
test = preprocess(test)



In [28]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Define the features and target
X = train.drop(['Operational Energy'], axis=1)
y = train['Operational Energy']

# Set up the KFold object
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the list to store the mean squared errors (MSEs)
mse_list = []

In [29]:
for train_index, val_index in kf.split(X):
    
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_val_fold_scaled = scaler.transform(X_val_fold)
    
    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=10)
    X_train_fold_pca = pca.fit_transform(X_train_fold_scaled)
    X_val_fold_pca = pca.transform(X_val_fold_scaled)
    
    # Train the model
    model.fit(X_train_fold_pca, y_train_fold)
    
    # Make predictions on the validation set and calculate MSE
    y_pred = model.predict(X_val_fold_pca)
    mse = np.mean((y_val_fold - y_pred)**2)
    
    # Append the MSE to the list
    mse_list.append(mse)
    
# Print the mean of the MSEs
print("Mean MSE:", np.mean(mse_list))

Mean MSE: 680888280.0198196


In [30]:
importances = model.feature_importances_

# Sort feature importances in descending order
sorted_importances = sorted(zip(importances, X.columns), reverse=True)

# Print the feature importances
for importance, feature in sorted_importances:
    print(f"{feature}: {importance}")


EUI: 0.3599299350382553
Cooling Setpoint: 0.2575559166030452
Cooling COP: 0.13696919245982667
Operating Hours: 0.09945544782219912
WWR: 0.05492314417299849
Equipment Heat Gain: 0.020734228438313615
Internal Floor Rt: 0.01876654600654251
Infiltration: 0.018609734430185383
Ground Floor Rt: 0.0172027538294306
Internal Wall Rt: 0.015853101199202982


In [31]:
# Load train and test data
train = pd.read_csv("/kaggle/input/traintrain/Train.csv")
test = pd.read_csv("/kaggle/input/testing/Test (1).csv")

# Preprocess train and test data
train = preprocess(train)
test = preprocess(test)

# Initialize the model
model = xgb.XGBRegressor(objective='reg:squarederror')

# Define the features and target
X = train.drop(['Operational Energy'], axis=1)
y = train['Operational Energy']

# Feature selection using XGBoost
model.fit(X, y)
importance_scores = model.feature_importances_
feature_importances = dict(zip(X.columns, importance_scores))
top_5_features = sorted(feature_importances, key=feature_importances.get, reverse=True)[:5]

# Select only the top 5 features
X = X[top_5_features]
test = test[top_5_features]

# Set up the KFold object
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the list to store the mean squared errors (MSEs)
mse_list = []

# Loop over the folds
for train_index, val_index in kf.split(X):
    
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Scale the features
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)
    
    # Reduce dimensionality using PCA
    pca = PCA(n_components=2)
    X_train_fold = pca.fit_transform(X_train_fold)
    X_val_fold = pca.transform(X_val_fold)

    # Train the model and predict on the validation set
    model.fit(X_train_fold, y_train_fold)
    y_pred = model.predict(X_val_fold)

    # Compute the mean squared error
    mse = np.mean((y_val_fold - y_pred)**2)
    mse_list.append(mse)

# Compute the average mean squared error over the folds
avg_mse = np.mean(mse_list)
print("Average MSE:", avg_mse)


Average MSE: 659031181.7634231


In [32]:
y_pris1=scaler.fit_transform(test)
y_pris1=pca.transform(y_pris1)
y_pris1 = model.predict(y_pris1)
y_pris1

array([ 90313.336,  85569.72 ,  84374.46 , ..., 248714.2  ,  71751.98 ,
       202789.73 ], dtype=float32)

In [34]:
csv_file=pd.read_csv("/kaggle/input/simple/SampleSubmission.csv")
csv_file['Operational Energy']=y_pris1
csv_file.to_csv("simplesimple100+.csv",index=False)