In [53]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import re 
import json
#import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from scipy import stats



In [54]:
def preprocess(data):
    # Replace single quotes with double quotes in "random load mesures" column
    data["random load mesures"] = data["random load mesures"].str.replace("'",'"')
    data['building'] = le.fit_transform(data['building'])
    data['Town'] = le.fit_transform(data['Town'])
    data['Total Wall Area'] = data['Wall Rt'] * data['Height'] * data['Number of Floors']
    #data['Total Window Area'] = data['Windows Rt'] * data['Height'] * data['WWR'] * data['Number of Floors']
    data['Total Floor Area'] = data['Ground Floor Rt'] * data['Height']
    data['Total Roof Area'] = data['Roof Rt'] * data['Height']
    data['Total Internal Wall Area'] = data['Internal Wall Rt'] * data['Height'] * data['Number of Floors']
    data['Total Internal Floor Area'] = data['Internal Floor Rt'] * data['Height']
    data['Overall Energy Consumption'] = data['Operating Hours'] * data['EUI']

    
    pattern = r'\d+\.\d+'
    wwr = data['WWR'].apply(lambda x: max([float(m) for m in re.findall(pattern, x)]))
    data['WWR'] = wwr
    # assuming 'File' contains unique file names for each building shape, it can be removed
    
    
    features = data['random load mesures'].apply(lambda x: json.loads(x))
    cooling = features.apply(lambda x: float(x['Cooling'].replace(":C","")))
    lights = features.apply(lambda x: float(x['Lights'].replace(":C","")))
    data['Coolings'] = cooling
    data['Lights'] = lights
    
    data['Lighting Impact on Heat Load'] = data['Lights'] * data['Light Heat Gain']
    data['Energy Efficiency'] = data['EUI'] / data['Number of Floors']
    data['Lighting Heat Addition Rate'] = data['Light Heat Gain'] / data['Operating Hours']
    

    scaler = StandardScaler()
    data[['Cooling Setpoint','Coolings','Lights', 'EUI', 'Cooling COP', 'Operating Hours', 'WWR',      'Equipment Heat Gain', 'Internal Wall Rt', 'Internal Floor Rt', 'Infiltration','Ground Floor Rt', 'Number of Floors', 'Occupancy', 'Light Heat Gain', 'Windows Rt',      'Height', 'Heating COP', 'Heating Setpoint', 'Wall Rt', 'Start Time', 'windows g-value',      'Roof Rt', 'Boiler Efficiency', 'Internal Mass', 'Permeability', 'Total Floors Area',]] = scaler.fit_transform(data[['Cooling Setpoint','Coolings','Lights', 'EUI', 'Cooling COP', 'Operating Hours', 'WWR',                                                              'Equipment Heat Gain', 'Internal Wall Rt', 'Internal Floor Rt', 'Infiltration',                                                              'Ground Floor Rt', 'Number of Floors', 'Occupancy', 'Light Heat Gain', 'Windows Rt',                                                              'Height', 'Heating COP', 'Heating Setpoint', 'Wall Rt', 'Start Time', 'windows g-value',                                                              'Roof Rt', 'Boiler Efficiency', 'Internal Mass', 'Permeability', 'Total Floors Area']])
    
    interaction_features = ['Overall Energy Consumption', 'Lighting Impact on Heat Load', 'Energy Efficiency','Lighting Heat Addition Rate']
    for feature1 in interaction_features:
        for feature2 in interaction_features:
            if feature1 != feature2:
                interaction_feature_name = f'{feature1} x {feature2}'
                data[interaction_feature_name] = data[feature1] * data[feature2]

    # assuming we want to select the top 10 features with the highest F-test score
    selector = SelectKBest(f_regression, k=10)
    data = data.drop(['random load mesures', 'Permeability','File','building'], axis=1)
    #data = selector.fit_transform(data.drop('Operational Energy', axis=1), data['Operational Energy'])
    # assume your data is stored in a NumPy array called `data`
    for col in data.columns:
        if col == 'Operational Energy':
            continue  # skip target variable
        median = np.median(data[col])
        q1 = np.percentile(data[col], 25)
        q3 = np.percentile(data[col], 75)
        iqr = q3 - q1
        upper_bound = q3 + 1.5 * iqr
        lower_bound = q1 - 1.5 * iqr
        data.loc[data[col] > upper_bound, col] = median
        data.loc[data[col] < lower_bound, col] = median

    return data


In [55]:
train = pd.read_csv("/kaggle/input/traintrain/Train.csv")
test = pd.read_csv("/kaggle/input/testing/Test (1).csv")

subb = pd.DataFrame()
subb['submission id'] = test['building'] + '_Town_' + test['Town'].astype(str)




In [56]:
# Preprocess train and test data
train = preprocess(train)
test = preprocess(test)

In [58]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

hyperparameter_grid = {'iterations': [100, 500, 1000],
                       'learning_rate': [0.01, 0.1, 0.5],
                       'depth': [4, 6, 8],
                       'l2_leaf_reg': [1, 3, 5, 7, 9]}

catboost_model = CatBoostRegressor(loss_function='RMSE')

randomized_search = RandomizedSearchCV(estimator=catboost_model,
                                       param_distributions=hyperparameter_grid,
                                       n_iter=10,
                                       scoring='neg_mean_squared_error',
                                       cv=5,
                                       verbose=2,
                                       random_state=42)

X = train.drop(['Operational Energy'], axis=1)
y = train['Operational Energy']
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the list to store the mean squared errors (MSEs)
mse_list = []

In [None]:
for train_index, val_index in kf.split(X):
    
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_val_fold_scaled = scaler.transform(X_val_fold)
    
    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=10)
    X_train_fold_pca = pca.fit_transform(X_train_fold_scaled)
    X_val_fold_pca = pca.transform(X_val_fold_scaled)
    
    # Train the model
    catboost_model.fit(X_train_fold_pca, y_train_fold)
    randomized_search.fit(X_train_fold_pca, y_train_fold)
    # Make predictions on the validation set and calculate MSE
    y_pred = catboost_model.predict(X_val_fold_pca)
    mse = mean_squared_error(y_val_fold, y_pred)
    # Append the MSE to the list
    mse_list.append(mse)
    
# Print the mean of the MSEs
print('Best hyperparameters:', randomized_search.best_params_)

print("Mean MSE:", np.mean(mse_list))

Learning rate set to 0.074361
0:	learn: 62498.8942033	total: 5.22ms	remaining: 5.22s
1:	learn: 60241.4912719	total: 9.36ms	remaining: 4.67s
2:	learn: 58194.0376525	total: 13.5ms	remaining: 4.5s
3:	learn: 56379.7475905	total: 17.5ms	remaining: 4.36s
4:	learn: 54669.5596631	total: 21.4ms	remaining: 4.27s
5:	learn: 53116.2889236	total: 25.4ms	remaining: 4.2s
6:	learn: 51775.3949169	total: 29.3ms	remaining: 4.16s
7:	learn: 50558.3943366	total: 33.2ms	remaining: 4.11s
8:	learn: 49427.6630250	total: 37.6ms	remaining: 4.14s
9:	learn: 48390.4664225	total: 41.5ms	remaining: 4.11s
10:	learn: 47466.6949035	total: 45.5ms	remaining: 4.09s
11:	learn: 46636.7189452	total: 49.4ms	remaining: 4.06s
12:	learn: 45908.0468000	total: 53.2ms	remaining: 4.04s
13:	learn: 45239.6129436	total: 57.1ms	remaining: 4.02s
14:	learn: 44590.3713862	total: 61ms	remaining: 4s
15:	learn: 44005.8324312	total: 64.7ms	remaining: 3.98s
16:	learn: 43481.7280528	total: 68.7ms	remaining: 3.97s
17:	learn: 43000.4287989	total: 72.

In [None]:
y_4b=scaler.fit_transform(test)
y_4b=pca.transform(y_4b)
y_4b = catboost_model.predict(y_4b)
y_4b

In [None]:
#subb['Operational Energy']=y_4b
#subb.to_csv("simplesst.csv",index=False)


In [None]:
importances = catboost_model.feature_importances_

# Sort feature importances in descending order
sorted_importances = sorted(zip(importances, X.columns), reverse=True)

# Print the feature importances
for importance, feature in sorted_importances:
    print(f"{feature}: {importance}")


In [None]:
importance_scores = catboost_model.feature_importances_
feature_importances = dict(zip(X.columns, importance_scores))
top_8_features = sorted(feature_importances, key=feature_importances.get, reverse=True)[:8]

# Select only the top 5 features
X = X[top_8_features]
test = test[top_8_features]

# Set up the KFold object
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the list to store the mean squared errors (MSEs)
mse_list = []

# Loop over the folds
for train_index, val_index in kf.split(X):
    
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Scale the features
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)
    
    # Reduce dimensionality using PCA
    pca = PCA(n_components=2)
    X_train_fold = pca.fit_transform(X_train_fold)
    X_val_fold = pca.transform(X_val_fold)

    # Train the model and predict on the validation set
    catboost_model.fit(X_train_fold, y_train_fold)
    y_pred = catboost_model.predict(X_val_fold)

    # Compute the mean squared error
    mse = mean_squared_error(y_val_fold, y_pred)
    mse_list.append(mse)

# Compute the average mean squared error over the folds
avg_mse = np.mean(mse_list)
print("Average MSE:", avg_mse)

In [None]:
y_4bb=scaler.fit_transform(test)
y_4bb=pca.transform(y_4bb)
y_4bb = catboost_model.predict(y_4bb)
y_4bb

In [None]:
subb['Operational Energy']=y_4bb
subb.to_csv("top4545.csv",index=False)

# **using catboost **

In [25]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp38-cp38-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting plotly
  Downloading plotly-5.14.1-py2.py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 KB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: plotly, graphviz, catboost
Successfully installed catboost-1.2 graphviz-0.20.1 plotly-5.14.1
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [40]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Assuming you have your feature matrix X and target variable y
train = pd.read_csv("/kaggle/input/traintrain/Train.csv")
test = pd.read_csv("/kaggle/input/testing/Test (1).csv")



In [41]:
csv_file = pd.DataFrame()
csv_file['submission id'] = test['building'] + '_Town_' + test['Town'].astype(str)
csv_file2 = pd.DataFrame()
csv_file2['submission id'] = test['building'] + '_Town_' + test['Town'].astype(str)

In [42]:
# Preprocess train and test data
train = preprocess(train)
test = preprocess(test)



In [43]:

# x and y
X = train.drop(['Operational Energy'], axis=1)
y = train['Operational Energy']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Initialize the base estimator
model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6)
bagging = model

# Initialize the bagging regressor
#bagging = BaggingRegressor(base_estimator=base_estimator, n_estimators=10, random_state=42)

# Fit the bagging regressor to the training data
bagging.fit(X_train, y_train)

# Make predictions on the test data
y_pred = bagging.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

0:	learn: 59758.3970000	total: 5.42ms	remaining: 5.41s
1:	learn: 54960.7180318	total: 10.6ms	remaining: 5.3s
2:	learn: 50538.3023443	total: 15.4ms	remaining: 5.12s
3:	learn: 46677.6141513	total: 20.4ms	remaining: 5.07s
4:	learn: 43015.5249180	total: 25.2ms	remaining: 5.02s
5:	learn: 39768.9987489	total: 30ms	remaining: 4.97s
6:	learn: 36923.1274989	total: 34.5ms	remaining: 4.89s
7:	learn: 34425.0345963	total: 39.1ms	remaining: 4.85s
8:	learn: 32009.0460591	total: 43.7ms	remaining: 4.81s
9:	learn: 29758.8856080	total: 48.7ms	remaining: 4.82s
10:	learn: 27847.8906082	total: 53.5ms	remaining: 4.81s
11:	learn: 26107.1783995	total: 57.9ms	remaining: 4.77s
12:	learn: 24571.0048111	total: 63.3ms	remaining: 4.81s
13:	learn: 23112.2409315	total: 69.4ms	remaining: 4.88s
14:	learn: 21848.3477164	total: 73.9ms	remaining: 4.85s
15:	learn: 20683.7966238	total: 78.7ms	remaining: 4.84s
16:	learn: 19683.2476253	total: 83.3ms	remaining: 4.81s
17:	learn: 18774.1203789	total: 87.9ms	remaining: 4.79s
18:	l

In [45]:
y_last2 = bagging.predict(test)
y_last2 = y_last.astype(int)

y_last2

array([ 77808, 107816,  91893, ..., 231416,  63109, 184771])

In [47]:

csv_file3['Operational Energy']=y_last2
csv_file3.to_csv("sub3.csv",index=False)
csv_file3.head()

NameError: name 'csv_file3' is not defined

In [33]:
dim=pd.read_csv("/kaggle/working/sub2.csv")
dim.head()

Unnamed: 0,submission id,Operational Energy
0,Building_1_Town_1,77808
1,Building_100_Town_1,107816
2,Building_1000_Town_2,91893
3,Building_10000_Town_0,342155
4,Building_10005_Town_2,239923
