In [40]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import re 
import json
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from scipy import stats



In [41]:
def preprocess(data):
    # Replace single quotes with double quotes in "random load mesures" column
    #data["random load measures"] = data["random load measures"].str.replace("'",'"')
    data['building'] = le.fit_transform(data['building'])
    data['Town'] = le.fit_transform(data['Town'])
    data['Total Wall Area'] = data['Wall Rt'] * data['Height'] * data['Number of Floors']
    #data['Total Window Area'] = data['Windows Rt'] * data['Height'] * data['WWR'] * data['Number of Floors']
    data['Total Floor Area'] = data['Ground Floor Rt'] * data['Height']
    data['Total Roof Area'] = data['Roof Rt'] * data['Height']
    data['Total Internal Wall Area'] = data['Internal Wall Rt'] * data['Height'] * data['Number of Floors']
    data['Total Internal Floor Area'] = data['Internal Floor Rt'] * data['Height']
    data['Overall Energy Consumption'] = data['Operating Hours'] * data['EUI']

    
    pattern = r'\d+\.\d+'
    wwr = data['WWR'].apply(lambda x: max([float(m) for m in re.findall(pattern, x)]))
    data['WWR'] = wwr
    # assuming 'File' contains unique file names for each building shape, it can be removed
    
    
    features = data['random load mesures'].apply(lambda x: json.loads(x))
    cooling = features.apply(lambda x: float(x['Cooling'].replace(":C","")))
    lights = features.apply(lambda x: float(x['Lights'].replace(":C","")))
    data['Coolings'] = cooling
    data['Lights'] = lights
    
    data['Lighting Impact on Heat Load'] = data['Lights'] * data['Light Heat Gain']
    data['Energy Efficiency'] = data['EUI'] / data['Number of Floors']
    data['Lighting Heat Addition Rate'] = data['Light Heat Gain'] / data['Operating Hours']
    

    scaler = StandardScaler()
    data[['Cooling Setpoint','Coolings','Lights', 'EUI', 'Cooling COP', 'Operating Hours', 'WWR',      'Equipment Heat Gain', 'Internal Wall Rt', 'Internal Floor Rt', 'Infiltration','Ground Floor Rt', 'Number of Floors', 'Occupancy', 'Light Heat Gain', 'Windows Rt',      'Height', 'Heating COP', 'Heating Setpoint', 'Wall Rt', 'Start Time', 'windows g-value',      'Roof Rt', 'Boiler Efficiency', 'Internal Mass', 'Permeability', 'Total Floors Area',]] = scaler.fit_transform(data[['Cooling Setpoint','Coolings','Lights', 'EUI', 'Cooling COP', 'Operating Hours', 'WWR',                                                              'Equipment Heat Gain', 'Internal Wall Rt', 'Internal Floor Rt', 'Infiltration',                                                              'Ground Floor Rt', 'Number of Floors', 'Occupancy', 'Light Heat Gain', 'Windows Rt',                                                              'Height', 'Heating COP', 'Heating Setpoint', 'Wall Rt', 'Start Time', 'windows g-value',                                                              'Roof Rt', 'Boiler Efficiency', 'Internal Mass', 'Permeability', 'Total Floors Area']])
    
    interaction_features = ['Overall Energy Consumption', 'Lighting Impact on Heat Load', 'Energy Efficiency','Lighting Heat Addition Rate']
    for feature1 in interaction_features:
        for feature2 in interaction_features:
            if feature1 != feature2:
                interaction_feature_name = f'{feature1} x {feature2}'
                data[interaction_feature_name] = data[feature1] * data[feature2]

    # assuming we want to select the top 10 features with the highest F-test score
    selector = SelectKBest(f_regression, k=10)
    data = data.drop(['random load mesures', 'Permeability','File','building'], axis=1)
    #data = selector.fit_transform(data.drop('Operational Energy', axis=1), data['Operational Energy'])
    # assume your data is stored in a NumPy array called `data`
    for col in data.columns:
        if col == 'Operational Energy':
            continue  # skip target variable
        median = np.median(data[col])
        q1 = np.percentile(data[col], 25)
        q3 = np.percentile(data[col], 75)
        iqr = q3 - q1
        upper_bound = q3 + 1.5 * iqr
        lower_bound = q1 - 1.5 * iqr
        data.loc[data[col] > upper_bound, col] = median
        data.loc[data[col] < lower_bound, col] = median

    return data


In [None]:
train=pd.read_csv("/kaggle/input/traintrain/Train.csv")
test = pd.read_csv("/kaggle/input/testtest/Test (2).csv")
csv_file = pd.DataFrame()
csv_file['submission id'] = test['building'] + '_Town_' + test['Town'].astype(str)

csv_file.head()
csv_file1 = pd.DataFrame()
csv_file1['submission id'] = test['building'] + '_Town_' + test['Town'].astype(str)

csv_file1.shape

In [None]:

# Preprocess train and test data
train = preprocess(train)
print(test.shape)
test = preprocess(test)
print(test.shape)



In [None]:
#diff_cols = set(test.columns).difference(X.columns)
#print(test['Operational Energy'])
#test=test.drop('Operational Energy',axis=1)
test.shape

In [None]:
from catboost import CatBoostRegressor


catboost_model = CatBoostRegressor()

X = train.drop(['Operational Energy'], axis=1)
#test=test.drop(['Operational Energy'])
y = train['Operational Energy']
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the list to store the mean squared errors (MSEs)
mse_list = []
test.shape

In [None]:
for train_index, val_index in kf.split(X):
    
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_val_fold_scaled = scaler.transform(X_val_fold)
    
    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=10)
    X_train_fold_pca = pca.fit_transform(X_train_fold_scaled)
    X_val_fold_pca = pca.transform(X_val_fold_scaled)
    
    # Train the model
    catboost_model.fit(X_train_fold_pca, y_train_fold)
    
    # Make predictions on the validation set and calculate MSE
    y_pred = catboost_model.predict(X_val_fold_pca)
    mse = np.mean((y_val_fold - y_pred)**2)
    
    # Append the MSE to the list
    mse_list.append(mse)
    
# Print the mean of the MSEs
print("Mean MSE:", np.mean(mse_list))

In [None]:
y_4b=scaler.fit_transform(test)
y_4b=pca.transform(y_4b)
y_4b = catboost_model.predict(y_4b)
y_4b

In [None]:
csv_file['Operational Energy']=y_4b
csv_file.to_csv("simplesimple1235.csv",index=False)
csv_file.shape

In [None]:
importances = catboost_model.feature_importances_

# Sort feature importances in descending order
sorted_importances = sorted(zip(importances, X.columns), reverse=True)

# Print the feature importances
for importance, feature in sorted_importances:
    print(f"{feature}: {importance}")


In [None]:
importance_scores = catboost_model.feature_importances_
feature_importances = dict(zip(X.columns, importance_scores))
top_8_features = sorted(feature_importances, key=feature_importances.get, reverse=True)[:8]

# Select only the top 5 features
X = X[top_8_features]
test = test[top_8_features]

# Set up the KFold object
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the list to store the mean squared errors (MSEs)
mse_list = []

# Loop over the folds
for train_index, val_index in kf.split(X):
    
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Scale the features
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)
    
    # Reduce dimensionality using PCA
    pca = PCA(n_components=2)
    X_train_fold = pca.fit_transform(X_train_fold)
    X_val_fold = pca.transform(X_val_fold)

    # Train the model and predict on the validation set
    catboost_model.fit(X_train_fold, y_train_fold)
    y_pred = catboost_model.predict(X_val_fold)

    # Compute the mean squared error
    mse = np.mean((y_val_fold - y_pred)**2)
    mse_list.append(mse)

# Compute the average mean squared error over the folds
avg_mse = np.mean(mse_list)
print("Average MSE:", avg_mse)

In [None]:
y_test=scaler.fit_transform(test)
y_test_pca=pca.transform(y_test)
y_test_last = catboost_model.predict(y_test_pca)
y_test_last.shape

In [None]:
csv_file1['Operational Energy']=y_test_last
csv_file1.to_csv("top10.csv",index=False)
csv_file1.shape

In [43]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# load the dataset
df = pd.read_csv('/kaggle/input/traintrain/Train.csv')
test = pd.read_csv('/kaggle/input/testtest/Test (2).csv')

# preprocess the data
train = preprocess(train)
test = preprocess(test)

# select features and target variable

X = train.drop(['Operational Energy'], axis=1)
#test=test.drop(['Operational Energy'])
y = train['Operational Energy']

# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create polynomial features of degree 2
poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)
test_poly = poly_features.transform(test)

# apply L2 regularization using Ridge regression
alpha = 0.1  # regularization strength
model = Ridge(alpha=alpha)
model.fit(X_train_poly, y_train)

# make predictions on the test set
y_pred = model.predict(X_test_poly)

# evaluate the model's performance
mse = mean_squared_error(y_pred, y_test)
print("Mean squared error:", mse)


KeyError: 'building'

In [None]:
csv_file2 = pd.DataFrame()
csv_file2['submission id'] = test['building'] + '_Town_' + test['Town'].astype(str)
csv_file2['Operational Energy']=y_test_last
csv_file2.to_csv("top10.csv",index=False)
csv_file2.shape