#Imports

In [None]:
!pip install catboost



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet,
                                  LogisticRegression, RidgeClassifier)
from sklearn.tree import (DecisionTreeRegressor, DecisionTreeClassifier)
from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier,
                              GradientBoostingClassifier, AdaBoostClassifier,
                              AdaBoostRegressor)
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                            QuadraticDiscriminantAnalysis)
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.linear_model import HuberRegressor, BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#Pre-Processing

In [None]:
strength = train.iloc[:, -1].values
data = train.iloc[:, :-1].values

In [None]:
print(data.dtype)
data = pd.DataFrame(data)
print(data.columns)

object
RangeIndex(start=0, stop=6, step=1)


In [None]:
print(data)

         0            1       2       3    4     5
0    B0001   Commercial  332.15   37.44   16  2930
1    B0002   Industrial  234.68  109.89   53  1122
2    B0003   Commercial  320.71    48.0  269  4852
3    B0004   Industrial   66.93  126.21  219  4069
4    B0005   Industrial  107.63   53.15   76   894
..     ...          ...     ...     ...  ...   ...
995  B0996  Residential   116.5   56.17  235  1807
996  B0997   Commercial  270.25    8.64   16  2857
997  B0998  Residential  464.39    5.36  222  2824
998  B0999  Residential  470.37   96.77   30  3661
999  B1000   Commercial  487.05  147.75   16  3053

[1000 rows x 6 columns]


In [None]:
data = pd.DataFrame(data)
print(data.shape)
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

data_encoded = encoder.fit_transform(data[[0, 1]])

encoded_columns = encoder.get_feature_names_out(input_features=[str(0), str(1)])
encoded_df = pd.DataFrame(data_encoded, columns=encoded_columns)

numeric_columns = data.drop([0, 1], axis=1)
data = pd.concat([encoded_df, numeric_columns], axis=1)

(1000, 6)


In [None]:
print(data.shape)

(1000, 1007)


#Models


In [None]:
models = {
    # Regressors
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Support Vector Regression': SVR(),
    'K-Nearest Neighbors Regressor': KNeighborsRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    'XGBoost Regressor': XGBRegressor(),
    'LightGBM Regressor': LGBMRegressor(),
    'CatBoost Regressor': CatBoostRegressor(learning_rate=0.1, iterations=1000, depth=6),
    'AdaBoost Regressor': AdaBoostRegressor(),
    'Huber Regressor': HuberRegressor(),
    'Bayesian Ridge Regression': BayesianRidge(),

    # Classifiers
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors Classifier': KNeighborsClassifier(),
    'Support Vector Classifier': SVC(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'Gradient Boosting Classifier': GradientBoostingClassifier(),
    'XGBoost Classifier': XGBClassifier(),
    'LightGBM Classifier': LGBMClassifier(),
    'CatBoost Classifier': CatBoostClassifier(learning_rate=0.1, iterations=1000, depth=6),
    'Naive Bayes Classifier': GaussianNB(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'MLP Classifier': MLPClassifier(),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Ridge Classifier': RidgeClassifier()
}


# Pre-Training


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data, strength, test_size=0.2, random_state=42)

X = data
y = strength

In [None]:
print(X.shape)
print(y.shape)
print(X_train.shape)
print(y_train.shape)

(1000, 1007)
(1000,)
(800, 1007)
(800,)


In [None]:
print(X.columns)
print(X_train.dtypes)

X_train.columns = X_train.columns.astype(str)
X_val.columns = X_val.columns.astype(str)

print(y_train[:5])
print(y_val[:5])

print(X_train.shape)
print(y_train.shape)

Index([      '0_B0001',       '0_B0002',       '0_B0003',       '0_B0004',
             '0_B0005',       '0_B0006',       '0_B0007',       '0_B0008',
             '0_B0009',       '0_B0010',
       ...
             '0_B0998',       '0_B0999',       '0_B1000',  '1_Commercial',
        '1_Industrial', '1_Residential',               2,               3,
                     4,               5],
      dtype='object', length=1007)
0_B0001          float64
0_B0002          float64
0_B0003          float64
0_B0004          float64
0_B0005          float64
                  ...   
1_Residential    float64
2                 object
3                 object
4                 object
5                 object
Length: 1007, dtype: object
['B' 'A+' 'B' 'B' 'A']
['A+' 'B' 'C' 'C' 'B']
(800, 1007)
(800,)


In [None]:
label_encoder_y = LabelEncoder()
y_train = label_encoder_y.fit_transform(y_train)
y_val = label_encoder_y.transform(y_val)

print(y_train[:5])
print(y_val[:5])

print(y_train.shape)

[2 1 2 2 0]
[1 2 3 3 2]
(800,)


In [None]:
print(X_train.dtypes)
print(X_val.dtypes)

print(X_train.shape)

0_B0001          float64
0_B0002          float64
0_B0003          float64
0_B0004          float64
0_B0005          float64
                  ...   
1_Residential    float64
2                 object
3                 object
4                 object
5                 object
Length: 1007, dtype: object
0_B0001          float64
0_B0002          float64
0_B0003          float64
0_B0004          float64
0_B0005          float64
                  ...   
1_Residential    float64
2                 object
3                 object
4                 object
5                 object
Length: 1007, dtype: object
(800, 1007)


In [None]:
from sklearn.preprocessing import OneHotEncoder

X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)

for col in X_train.select_dtypes(include=['object']).columns:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    encoded = encoder.fit_transform(X_train[[col]])
    encoded_df_train = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([col]))
    X_train = pd.concat([X_train, encoded_df_train], axis=1)
    X_train = X_train.drop(columns=[col])

    encoded_df_val = encoder.transform(X_val[[col]])
    encoded_df_val = pd.DataFrame(encoded_df_val, columns=encoder.get_feature_names_out([col]))
    X_val = pd.concat([X_val, encoded_df_val], axis=1)
    X_val = X_val.drop(columns=[col])

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

X_train shape: (800, 3589)
X_val shape: (200, 3589)


In [None]:
print(X_train.shape)
print(y_train.shape)

(800, 3589)
(800,)


#MSE Training

In [None]:
mse_scores = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    mse_scores[name] = mse
    print(f'{name} - Mean Squared Error: {mse}')

Linear Regression - Mean Squared Error: 1.428722183152125
Ridge Regression - Mean Squared Error: 1.402409474867322
Lasso Regression - Mean Squared Error: 1.30555625
ElasticNet Regression - Mean Squared Error: 1.30555625
Decision Tree Regressor - Mean Squared Error: 2.14
Random Forest Regressor - Mean Squared Error: 1.461178
Support Vector Regression - Mean Squared Error: 1.3428427242829357
K-Nearest Neighbors Regressor - Mean Squared Error: 1.469
Gradient Boosting Regressor - Mean Squared Error: 1.3550106075099817
XGBoost Regressor - Mean Squared Error: 1.478626648796255
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000855 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score 1.477500
LightGBM Regressor - M

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Huber Regressor - Mean Squared Error: 1.429825998970041
Bayesian Ridge Regression - Mean Squared Error: 1.328059485184926
Logistic Regression - Mean Squared Error: 2.58
K-Nearest Neighbors Classifier - Mean Squared Error: 2.185
Support Vector Classifier - Mean Squared Error: 2.4
Decision Tree Classifier - Mean Squared Error: 2.755
Random Forest Classifier - Mean Squared Error: 2.445
Gradient Boosting Classifier - Mean Squared Error: 3.15
XGBoost Classifier - Mean Squared Error: 2.23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score -1.332754
[LightGBM] [Info] Start training from score -1.421922
[LightGBM] [Info] Start training from score -1.391307
[LightGBM] [Info] Start training from score -1.401408
LightGBM Clas



AdaBoost Classifier - Mean Squared Error: 3.215
MLP Classifier - Mean Squared Error: 2.62


LinAlgError: SVD did not converge

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

mse_scores = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    mse = mean_squared_error(y_val, y_pred)
    mse_scores[name] = mse
    print(f'{name} - Mean Squared Error: {mse}')

Linear Regression - Mean Squared Error: 1.36406132892197
Ridge Regression - Mean Squared Error: 1.3630061753501723
Lasso Regression - Mean Squared Error: 1.30555625
ElasticNet Regression - Mean Squared Error: 1.30555625
Decision Tree Regressor - Mean Squared Error: 2.09
Random Forest Regressor - Mean Squared Error: 1.4838775000000002
Support Vector Regression - Mean Squared Error: 1.328032263095152
K-Nearest Neighbors Regressor - Mean Squared Error: 1.5084000000000004
Gradient Boosting Regressor - Mean Squared Error: 1.360755435334978
XGBoost Regressor - Mean Squared Error: 1.478626648796255
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score 1.477500
LightGBM Regressor - Mean Squared Error: 1.315862238830025
0:	lea

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Huber Regressor - Mean Squared Error: 1.3628143956413135
Bayesian Ridge Regression - Mean Squared Error: 1.36303574467862
Logistic Regression - Mean Squared Error: 2.72
K-Nearest Neighbors Classifier - Mean Squared Error: 3.075
Support Vector Classifier - Mean Squared Error: 3.335
Decision Tree Classifier - Mean Squared Error: 2.265
Random Forest Classifier - Mean Squared Error: 2.865
Gradient Boosting Classifier - Mean Squared Error: 3.25
XGBoost Classifier - Mean Squared Error: 2.23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score -1.332754
[LightGBM] [Info] Start training from score -1.421922
[LightGBM] [Info] Start training from score -1.391307
[LightGBM] [Info] Start training from score -1.401408
LightGBM Cl



AdaBoost Classifier - Mean Squared Error: 3.275
MLP Classifier - Mean Squared Error: 1.595
Linear Discriminant Analysis - Mean Squared Error: 2.82




Quadratic Discriminant Analysis - Mean Squared Error: 2.425
Ridge Classifier - Mean Squared Error: 2.81


In [None]:
best_model_name = min(mse_scores, key=mse_scores.get)
best_model = models[best_model_name]

print(f'The best model is {best_model_name} with an MSE of {mse_scores[best_model_name]}')

The best model is Lasso Regression with an MSE of 1.30555625


#F1 Score

In [None]:
from sklearn.metrics import f1_score

##Weighted

In [None]:
f1_scores_1 = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)

    if y_pred.ndim > 1:
        y_pred = y_pred.argmax(axis=1)
    else:
        y_pred = (y_pred > 0.5).astype(int)

    f1 = f1_score(y_val, y_pred, average='weighted')
    f1_scores_1[name] = f1
    print(f'{name} - F1 Score: {f1}')

Linear Regression - F1 Score: 0.0826530612244898
Ridge Regression - F1 Score: 0.0826530612244898
Lasso Regression - F1 Score: 0.0826530612244898
ElasticNet Regression - F1 Score: 0.0826530612244898
Decision Tree Regressor - F1 Score: 0.12421678179601787
Random Forest Regressor - F1 Score: 0.08967507181615066
Support Vector Regression - F1 Score: 0.0826530612244898
K-Nearest Neighbors Regressor - F1 Score: 0.09569373688942165
Gradient Boosting Regressor - F1 Score: 0.0826530612244898
XGBoost Regressor - F1 Score: 0.08114754098360656
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score 1.477500
LightGBM Regressor - F1 Score: 0.0826530612244898
0:	learn: 1.1261665	total: 10.2ms	remaining: 10.1s
1:	learn: 1.1259346	total

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Huber Regressor - F1 Score: 0.0826530612244898
Bayesian Ridge Regression - F1 Score: 0.0826530612244898
Logistic Regression - F1 Score: 0.16611614768098804
K-Nearest Neighbors Classifier - F1 Score: 0.15759142857142858
Support Vector Classifier - F1 Score: 0.14887112403100775
Decision Tree Classifier - F1 Score: 0.15847826086956524
Random Forest Classifier - F1 Score: 0.16836601307189544
Gradient Boosting Classifier - F1 Score: 0.188356925087108
XGBoost Classifier - F1 Score: 0.1646434023991276
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score -1.332754
[LightGBM] [Info] Start training from score -1.421922
[LightGBM] [Info] Start training from score -1.391307
[LightGBM] [Info] Start training from score -1.401408
L



AdaBoost Classifier - F1 Score: 0.15196355085243973
MLP Classifier - F1 Score: 0.19423481686166144
Linear Discriminant Analysis - F1 Score: 0.20141176470588235




Quadratic Discriminant Analysis - F1 Score: 0.13726435935198822
Ridge Classifier - F1 Score: 0.17050465838509318


In [None]:
best_model_name = max(f1_scores_1, key=f1_scores_1.get)
best_model = models[best_model_name]

print(f'The best model is {best_model_name} with an F1 score of {f1_scores_1[best_model_name]}')

The best model is Linear Discriminant Analysis with an F1 score of 0.20141176470588235


##Micro

In [None]:
f1_scores_2 = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)

    if y_pred.ndim > 1:
        y_pred = y_pred.argmax(axis=1)
    else:
        y_pred = (y_pred > 0.5).astype(int)

    f1 = f1_score(y_val, y_pred, average='micro')
    f1_scores_2[name] = f1
    print(f'{name} - F1 Score: {f1}')

Linear Regression - F1 Score: 0.225
Ridge Regression - F1 Score: 0.225
Lasso Regression - F1 Score: 0.225
ElasticNet Regression - F1 Score: 0.225
Decision Tree Regressor - F1 Score: 0.225
Random Forest Regressor - F1 Score: 0.22
Support Vector Regression - F1 Score: 0.225
K-Nearest Neighbors Regressor - F1 Score: 0.215
Gradient Boosting Regressor - F1 Score: 0.225
XGBoost Regressor - F1 Score: 0.22
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score 1.477500
LightGBM Regressor - F1 Score: 0.225
0:	learn: 1.1261665	total: 12.2ms	remaining: 12.2s
1:	learn: 1.1259346	total: 19ms	remaining: 9.47s
2:	learn: 1.1255675	total: 27.5ms	remaining: 9.14s
3:	learn: 1.1251862	total: 36.1ms	remaining: 8.99s
4:	learn: 1.1247165	tot

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Huber Regressor - F1 Score: 0.225
Bayesian Ridge Regression - F1 Score: 0.225
Logistic Regression - F1 Score: 0.245
K-Nearest Neighbors Classifier - F1 Score: 0.3
Support Vector Classifier - F1 Score: 0.3
Decision Tree Classifier - F1 Score: 0.265
Random Forest Classifier - F1 Score: 0.24
Gradient Boosting Classifier - F1 Score: 0.295
XGBoost Classifier - F1 Score: 0.235
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score -1.332754
[LightGBM] [Info] Start training from score -1.421922
[LightGBM] [Info] Start training from score -1.391307
[LightGBM] [Info] Start training from score -1.401408
LightGBM Classifier - F1 Score: 0.26
0:	learn: 1.3853365	total: 29.2ms	remaining: 29.2s
1:	learn: 1.3848831	total: 50.3ms	remai



AdaBoost Classifier - F1 Score: 0.28
MLP Classifier - F1 Score: 0.215
Linear Discriminant Analysis - F1 Score: 0.29




Quadratic Discriminant Analysis - F1 Score: 0.215
Ridge Classifier - F1 Score: 0.25


In [None]:
best_model_name = max(f1_scores_2, key=f1_scores_2.get)
best_model = models[best_model_name]

print(f'The best model is {best_model_name} with an F1 score of {f1_scores_2[best_model_name]}')

The best model is CatBoost Classifier with an F1 score of 0.305


##Macro

In [None]:
f1_scores_3 = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)

    if y_pred.ndim > 1:
        y_pred = y_pred.argmax(axis=1)
    else:
        y_pred = (y_pred > 0.5).astype(int)

    f1 = f1_score(y_val, y_pred, average='macro')
    f1_scores_3[name] = f1
    print(f'{name} - F1 Score: {f1}')

Linear Regression - F1 Score: 0.09183673469387756
Ridge Regression - F1 Score: 0.09183673469387756
Lasso Regression - F1 Score: 0.09183673469387756
ElasticNet Regression - F1 Score: 0.09183673469387756
Decision Tree Regressor - F1 Score: 0.12213970293054997
Random Forest Regressor - F1 Score: 0.09053497942386832
Support Vector Regression - F1 Score: 0.09183673469387756
K-Nearest Neighbors Regressor - F1 Score: 0.101318549595445
Gradient Boosting Regressor - F1 Score: 0.09183673469387756
XGBoost Regressor - F1 Score: 0.09016393442622951
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score 1.477500
LightGBM Regressor - F1 Score: 0.09183673469387756
0:	learn: 1.1261665	total: 20.2ms	remaining: 20.1s
1:	learn: 1.1259346	

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Huber Regressor - F1 Score: 0.09183673469387756
Bayesian Ridge Regression - F1 Score: 0.09183673469387756
Logistic Regression - F1 Score: 0.1583891735645776
K-Nearest Neighbors Classifier - F1 Score: 0.13385714285714287
Support Vector Classifier - F1 Score: 0.1247577519379845
Decision Tree Classifier - F1 Score: 0.15211049723756906
Random Forest Classifier - F1 Score: 0.16262353998203055
Gradient Boosting Classifier - F1 Score: 0.17936946902654868
XGBoost Classifier - F1 Score: 0.15443838604143947
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 3
[LightGBM] [Info] Start training from score -1.332754
[LightGBM] [Info] Start training from score -1.421922
[LightGBM] [Info] Start training from score -1.391307
[LightGBM] [Info] Start training from score -1.40140



AdaBoost Classifier - F1 Score: 0.13080540858318634
MLP Classifier - F1 Score: 0.13295194508009153
Linear Discriminant Analysis - F1 Score: 0.18823529411764706




Quadratic Discriminant Analysis - F1 Score: 0.13346833578792341
Ridge Classifier - F1 Score: 0.16239648033126294


In [None]:
best_model_name = max(f1_scores_3, key=f1_scores_3.get)
best_model = models[best_model_name]

print(f'The best model is {best_model_name} with an F1 score of {f1_scores_1[best_model_name]}')

The best model is Linear Discriminant Analysis with an F1 score of 0.20141176470588235


#F1 Scores Testing

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, f1_score, classification_report

In [None]:
data = pd.read_csv("train.csv")

label_encoder = LabelEncoder()
data['Building_Type'] = label_encoder.fit_transform(data['Building_Type'])
data['Energy_Efficiency_Rating'] = label_encoder.fit_transform(data['Energy_Efficiency_Rating'])
data['Building_ID'] = label_encoder.fit_transform(data['Building_ID'])

numerical_columns = [
    'Consumption_Energy_Usage',
    'Consumption_Energy_Ur_Consumption',
    'Occupants',
    'Floor_Area'
]

scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

X = data.drop(columns=['Energy_Efficiency_Rating'])
y = data['Energy_Efficiency_Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    # Regressors
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Support Vector Regression': SVR(),
    'K-Nearest Neighbors Regressor': KNeighborsRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    'XGBoost Regressor': XGBRegressor(),
    'LightGBM Regressor': LGBMRegressor(),
    'CatBoost Regressor': CatBoostRegressor(learning_rate=0.1, iterations=1000, depth=6, verbose=0),
    'AdaBoost Regressor': AdaBoostRegressor(),
    'Huber Regressor': HuberRegressor(),
    'Bayesian Ridge Regression': BayesianRidge(),

    # Classifiers
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors Classifier': KNeighborsClassifier(),
    'Support Vector Classifier': SVC(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'Gradient Boosting Classifier': GradientBoostingClassifier(),
    'XGBoost Classifier': XGBClassifier(),
    'LightGBM Classifier': LGBMClassifier(),
    'CatBoost Classifier': CatBoostClassifier(learning_rate=0.1, iterations=1000, depth=6, verbose=0),
    'Naive Bayes Classifier': GaussianNB(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'MLP Classifier': MLPClassifier(),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'Ridge Classifier': RidgeClassifier()
}

from sklearn.metrics import mean_squared_error, f1_score

results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if "Regressor" in name:
        mse = mean_squared_error(y_test, y_pred)
        results[name] = {"MSE": mse}

    elif "Classifier" in name:
        f1_weighted = f1_score(y_test, y_pred, average='weighted')
        f1_micro = f1_score(y_test, y_pred, average='micro')
        f1_macro = f1_score(y_test, y_pred, average='macro')

        results[name] = {
            "F1 Score (Weighted)": f1_weighted,
            "F1 Score (Micro)": f1_micro,
            "F1 Score (Macro)": f1_macro
        }

# Model performance summary
print("\nModel Performance Summary:")
for name, metrics in results.items():
    print(f"{name}: {metrics}")

Training Linear Regression...
Training Ridge Regression...
Training Lasso Regression...
Training ElasticNet Regression...
Training Decision Tree Regressor...
Training Random Forest Regressor...
Training Support Vector Regression...
Training K-Nearest Neighbors Regressor...
Training Gradient Boosting Regressor...
Training XGBoost Regressor...
Training LightGBM Regressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1258
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 6
[LightGBM] [Info] Start training from score 1.477500
Training CatBoost Regressor...
Training AdaBoost Regressor...
Training Huber Regressor...
Training Bayesian Ridge Regression...
Training Logistic Regression...
Training K-Nearest Neighbors Classifier...
Training Support Vector Classifier...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Decision Tree Classifier...
Training Random Forest Classifier...
Training Gradient Boosting Classifier...
Training XGBoost Classifier...
Training LightGBM Classifier...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1258
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 6
[LightGBM] [Info] Start training from score -1.332754
[LightGBM] [Info] Start training from score -1.421922
[LightGBM] [Info] Start training from score -1.391307
[LightGBM] [Info] Start training from score -1.401408
Training CatBoost Classifier...
Training Naive Bayes Classifier...
Training AdaBoost Classifier...




Training MLP Classifier...
Training Linear Discriminant Analysis...
Training Quadratic Discriminant Analysis...
Training Ridge Classifier...

Model Performance Summary:
Decision Tree Regressor: {'MSE': 2.875}
Random Forest Regressor: {'MSE': 1.4400775}
K-Nearest Neighbors Regressor: {'MSE': 1.5278}
Gradient Boosting Regressor: {'MSE': 1.4020450886635356}
XGBoost Regressor: {'MSE': 1.7235814366210598}
LightGBM Regressor: {'MSE': 1.70337599027466}
CatBoost Regressor: {'MSE': 1.641069144045738}
AdaBoost Regressor: {'MSE': 1.2948862451964276}
Huber Regressor: {'MSE': 1.2930947619003959}
K-Nearest Neighbors Classifier: {'F1 Score (Weighted)': 0.30145399917558413, 'F1 Score (Micro)': 0.305, 'F1 Score (Macro)': 0.295014126870227}
Support Vector Classifier: {'F1 Score (Weighted)': 0.253975780851981, 'F1 Score (Micro)': 0.31, 'F1 Score (Macro)': 0.23914253242936082}
Decision Tree Classifier: {'F1 Score (Weighted)': 0.1922273703911199, 'F1 Score (Micro)': 0.195, 'F1 Score (Macro)': 0.19012602274

#Predicting Test


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

data = pd.read_csv("train.csv")

label_encoder = LabelEncoder()
data['Building_Type'] = label_encoder.fit_transform(data['Building_Type'])
data['Energy_Efficiency_Rating'] = label_encoder.fit_transform(data['Energy_Efficiency_Rating'])
data['Building_ID'] = label_encoder.fit_transform(data['Building_ID'])

numerical_columns = [
    'Consumption_Energy_Usage',
    'Consumption_Energy_Ur_Consumption',
    'Occupants',
    'Floor_Area'
]

scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

X = data.drop(columns=['Energy_Efficiency_Rating'])
y = data['Energy_Efficiency_Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

test_data = pd.read_csv("test.csv")

test_data['Building_Type'] = label_encoder.transform(test_data['Building_Type'])
test_data['Building_ID'] = label_encoder.transform(test_data['Building_ID'])

test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

X_test_final = test_data.drop(columns=['Energy_Efficiency_Rating'])

y_pred = knn.predict(X_test_final)

test_data['Predictions'] = y_pred

test_data.to_csv("test_predictions.csv", index=False)

print(test_data[['Predictions']].head())

ValueError: y contains previously unseen labels: 'Residential'

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

data = pd.read_csv("train.csv")

X = data.drop(columns=['Energy_Efficiency_Rating'])
y = data['Energy_Efficiency_Rating']

categorical_columns = ['Building_Type', 'Building_ID']
numerical_columns = [
    'Consumption_Energy_Usage',
    'Consumption_Energy_Ur_Consumption',
    'Occupants',
    'Floor_Area'
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

f1_weighted = f1_score(y_test, y_pred, average='weighted')
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')

print(f"F1 Score (Weighted): {f1_weighted}")
print(f"F1 Score (Micro): {f1_micro}")
print(f"F1 Score (Macro): {f1_macro}")

test_data = pd.read_csv("test.csv")

X_test_final = test_data.drop(columns=[col for col in test_data.columns if col == 'Energy_Efficiency_Rating'], errors='ignore')

y_pred_final = model.predict(X_test_final)

test_data['Energy_Efficiency_Rating'] = y_pred_final

test_data.to_csv("test_predictions1.csv", index=False)

print(test_data[['Energy_Efficiency_Rating']].head())

F1 Score (Weighted): 0.27129392779590034
F1 Score (Micro): 0.27
F1 Score (Macro): 0.26952760222627087
  Energy_Efficiency_Rating
0                        A
1                        A
2                        A
3                        A
4                        A


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

data = pd.read_csv("train.csv")

X = data.drop(columns=['Energy_Efficiency_Rating'])
y = data['Energy_Efficiency_Rating']

categorical_columns = ['Building_Type', 'Building_ID']
numerical_columns = [
    'Consumption_Energy_Usage',
    'Consumption_Energy_Ur_Consumption',
    'Occupants',
    'Floor_Area'
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=10))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

f1_weighted = f1_score(y_test, y_pred, average='weighted')
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')

print(f"F1 Score (Weighted): {f1_weighted}")
print(f"F1 Score (Micro): {f1_micro}")
print(f"F1 Score (Macro): {f1_macro}")

test_data = pd.read_csv("test.csv")

X_test_final = test_data.drop(columns=[col for col in test_data.columns if col == 'Energy_Efficiency_Rating'], errors='ignore')

y_pred_final = model.predict(X_test_final)

test_data['Energy_Efficiency_Rating'] = y_pred_final

test_data.to_csv("test_predictions2.csv", index=False)

print(test_data[['Energy_Efficiency_Rating']].head())

F1 Score (Weighted): 0.25915092943885076
F1 Score (Micro): 0.26
F1 Score (Macro): 0.258496177667526
  Energy_Efficiency_Rating
0                        A
1                        A
2                        A
3                       A+
4                       A+


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

data = pd.read_csv("train.csv")

label_encoder = LabelEncoder()
label_encoder_buildings = LabelEncoder()
data['Building_Type'] = label_encoder.fit_transform(data['Building_Type'])
data['Energy_Efficiency_Rating'] = label_encoder.fit_transform(data['Energy_Efficiency_Rating'])
data['Building_ID'] = label_encoder_buildings.fit_transform(data['Building_ID'])
data['Energy_Usage_per_Floor_Area'] = data['Consumption_Energy_Usage'] / (data['Floor_Area'] + 1e-9)
data['Occupants_per_Floor_Area'] = data['Occupants'] / (data['Floor_Area'] + 1e-9)
data['Log_Energy_Usage'] = np.log1p(data['Consumption_Energy_Usage'])
data['Square_Floor_Area'] = data['Floor_Area'] ** 2

X = data.drop(columns=['Energy_Efficiency_Rating'])
y = data['Energy_Efficiency_Rating']

categorical_columns = ['Building_Type', 'Building_ID']
numerical_columns = [
    'Consumption_Energy_Usage',
    'Consumption_Energy_Ur_Consumption',
    'Occupants',
    'Floor_Area'
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

f1_weighted = f1_score(y_test, y_pred, average='weighted')
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')

print(f"F1 Score (Weighted): {f1_weighted}")
print(f"F1 Score (Micro): {f1_micro}")
print(f"F1 Score (Macro): {f1_macro}")

test_data = pd.read_csv("test.csv")

X_test_final = test_data.drop(columns=[col for col in test_data.columns if col == 'Energy_Efficiency_Rating'], errors='ignore')

y_pred_final = model.predict(X_test_final)
print(y_pred_final)
decoded_predictions = label_encoder.inverse_transform(y_pred_final)

test_data['Energy_Efficiency_Rating'] = decoded_predictions
test_data.to_csv("test_predictions3.csv", index=False)
print(test_data[['Energy_Efficiency_Rating']].head())

F1 Score (Weighted): 0.27129392779590034
F1 Score (Micro): 0.27
F1 Score (Macro): 0.26952760222627087
[0 0 2 0 1 1 2 2 0 1 0 3 0 0 2 1 0 1 0 3 0 2 2 2 0 3 0 1 2 2 2 3 2 0 1 1 0
 0 1 1 2 3 2 1 0 1 3 1 3 2 2 0 3 2 2 1 1 2 2 1 1 1 0 0 3 0 1 0 2 2 1 3 2 2
 0 0 1 0 0 0 1 0 3 0 2 0 2 0 1 0 2 0 1 1 2 2 2 3 1 0 3 3 3 2 1 2 3 1 1 1 0
 3 3 2 3 1 0 2 2 2 1 0 0 0 2 0 3 0 0 0 3 0 0 3 3 0 1 2 1 3 1 3 1 2 0 2 3 1
 0 1 2 3 2 0 2 0 0 0 0 0 0 0 1 2 0 0 0 3 3 0 1 3 0 2 1 2 0 1 0 0 2 3 0 2 0
 0 0 0 1 1 0 3 1 1 2 0 2 0 1 1 0 0 2 0 0 3 0 1 3 0 2 1 0 0 0 0 1 2 1 0 1 1
 1 3 3 0 2 3 3 0 2 3 0 1 0 0 1 2 1 0 2 1 0 0 1 2 1 0 0 3 0 0 3 2 0 2 0 2 2
 0 1 3 0 2 3 0 1 1 0 0 3 0 3 1 3 0 2 1 2 2 3 3 0 0 0 2 0 1 2 2 0 1 1 0 0 0
 1 0 2 2 1 1 2 2 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 3 0 0 0 1 0 1 1 2 3 0 2 0 3
 0 0 2 0 3 2 3 1 3 1 2 1 0 0 2 1 1 0 1 2 3 0 1 0 0 0 0 1 1 0 1 0 0 1 0 2 2
 2 2 0 1 0 3 3 2 1 0 2 0 2 1 2 1 3 0 1 0 2 2 0 0 1 0 1 0 0 2 1 2 0 2 1 3 1
 1 1 2 0 3 3 1 1 2 2 3 2 3 0 0 2 1 1 0 0 2 1 0 2 1 0 0 2 2 2 1 3 2 1 1 1 

#Suggestions

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import login
login("")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/QwQ-32B-Preview"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "How many r in strawberry."
messages = [
    {"role": "system", "content": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Downloading shards:   0%|          | 0/17 [00:00<?, ?it/s]

model-00014-of-00017.safetensors:  64%|######3   | 2.49G/3.90G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import openai
openai.api_key = "  "

In [None]:
Building_ID = "B0001"
Building_Type = "Commercial"
Consumption_Energy_Usage = 	332.15
Consumption_Energy_Ur_Consumption = 	37.44
Occupants = 	16
Floor_Area = 	2930
Energy_Efficiency_Rating = "B"

In [None]:
def ask_openai(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message['content'].strip()

user_prompt = '''
We are working on improving the energy efficiency of buildings in order to contribute to SDG-7: Affordable and Clean Energy. Below are the details for different buildings:

1. Building ID: {Building_ID}
2. Building Type: {Building_Type}
3. Consumption Energy Usage (kWh): {Consumption_Energy_Usage}
4. Consumption Energy per Occupant/Unit Area: {Consumption_Energy_Ur_Consumption}
5. Occupants: {Occupants}
6. Floor Area (sqm): {Floor_Area}
7. Energy Efficiency Rating: {Energy_Efficiency_Rating}

Please provide the following:
1. Actionable recommendations for each building to reduce energy consumption and become more energy-efficient.
2. Suggestions to enhance the use of renewable energy in these buildings.
3. How can these buildings contribute to SDG-4 by educating occupants on energy-saving practices and fostering a culture of sustainability??
'''
response = ask_openai(user_prompt)
print(response)

Sure! Below are actionable recommendations to improve energy efficiency for buildings, enhance renewable energy use, and contribute to SDG-4 through education.

### Building Recommendations

**1. Actionable Recommendations for Energy Efficiency:**

For each building, I will suggest generalized recommendations. You can personalize them based on specific data for each building type and usage patterns.

- **Building ID: {Building_ID}**
  
  - **Upgrade Insulation:** Improve insulation in walls, roofs, and windows to minimize heat loss or gain, reducing heating and cooling energy needs.
  
  - **Energy-Efficient Lighting:** Replace incandescent bulbs with LED lighting, which consumes significantly less energy and has a longer lifespan.
  
  - **Smart Thermostats:** Install programmable or smart thermostats that optimize heating and cooling schedules based on occupancy and usage patterns.
  
  - **Energy Audits:** Conduct regular energy audits to identify inefficiencies and areas for improv

## ML MODEL

In [None]:
data2 = pd.read_csv("/content/test (4).csv")
data2=data2.drop(columns=['Usage'])
print(data2)

label_encoder2 = LabelEncoder()
label_encoder_buildings2 = LabelEncoder()
data2['Building_Type'] = label_encoder2.fit_transform(data2['Building_Type'])
data2['Building_ID'] = label_encoder_buildings2.fit_transform(data2['Building_ID'])
data2['Energy_Usage_per_Floor_Area'] = data2['Consumption_Energy_Usage'] / (data2['Floor_Area'] + 1e-9)
data2['Occupants_per_Floor_Area'] = data2['Occupants'] / (data2['Floor_Area'] + 1e-9)
data2['Log_Energy_Usage'] = np.log1p(data2['Consumption_Energy_Usage'])
data2['Square_Floor_Area'] = data2['Floor_Area'] ** 2

numerical_columns = [
    'Consumption_Energy_Usage',
    'Consumption_Energy_Ur_Consumption',
    'Occupants',
    'Floor_Area'
]
scaler = StandardScaler()
data2[numerical_columns] = scaler.fit_transform(data2[numerical_columns])
print(data2)

y_pred2 = knn.predict(data2)


print("Predictions from K-Nearest Neighbors Classifier:")
print(y_pred2)
decoded_predictions = label_encoder.inverse_transform(y_pred2)
print("Decoded Predictions from K-Nearest Neighbors Classifier:")
print(decoded_predictions)




    Building_ID Building_Type  Consumption_Energy_Usage  \
0         T0362   Residential                    264.26   
1         T0074    Commercial                    267.24   
2         T0375   Residential                    284.99   
3         T0156   Residential                    487.26   
4         T0105    Industrial                    278.81   
..          ...           ...                       ...   
495       T0107    Commercial                    202.43   
496       T0271    Commercial                    207.60   
497       T0349    Commercial                    411.85   
498       T0436    Industrial                    141.49   
499       T0103    Commercial                    203.42   

     Consumption_Energy_Ur_Consumption  Occupants  Floor_Area  
0                               109.58         31        3257  
1                                22.76        205        2101  
2                               137.80         53        4052  
3                               119

In [None]:
import pandas as pd


df = pd.read_csv('/content/test_predictions.csv')


df['Energy_Efficiency_Rating'] = decoded_predictions


df.to_csv('updated_file.csv', index=False)
