In [None]:
pip install openpyxl

In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler



# Read the effort dataset from the CSV that sits next to the notebook.
file_path = 'nasa93dataset.csv'
dataset = pd.read_csv(file_path)

# Every column except the last one is a model input; the last column is the target.
features_frame, target_series = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X = features_frame.values
y = target_series.values

# Standardised copy of the inputs, used by the "scaled" experiments further down.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Preview only the first rows; the complete feature matrix is not a useful cell output.
X[:5]

In [None]:
import numpy as np

def calculate_mmre(actuals, predictions):
    """Return the Mean Magnitude of Relative Error of `predictions`.

    Each error is |actual - prediction| / actual; an actual value of exactly 0 is
    replaced by 1e-10 in the denominator so the division stays finite.

    Args:
        actuals: sequence of true values.
        predictions: sequence of predicted values, same length as `actuals`.

    Returns:
        The mean of the per-sample relative errors.
    """
    truth = np.array(actuals)
    estimate = np.array(predictions)

    # Swap zero actuals for a tiny constant before dividing.
    safe_denominator = np.where(truth == 0, 1e-10, truth)
    magnitudes = np.abs((truth - estimate) / safe_denominator)
    return magnitudes.mean()

def calculate_bmmre(actuals, all_predictions):
    """Return the mean of the best (smallest) relative error per sample.

    For every sample the relative error of each prediction set is computed and the
    minimum across the sets is kept; the result is the mean of those minima.

    Args:
        actuals: sequence of true values.
        all_predictions: sequence of prediction sequences, one per model, each
            the same length as `actuals`.

    Returns:
        The mean of the per-sample minimum relative errors.
    """
    truth = np.array(actuals)
    prediction_sets = np.array(all_predictions)

    # A zero actual is replaced by 1e-10 in the denominator.
    safe_denominator = np.where(truth == 0, 1e-10, truth)

    per_model_errors = []
    for model_predictions in prediction_sets:
        per_model_errors.append(np.abs((truth - model_predictions) / safe_denominator))

    # Axis 0 runs over the models, so this keeps the best model for each sample.
    best_per_sample = np.min(per_model_errors, axis=0)
    return np.mean(best_per_sample)

def pred_25(y_true, y_pred):
    """Return PRED(25): the fraction of predictions within 25% of the actual value.

    The relative error is |actual - prediction| / |actual|. An actual value of
    exactly 0 is replaced by 1e-10 in the denominator, the same guard that
    calculate_mmre uses, so a zero actual no longer produces inf/NaN. Taking the
    absolute value of the whole ratio keeps a negative actual from yielding a
    negative "error" that would always count as a hit.

    Args:
        y_true: sequence of true values.
        y_pred: sequence of predicted values, same length as `y_true`.

    Returns:
        Number of samples with relative error <= 0.25 divided by len(y_true).
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    safe_actuals = np.where(y_true == 0, 1e-10, y_true)
    relative_error = np.abs((y_true - y_pred) / safe_actuals)
    within_25_percent = np.sum(relative_error <= 0.25)

    return within_25_percent / len(y_true)

In [None]:
def calculate_metrics(actuals, predictions_list):
    """Score several prediction sets against the same actual values.

    Args:
        actuals: sequence of true values.
        predictions_list: iterable of prediction sequences, one per model.

    Returns:
        A tuple (maes, pred25, mmre) of three lists, each holding one score per
        entry of `predictions_list`, in the same order.
    """
    # Materialise once so the three passes below see the same sequence.
    prediction_sets = list(predictions_list)

    maes = [mean_absolute_error(actuals, candidate) for candidate in prediction_sets]
    pred25 = [pred_25(actuals, candidate) for candidate in prediction_sets]
    mmre = [calculate_mmre(actuals, candidate) for candidate in prediction_sets]

    return maes, pred25, mmre

In [None]:
import pandas as pd

# The results workbook is named after the dataset file, with ".xlsx" appended.
file_name = file_path + ".xlsx"

# Write an empty workbook up front: save_to_excel opens this file in append mode,
# so the file has to exist before the first result sheet is added.
pd.DataFrame().to_excel(excel_writer=file_name, index=False)

In [None]:
def save_to_excel(file_name, name, ensemble_df, individual_df):
    """Write the ensemble and individual result tables side by side to one sheet.

    The shorter table is padded with empty rows so both have the same height, and
    a blank, unnamed column is placed between them as a visual separator.

    Args:
        file_name: path of the Excel workbook.
        name: sheet name to write to.
        ensemble_df: DataFrame with the ensemble results.
        individual_df: DataFrame with the individual-model results.

    Notes:
        If the workbook exists, the sheet is added to it and an existing sheet of
        the same name is replaced, so re-running a cell does not raise. If the
        workbook does not exist yet, it is created.
    """
    import os  # local import: only this helper needs it

    max_rows = max(len(ensemble_df), len(individual_df))
    # Reindexing onto 0..max_rows-1 pads the shorter table with NaN rows.
    ensemble_df = ensemble_df.reindex(range(max_rows))
    individual_df = individual_df.reindex(range(max_rows))

    separator_column = pd.Series([None] * max_rows, name="")

    df_combined = pd.concat([ensemble_df, separator_column, individual_df], axis=1)

    # `if_sheet_exists` is only accepted in append mode, and append mode requires
    # the file to exist, so choose the writer options based on the file's presence.
    if os.path.exists(file_name):
        writer_options = {"mode": "a", "if_sheet_exists": "replace"}
    else:
        writer_options = {"mode": "w"}

    with pd.ExcelWriter(file_name, engine="openpyxl", **writer_options) as writer:
        df_combined.to_excel(writer, sheet_name=f"{name}", index=False)

In [None]:
def display_results(ensemble_models, individual_models, ensemble_mae, ensemble_pred25,ensemble_mmre,individual_mae, individual_pred25,individual_mmre,ensemble_bmmre):
    """Build, print and return the ensemble and individual result tables.

    Args:
        ensemble_models: labels of the ensembles.
        individual_models: labels of the individual models.
        ensemble_mae, ensemble_pred25, ensemble_mmre, ensemble_bmmre: one score
            per ensemble.
        individual_mae, individual_pred25, individual_mmre: one score per
            individual model.

    Returns:
        A tuple (ensemble_df, individual_df) of the two DataFrames that were printed.
    """
    ensemble_columns = {
        "Ensemble Models with": ensemble_models,
        "MAE": ensemble_mae,
        "MMRE": ensemble_mmre,
        "BMMRE": ensemble_bmmre,
        "PRED(25)": ensemble_pred25,
    }
    individual_columns = {
        "Individual Model": individual_models,
        "MAE": individual_mae,
        "MMRE": individual_mmre,
        "PRED(25)": individual_pred25,
    }
    ensemble_df = pd.DataFrame(ensemble_columns)
    individual_df = pd.DataFrame(individual_columns)

    # One print call with a newline separator emits the same text as printing
    # each piece on its own.
    print(
        "Ensemble Models:\n",
        ensemble_df.to_string(index=False),
        "\n------------------------------------------------------",
        "\nIndividual Models:\n",
        individual_df.to_string(index=False),
        sep="\n",
    )

    return ensemble_df, individual_df

**CBR**

In [None]:
from scipy.spatial.distance import cdist

class CBR:
  """Case-based (analogy) estimator using a rank-weighted k-nearest-neighbour rule.

  Note that `fit` does not optimise anything: it stores the training cases and
  draws random feature weights and a random k. The draws use the global `numpy`
  and `random` generators, so results vary between runs unless those are seeded.
  """

  def __init__(self):
    self.best_weights = None
    self.best_k = None

  def fit(self,X_train,y_train):
    """Store the training cases and draw the feature weights and k.

    Args:
      X_train: 2-D array of training features (one row per case).
      y_train: target values, one per row of `X_train`.
    """
    self.X = X_train
    self.y = y_train
    # Random non-negative feature weights, normalised to sum to 1.
    self.best_weights = np.random.rand(X_train.shape[1])
    self.best_weights /= self.best_weights.sum()
    self.best_k = random.randint(1, 10)

  def predict_effort(self,X_train, y_train, X_test, weights, k):
    """Predict a target for each test row from its k nearest training cases.

    Features are multiplied by `weights` before Euclidean distances are taken.
    The i-th nearest neighbour (i = 0 is the closest) contributes with weight
    2**(k-i-1) / (2**k - 1), so the weights halve with rank and sum to 1.

    Args:
      X_train: 2-D array of training features.
      y_train: target values, one per row of `X_train`.
      X_test: 2-D array of rows to predict.
      weights: per-feature multipliers, broadcastable against the feature rows.
      k: number of neighbours; values larger than the number of training cases
        are clamped to that number.

    Returns:
      1-D numpy array with one prediction per row of `X_test`.

    Raises:
      ValueError: if there is no training case or `k` is smaller than 1.
    """
    # Asking for more neighbours than there are cases used to raise IndexError.
    k = min(int(k), len(X_train))
    if k < 1:
      raise ValueError("predict_effort needs at least one training case and k >= 1")

    weighted_X_train = X_train * weights
    weighted_X_test = X_test * weights

    distances = cdist(weighted_X_test, weighted_X_train, metric='euclidean')

    # The rank weights depend only on k, so compute them once for all test rows.
    ranking_weights = np.array([(2 ** (k - i - 1)) / (2 ** k - 1) for i in range(k)])

    # Row r holds the indices of the k closest training cases of test row r.
    neighbors = np.argsort(distances, axis=1)[:, :k]

    return np.asarray(y_train)[neighbors] @ ranking_weights

  def predict(self,X_test):
    """Predict with the cases, weights and k stored by `fit`."""
    return self.predict_effort(self.X, self.y, X_test, self.best_weights, self.best_k)


**COCOMO**

In [None]:
import tensorflow as tf

# COCOMO base equation: Effort = a*(KLOC^b)*EAF
class COCOMO(tf.keras.Model):
    """COCOMO-style effort model with a learned multiplicative correction.

    The prediction is a * KLOC**b * EAF * (1 + correction), where EAF is the
    product of the cost-driver columns and `correction` is the output of a small
    dense network fed with the same cost drivers.

    Args:
        n_cost_drivers: number of leading input columns treated as cost drivers.
            The size column is read from the column right after them. Defaults
            to 15, the slice the model has always used.
    """

    def __init__(self, n_cost_drivers=15):
        super().__init__()
        self.n_cost_drivers = n_cost_drivers
        # Trainable scale and exponent of the base equation (initial values).
        self.a = tf.Variable(2.94, trainable=True)
        self.b = tf.Variable(1.12, trainable=True)
        # The input width must equal the number of cost-driver columns passed in
        # call(); it was declared as 14 while 15 columns were fed to the network.
        self.nn = tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation='relu', input_shape=(n_cost_drivers,)),
            tf.keras.layers.Dense(1, activation='linear')
        ])

    def call(self, inputs):
        """Return one effort estimate per input row as a 1-D tensor."""
        # NumPy inputs arrive as float64; the variables and layers are float32.
        inputs = tf.cast(inputs, tf.float32)
        drivers = inputs[:, :self.n_cost_drivers]
        kloc = inputs[:, self.n_cost_drivers]  # size column
        eaf = tf.reduce_prod(drivers, axis=1)  # Product of cost drivers
        base_effort = self.a * (kloc ** self.b) * eaf
        # The network returns shape (n, 1); without the squeeze the product with
        # the (n,) base effort broadcasts to an (n, n) matrix.
        correction = tf.squeeze(self.nn(drivers), axis=-1)
        return base_effort * (1 + correction)

    def predict(self, X):
        """Run a forward pass on `X` and return the estimates as a flat numpy array."""
        return self.call(X).numpy().flatten()

**Combined models**

In [None]:
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

# For ensemble model combination rule
def linear_combination(y1, y2, y3, weights):
    """Return the weighted sum of three predictions.

    `weights[0]`, `weights[1]` and `weights[2]` multiply `y1`, `y2` and `y3`
    respectively; any further entries of `weights` are ignored.
    """
    first_term = weights[0] * y1
    second_term = weights[1] * y2
    third_term = weights[2] * y3
    return first_term + second_term + third_term

def median_combination(y1, y2, y3):
    """Return the element-wise median of three equally shaped prediction arrays."""
    # Stack the three predictions along a new leading axis, then reduce over it.
    stacked = np.stack((y1, y2, y3))
    return np.median(stacked, axis=0)

In [None]:
# Row labels for the result tables. The ensemble labels name the learner that is
# combined with the first two individual models in each experiment.
ensemble_models = [
    "ANN",
    "KNN",
    "XGBooster",
    "SVR",
]
# The individual list is the two fixed members followed by the same four learners.
individual_models = ["CBR", "COCOMO"] + ensemble_models

**LOOCV - scaled - median**

In [None]:
import time  # cell-local: only used for the timing report below

# Leave-one-out evaluation on standardised features; ensembles use the median rule.
loo = LeaveOneOut()
predictions = []
actuals = []

# Out-of-fold predictions of the four ensembles (CBR + COCOMO + one more learner).
combined_predictions_1=[]
combined_predictions_2=[]
combined_predictions_3=[]
combined_predictions_4=[]

model_1 = CBR()
model_3 = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, learning_rate_init=0.01, activation='relu', solver='adam', random_state=42, tol=1e-4)
model_4 = KNeighborsRegressor(n_neighbors=5)
model_5 = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model_6 = SVR(kernel='linear', C=100, gamma=0.1, epsilon=0.1)

# Out-of-fold predictions of the six individual models.
predictions1=[]
predictions2=[]
predictions3=[]
predictions4=[]
predictions5=[]
predictions6=[]

k = 1  # 1-based number of the fold about to run

start_time = time.perf_counter()

for train_index, test_index in loo.split(X_scaled):

    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model_1.fit(X_train, y_train)

    # COCOMO is rebuilt for every fold: a Keras model keeps its weights between
    # fit() calls, so reusing one instance would carry over training on rows that
    # are held out in later folds. It is trained on the raw features because its
    # call() raises the size column to a power and multiplies the cost drivers,
    # and it is evaluated on the raw test row below; training on standardised
    # values would not match the inputs it predicts on.
    model_2 = COCOMO()
    model_2.compile(optimizer='adam', loss='mae')
    # verbose=0 keeps 500 epochs per fold from flooding the cell output.
    model_2.fit(X[train_index], y_train, epochs=500, batch_size=16, verbose=0)

    model_3.fit(X_train, y_train)
    model_4.fit(X_train, y_train)
    model_5.fit(X_train, y_train)
    model_6.fit(X_train, y_train)

    y_pred_1 = model_1.predict(X_test)
    y_pred_2 = model_2.predict(X[test_index])
    y_pred_3 = model_3.predict(X_test)
    y_pred_4 = model_4.predict(X_test)
    y_pred_5 = model_5.predict(X_test)
    y_pred_6 = model_6.predict(X_test)

    y_pred_combined_1 = median_combination(y_pred_1, y_pred_2, y_pred_3)
    y_pred_combined_2 = median_combination(y_pred_1, y_pred_2, y_pred_4)
    y_pred_combined_3 = median_combination(y_pred_1, y_pred_2, y_pred_5)
    y_pred_combined_4 = median_combination(y_pred_1, y_pred_2, y_pred_6)

    combined_predictions_1.extend(y_pred_combined_1)
    combined_predictions_2.extend(y_pred_combined_2)
    combined_predictions_3.extend(y_pred_combined_3)
    combined_predictions_4.extend(y_pred_combined_4)

    actuals.extend(y_test)

    predictions1.extend(y_pred_1)
    predictions2.extend(y_pred_2)
    predictions3.extend(y_pred_3)
    predictions4.extend(y_pred_4)
    predictions5.extend(y_pred_5)
    predictions6.extend(y_pred_6)

    k = k + 1

# `execution_time` was printed without ever being assigned; measure it here.
execution_time = time.perf_counter() - start_time
print(f"Execution time: {execution_time} seconds")
ensemble_predictions = [combined_predictions_1, combined_predictions_2, combined_predictions_3, combined_predictions_4]
individual_predictions = [predictions1, predictions2, predictions3, predictions4, predictions5, predictions6]

ensemble_mae, ensemble_pred25, ensemble_mmre = calculate_metrics(actuals, ensemble_predictions)
individual_mae, individual_pred25, individual_mmre = calculate_metrics(actuals, individual_predictions)

# Best-member MMRE of each ensemble's three constituent models.
bmmre = []
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions3]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions4]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions5]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions6]))

ensemble_df, individual_df = display_results(ensemble_models, individual_models, ensemble_mae, ensemble_pred25, ensemble_mmre,individual_mae, individual_pred25,individual_mmre,bmmre)


save_to_excel(file_name, "loocv-scaled-median", ensemble_df, individual_df)

In [None]:
# Recompute the metrics of the run above from the collected predictions.
ensemble_predictions = [combined_predictions_1, combined_predictions_2, combined_predictions_3, combined_predictions_4]
individual_predictions = [predictions1, predictions2, predictions3, predictions4, predictions5, predictions6]

# calculate_metrics returns three lists (MAE, PRED(25), MMRE); unpacking them into
# two names raised ValueError.
ensemble_mae, ensemble_pred25, ensemble_mmre = calculate_metrics(actuals, ensemble_predictions)
individual_mae, individual_pred25, individual_mmre = calculate_metrics(actuals, individual_predictions)

**loocv - unscaled - median**

In [None]:
from sklearn.model_selection import KFold
# Leave-one-out evaluation on the raw features; ensembles use the median rule.
loocv=LeaveOneOut()

predictions = []
actuals = []

# Out-of-fold predictions of the four ensembles (CBR + COCOMO + one more learner).
combined_predictions_1=[]
combined_predictions_2=[]
combined_predictions_3=[]
combined_predictions_4=[]

model_1 = CBR()
model_3 = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, learning_rate_init=0.01, activation='relu', solver='adam', random_state=42)
model_4 = KNeighborsRegressor(n_neighbors=5)
model_5 = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model_6 = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)

# Out-of-fold predictions of the six individual models.
predictions1=[]
predictions2=[]
predictions3=[]
predictions4=[]
predictions5=[]
predictions6=[]

k = 1  # 1-based number of the fold about to run

for train_index, test_index in loocv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model_1.fit(X_train, y_train)

    # The bare `model_2.fit()` passed no data and the model was never compiled.
    # COCOMO is rebuilt per fold because a Keras model keeps its weights between
    # fit() calls, which would carry over training on rows held out in later
    # folds. The training settings mirror the scaled LOOCV run.
    model_2 = COCOMO()
    model_2.compile(optimizer='adam', loss='mae')
    model_2.fit(X_train, y_train, epochs=500, batch_size=16, verbose=0)

    model_3.fit(X_train, y_train)
    model_4.fit(X_train, y_train)
    model_5.fit(X_train, y_train)
    model_6.fit(X_train, y_train)

    y_pred_1 = model_1.predict(X_test)
    y_pred_2 = model_2.predict(X_test)
    y_pred_3 = model_3.predict(X_test)
    y_pred_4 = model_4.predict(X_test)
    y_pred_5 = model_5.predict(X_test)
    y_pred_6 = model_6.predict(X_test)

    y_pred_combined_1 = median_combination(y_pred_1, y_pred_2, y_pred_3)
    y_pred_combined_2 = median_combination(y_pred_1, y_pred_2, y_pred_4)
    y_pred_combined_3 = median_combination(y_pred_1, y_pred_2, y_pred_5)
    y_pred_combined_4 = median_combination(y_pred_1, y_pred_2, y_pred_6)

    combined_predictions_1.extend(y_pred_combined_1)
    combined_predictions_2.extend(y_pred_combined_2)
    combined_predictions_3.extend(y_pred_combined_3)
    combined_predictions_4.extend(y_pred_combined_4)

    actuals.extend(y_test)

    predictions1.extend(y_pred_1)
    predictions2.extend(y_pred_2)
    predictions3.extend(y_pred_3)
    predictions4.extend(y_pred_4)
    predictions5.extend(y_pred_5)
    predictions6.extend(y_pred_6)

    k = k + 1

ensemble_predictions = [combined_predictions_1, combined_predictions_2, combined_predictions_3, combined_predictions_4]
individual_predictions = [predictions1, predictions2, predictions3, predictions4, predictions5, predictions6]

ensemble_mae, ensemble_pred25, ensemble_mmre = calculate_metrics(actuals, ensemble_predictions)
individual_mae, individual_pred25, individual_mmre = calculate_metrics(actuals, individual_predictions)

# Best-member MMRE of each ensemble's three constituent models.
bmmre = []
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions3]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions4]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions5]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions6]))

ensemble_df, individual_df = display_results(ensemble_models, individual_models, ensemble_mae, ensemble_pred25, ensemble_mmre,individual_mae, individual_pred25,individual_mmre,bmmre)

save_to_excel(file_name,  "loocv-unscaled-median", ensemble_df, individual_df)

**K-Fold - unscaled - median**

In [None]:
from sklearn.model_selection import KFold

# 3-fold evaluation on the raw features; ensembles use the median rule.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

predictions = []
actuals = []

# Out-of-fold predictions of the four ensembles (CBR + COCOMO + one more learner).
combined_predictions_1=[]
combined_predictions_2=[]
combined_predictions_3=[]
combined_predictions_4=[]

model_1 = CBR()
model_3 = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, learning_rate_init=0.01, activation='relu', solver='adam', random_state=42)
model_4 = KNeighborsRegressor(n_neighbors=5)
model_5 = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model_6 = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)

# Out-of-fold predictions of the six individual models.
predictions1=[]
predictions2=[]
predictions3=[]
predictions4=[]
predictions5=[]
predictions6=[]

k = 1  # 1-based number of the fold about to run

for train_index, test_index in kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model_1.fit(X_train, y_train)

    # The bare `model_2.fit()` passed no data and the model was never compiled.
    # COCOMO is rebuilt per fold because a Keras model keeps its weights between
    # fit() calls, which would carry over training on rows held out in later
    # folds. The training settings mirror the scaled LOOCV run.
    model_2 = COCOMO()
    model_2.compile(optimizer='adam', loss='mae')
    model_2.fit(X_train, y_train, epochs=500, batch_size=16, verbose=0)

    model_3.fit(X_train, y_train)
    model_4.fit(X_train, y_train)
    model_5.fit(X_train, y_train)
    model_6.fit(X_train, y_train)

    y_pred_1 = model_1.predict(X_test)
    y_pred_2 = model_2.predict(X_test)
    y_pred_3 = model_3.predict(X_test)
    y_pred_4 = model_4.predict(X_test)
    y_pred_5 = model_5.predict(X_test)
    y_pred_6 = model_6.predict(X_test)

    y_pred_combined_1 = median_combination(y_pred_1, y_pred_2, y_pred_3)
    y_pred_combined_2 = median_combination(y_pred_1, y_pred_2, y_pred_4)
    y_pred_combined_3 = median_combination(y_pred_1, y_pred_2, y_pred_5)
    y_pred_combined_4 = median_combination(y_pred_1, y_pred_2, y_pred_6)

    combined_predictions_1.extend(y_pred_combined_1)
    combined_predictions_2.extend(y_pred_combined_2)
    combined_predictions_3.extend(y_pred_combined_3)
    combined_predictions_4.extend(y_pred_combined_4)

    actuals.extend(y_test)

    predictions1.extend(y_pred_1)
    predictions2.extend(y_pred_2)
    predictions3.extend(y_pred_3)
    predictions4.extend(y_pred_4)
    predictions5.extend(y_pred_5)
    predictions6.extend(y_pred_6)

    k = k + 1

ensemble_predictions = [combined_predictions_1, combined_predictions_2, combined_predictions_3, combined_predictions_4]
individual_predictions = [predictions1, predictions2, predictions3, predictions4, predictions5, predictions6]

ensemble_mae, ensemble_pred25, ensemble_mmre = calculate_metrics(actuals, ensemble_predictions)
individual_mae, individual_pred25, individual_mmre = calculate_metrics(actuals, individual_predictions)

# Best-member MMRE of each ensemble's three constituent models.
bmmre = []
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions3]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions4]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions5]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions6]))

ensemble_df, individual_df = display_results(ensemble_models, individual_models, ensemble_mae, ensemble_pred25, ensemble_mmre,individual_mae, individual_pred25,individual_mmre,bmmre)

save_to_excel(file_name,"kfold-unscaled-median", ensemble_df, individual_df)

**K-Fold - unscaled - linear combination**

In [None]:
from sklearn.model_selection import KFold

# 5-fold evaluation on the raw features; ensembles use a fixed-weight linear rule.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

predictions = []
actuals = []

# Out-of-fold predictions of the four ensembles (CBR + COCOMO + one more learner).
combined_predictions_1=[]
combined_predictions_2=[]
combined_predictions_3=[]
combined_predictions_4=[]

model_1 = CBR()
model_3 = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, learning_rate_init=0.01, activation='relu', solver='adam', random_state=42)
model_4 = KNeighborsRegressor(n_neighbors=5)
model_5 = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model_6 = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)

# Out-of-fold predictions of the six individual models.
predictions1=[]
predictions2=[]
predictions3=[]
predictions4=[]
predictions5=[]
predictions6=[]

k = 1  # 1-based number of the fold about to run

for train_index, test_index in kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model_1.fit(X_train, y_train)

    # The bare `model_2.fit()` passed no data and the model was never compiled.
    # COCOMO is rebuilt per fold because a Keras model keeps its weights between
    # fit() calls, which would carry over training on rows held out in later
    # folds. The training settings mirror the scaled LOOCV run.
    model_2 = COCOMO()
    model_2.compile(optimizer='adam', loss='mae')
    model_2.fit(X_train, y_train, epochs=500, batch_size=16, verbose=0)

    model_3.fit(X_train, y_train)
    model_4.fit(X_train, y_train)
    model_5.fit(X_train, y_train)
    model_6.fit(X_train, y_train)

    y_pred_1 = model_1.predict(X_test)
    y_pred_2 = model_2.predict(X_test)
    y_pred_3 = model_3.predict(X_test)
    y_pred_4 = model_4.predict(X_test)
    y_pred_5 = model_5.predict(X_test)
    y_pred_6 = model_6.predict(X_test)

    # Fixed ensemble weights: 0.4 for CBR, 0.3 for COCOMO, 0.3 for the third learner.
    y_pred_combined_1 = linear_combination(y_pred_1, y_pred_2, y_pred_3,[0.4,0.3,0.3])
    y_pred_combined_2 = linear_combination(y_pred_1, y_pred_2, y_pred_4,[0.4,0.3,0.3])
    y_pred_combined_3 = linear_combination(y_pred_1, y_pred_2, y_pred_5,[0.4,0.3,0.3])
    y_pred_combined_4 = linear_combination(y_pred_1, y_pred_2, y_pred_6,[0.4,0.3,0.3])

    combined_predictions_1.extend(y_pred_combined_1)
    combined_predictions_2.extend(y_pred_combined_2)
    combined_predictions_3.extend(y_pred_combined_3)
    combined_predictions_4.extend(y_pred_combined_4)

    actuals.extend(y_test)

    predictions1.extend(y_pred_1)
    predictions2.extend(y_pred_2)
    predictions3.extend(y_pred_3)
    predictions4.extend(y_pred_4)
    predictions5.extend(y_pred_5)
    predictions6.extend(y_pred_6)

    k = k + 1

ensemble_predictions = [combined_predictions_1, combined_predictions_2, combined_predictions_3, combined_predictions_4]
individual_predictions = [predictions1, predictions2, predictions3, predictions4, predictions5, predictions6]

ensemble_mae, ensemble_pred25, ensemble_mmre = calculate_metrics(actuals, ensemble_predictions)
individual_mae, individual_pred25, individual_mmre = calculate_metrics(actuals, individual_predictions)

# Best-member MMRE of each ensemble's three constituent models.
bmmre = []
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions3]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions4]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions5]))
bmmre.append(calculate_bmmre(actuals, [predictions1,predictions2,predictions6]))

ensemble_df, individual_df = display_results(ensemble_models, individual_models, ensemble_mae, ensemble_pred25, ensemble_mmre,individual_mae, individual_pred25,individual_mmre,bmmre)

save_to_excel(file_name,"kfold-unscaled-linear", ensemble_df, individual_df)