# Installing dependencies

In [None]:
!pip install boruta

# Importing libraries and frameworks

In [2]:
# General python libraries to arrange dataset and plot graphs
import pandas as pd
import numpy as np
from numpy import cov
from numpy import loadtxt
import matplotlib.pyplot as plt
from matplotlib import pyplot

# Data pre-processing and splitting
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Downloading images
from google.colab import files

# Measure of accuracy
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score
from sklearn.preprocessing import scale 

# Importing Boruta for feature selection
from boruta import BorutaPy

# Importing the Random Forest libraries for regression and feature selection
from sklearn.ensemble import RandomForestRegressor

# Importing the LASSO libraries for regression
from sklearn.linear_model import Lasso, LassoCV

# ANN for regression
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

# Importing the dataset

In [3]:
# Load the dataset CSV from the Colab sample_data folder into a DataFrame
df = pd.read_csv(filepath_or_buffer="/content/sample_data/dataset.csv")

# Pre-processing

In [None]:
# Drop the leading 'Name' column by keeping columns 1..16 only.
# .copy() gives an independent frame so the in-place edits below do not act
# on a slice of the originally loaded data.
df = df[df.columns[1:17]].copy()

# Summary statistics and column types (describe must be *called* to get the
# statistics; printing the bare attribute only shows the bound method).
print(df.describe())
print(df.dtypes)

# Encode the categorical 'Size' column as numbers with ordinal encoding:
# each distinct category is mapped to one numeric code in a single column
# (this is not one-hot encoding, which would create one column per category).
obj_df = df.select_dtypes(include=['object']).copy()
ord_enc = OrdinalEncoder()
obj_df["SizeNumerical"] = ord_enc.fit_transform(obj_df[["Size"]]).ravel()

# Show the category -> code mapping for the first rows. A bare expression in
# the middle of a cell is discarded, so it has to be printed explicitly.
print(obj_df[["Size", "SizeNumerical"]].head(11))

# Remove the original string column from both frames
del obj_df['Size']
del df['Size']

# Re-insert the encoded values under the original name at position 1.
# Pass the single encoded column explicitly: handing over the whole obj_df
# frame only works while it happens to contain exactly one column.
df.insert(loc=1, column='Size', value=obj_df['SizeNumerical'])

# Correlation

In [None]:
# Heat-map of the pairwise correlation (pandas default method) between the
# conditional variables and Exe-time
corr = df.corr()
fig, ax = plt.subplots()
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)

# One tick per column on each axis, labelled with the column names.
# The x labels are rotated by 90 degrees before the names are assigned.
ticks = np.arange(len(df.columns))
ax.set_yticks(ticks)
ax.set_yticklabels(df.columns)
ax.set_xticks(ticks)
plt.xticks(rotation=90)
ax.set_xticklabels(df.columns)

# Optional export of the figure (disabled): uncomment to save and download it
# plt.savefig("Correlation.png",bbox_inches = 'tight')
# files.download("Correlation.png")
plt.show()

# Spearman rank correlation evaluates a monotonic relationship between two
# variables (continuous or ordinal). It is computed on the ranks of the
# values of each variable rather than on the raw data.
corrrelation = df.corr(method="spearman")
print("Spearman rank correlation:")
print(corrrelation)


# Conditional and output variables

In [6]:
# X: the conditional variables, i.e. the first 15 columns of the data frame
X = df.iloc[:, :15]

# Y: the output variable 'Exe-time', kept as a single-column DataFrame.
# Being able to predict the execution time is what makes it possible to
# predict the speedup.
Y = df[['Exe-time']]

# Feature Selection

In [None]:
# Fit a random forest on every conditional variable to score feature importance
model = RandomForestRegressor().fit(X, Y['Exe-time'])

# Horizontal bar chart of the five features with the highest importance
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
feat_importances.nlargest(n=5).plot.barh()

In [None]:
# Boruta feature selection built around the random forest estimator `model`
feat_selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=1)

# Find all relevant features. Both the features and the target are handed
# over as plain NumPy arrays (the target as a 1-D vector).
feat_selector.fit(X.values, Y['Exe-time'].values)

# Filter X down to the columns Boruta selected
X_filtered = feat_selector.transform(X.values)

# Pair every feature name with its rank (feat_selector.ranking_) and its
# keep/reject decision (feat_selector.support_). The names come from X, the
# frame Boruta was actually fitted on, so the labels cannot drift out of
# step with the results or pick up the target column.
feature_ranks = list(zip(X.columns,
                         feat_selector.ranking_,
                         feat_selector.support_))

# Print one line per feature
for name, rank, keep in feature_ranks:
    print(f'Feature: {name:<25} Rank: {rank},  Keep: {keep}')

# Testing and training data

In [9]:
# Hold out 40% of the rows for testing and train on the remaining 60%.
# The fixed seed is passed as random_state for the shuffle.
seed = 5
test_size = 0.4

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Regression

In [None]:
# Random forest regression
model = RandomForestRegressor()

# The target is flattened to a 1-D vector before fitting: scikit-learn
# regressors expect y of shape (n_samples,) and emit a DataConversionWarning
# when given a single-column 2-D target. np.ravel accepts both a DataFrame
# and an ndarray.
model.fit(X_train, np.ravel(y_train))

# Mean absolute error on the held-out test data.
# MAE is the average of the absolute differences between the predicted and
# the actual values, expressed in the same units as the target.
predicted_exe_time = model.predict(X_test)
MAE = mae(y_test, predicted_exe_time)
print('Random forest validation MAE: ', MAE)

In [None]:
# LASSO Regression
#
# The `normalize=True` argument of Lasso/LassoCV was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# Feature scaling is therefore done explicitly with a StandardScaler step in
# a pipeline. The scaling differs from the old `normalize` by a constant
# factor, which is absorbed by alpha because alpha is chosen by
# cross-validation below.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Linear models expect a 1-D target; np.ravel flattens a single-column
# DataFrame or 2-D array.
y_train_1d = np.ravel(y_train)

# Select alpha with 10-fold cross-validated LASSO.
# The pipeline fits its final step in place, so `lassocv.alpha_` is available
# afterwards.
lassocv = LassoCV(cv=10, max_iter=100000)
lassocv_pipeline = make_pipeline(StandardScaler(), lassocv)
lassocv_pipeline.fit(X_train, y_train_1d)

# Fit the final LASSO model with the selected alpha. `lasso` is a pipeline,
# so it scales raw inputs itself when predicting.
lasso = make_pipeline(StandardScaler(), Lasso(alpha=lassocv.alpha_, max_iter=10000))
lasso.fit(X_train, y_train_1d)

# Predict once and reuse the result for both metrics
lasso_predictions = lasso.predict(X_test)

# Mean absolute error of the LASSO regression on the test data
print("LASSO Validation Mean absolute error", mae(y_test, lasso_predictions))

# R2 score: a statistical measure of how close the data are to the fitted
# regression line. The higher the R-squared, the better the model fits.
R2score = r2_score(y_test, lasso_predictions)
print("R2 score using LASSO Regression", R2score)

In [12]:
# Artificial Neural Network: data preparation

# Split the raw data first. The scalers below are then fitted on the training
# rows only, so no information about the test rows (their min/max) leaks into
# training.
seed = 5
test_size = 0.33
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# Min-max scaling to improve training: it keeps the shape of each column's
# distribution and maps the training range of every column onto [0, 1]
# (test values may fall slightly outside that range).
# Separate scalers are kept for the inputs and the target so that both
# transforms stay available, e.g. to scale new inputs or to convert
# predictions back to the original units with y_scaler.inverse_transform.
x_scaler = MinMaxScaler().fit(X_train_raw)
y_scaler = MinMaxScaler().fit(y_train_raw)
scaler = y_scaler  # original name kept; it previously ended up fitted on Y

# Scaled training and testing data for the Artificial Neural Network
X_train = x_scaler.transform(X_train_raw)
X_test = x_scaler.transform(X_test_raw)
y_train = y_scaler.transform(y_train_raw)
y_test = y_scaler.transform(y_test_raw)

# Scaled versions of the complete dataset, kept under their original names
X_Neural = x_scaler.transform(X)
Y_Neural = y_scaler.transform(Y)

In [None]:
# Fully connected regression network with layer widths 15 -> 10 -> 8 -> 8 -> 1.
# Every layer uses the 'normal' kernel initializer. The first four layers use
# ReLU activations and the single output unit is linear.
NN_model = Sequential([
    # First dense layer; it also declares the number of input features
    Dense(15, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'),
    # Further ReLU layers
    Dense(10, activation='relu', kernel_initializer='normal'),
    Dense(8, activation='relu', kernel_initializer='normal'),
    Dense(8, activation='relu', kernel_initializer='normal'),
    # Output layer: one linear unit producing the regression value
    Dense(1, activation='linear', kernel_initializer='normal'),
])

# Compile with mean absolute error as both the loss and the reported metric
NN_model.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)
NN_model.summary()

In [None]:
# Train the network for 200 epochs in batches of 32, with 20% of the training
# data set aside for validation. The checkpoint callback monitors val_loss and
# only saves when it is the best seen so far; the epoch number and val_loss
# are embedded in the file name.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]
NN_model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=200,
    validation_split=0.2,
    callbacks=callbacks_list,
)

In [None]:
# Predict the execution time for the test rows with the Artificial Neural Network
predictions = NN_model.predict(X_test)

# Error metrics of the network on the test data
print("Neural Network Mean Absolute Error: " + str(mae(y_test, predictions)))

# R2 of the ANN predictions. Print `score`, the value computed here, and not
# `R2score`, which holds the LASSO result from an earlier cell.
score = r2_score(y_test, predictions)
print("R2 score using ANN", score)