# Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout

In [5]:
df = pd.read_csv('Final Dataset')

In [6]:
df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], inplace=True)

# Normalising the data

In [7]:
from sklearn.preprocessing import Normalizer
norm = Normalizer()
df = norm.fit_transform(df)
df = pd.DataFrame(df, columns= ['Budget Forecast', 'Total Budget Changes', 'Total Schedule Changes', 'Description Embedding'])
df.head()

Unnamed: 0,Budget Forecast,Total Budget Changes,Total Schedule Changes,Description Embedding
0,0.999906,0.013741,1.084248e-05,-1.644953e-10
1,1.0,-0.000717,0.0,-1.370162e-10
2,0.999979,0.006505,2.355953e-06,-3.448e-11
3,0.99993,0.011812,1.151342e-06,-2.733733e-11
4,0.970381,0.241581,5.114135e-07,-6.248411e-11


# Splitting the data into train n testing data

In [8]:
x = df.drop(columns = 'Budget Forecast')
y = df['Budget Forecast']

In [9]:
print(x.shape), print(y.shape)

(2846, 3)
(2846,)


(None, None)

In [10]:
y = np.reshape(y, (-1,1))
print(y.shape)

(2846, 1)


In [11]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(x,y ,
                                   random_state=104, 
                                   test_size=0.3, 
                                   shuffle=True)

In [12]:
#Test size is 30% 

In [115]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from xgboost import XGBRegressor

lr = LinearRegression()
knn = KNeighborsRegressor(n_neighbors=3)
dt = DecisionTreeRegressor(min_samples_leaf=3)
svr = SVR(degree=3, kernel='rbf', gamma='auto', epsilon=0.2)
rf = RandomForestRegressor()

xgb = XGBRegressor(n_estimators=45)
etr = ExtraTreesRegressor(n_estimators=50, random_state=2)

# Fitting training data into each model

In [118]:
# Using ML aglorithms:

#Linear Regression
#K Nearest neighbours
#Decision Tree
#Support Vector Machine
#Random Forest
#Artificial Neural Network
#XG Boost
# Extra tree regressor

In [109]:
# Metrics used:
# Mean Squared Error
# Mean absolute error
# R2 Score

In [99]:
models = [lr, knn, dt, svr, rf]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    print(f"MAE for {model} is {mean_absolute_error(y_train, y_pred)}")
    print(f"MSE for {model} is {mean_squared_error(y_train, y_pred)}")
    print(f"R2 Score for {model} is {r2_score(y_train, y_pred)}")
    
    
    print('\n')

MAE for LinearRegression() is 0.07075044495763338
MSE for LinearRegression() is 0.018410939555956924
R2 Score for LinearRegression() is 0.10490373681657006


MAE for KNeighborsRegressor(n_neighbors=3) is 0.0007480562762510897
MSE for KNeighborsRegressor(n_neighbors=3) is 0.00039755875746494895
R2 Score for KNeighborsRegressor(n_neighbors=3) is 0.9806716350829805


MAE for DecisionTreeRegressor(min_samples_leaf=3) is 0.001684915964363574
MSE for DecisionTreeRegressor(min_samples_leaf=3) is 0.000585035569200904
R2 Score for DecisionTreeRegressor(min_samples_leaf=3) is 0.9715569566545187


MAE for SVR(epsilon=0.2, gamma='auto') is 0.13959297969060694
MSE for SVR(epsilon=0.2, gamma='auto') is 0.020823585996455188
R2 Score for SVR(epsilon=0.2, gamma='auto') is -0.012393417231935322




  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


MAE for RandomForestRegressor() is 0.0004200532502029479
MSE for RandomForestRegressor() is 8.770420687834433e-05
R2 Score for RandomForestRegressor() is 0.9957360292448045




# Building dataframe to compare MAE, MSE, R2 SCORE of all models

In [116]:
models = [lr, knn, dt, svr, rf, xgb, etr]
data = []
algos = ['LR' , 'KNN', 'DT', 'SVR', 'RF', 'XGB', 'ETR']
i = 0

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    data.append([algos[i], mean_squared_error(y_train, y_pred), mean_absolute_error(y_train, y_pred), r2_score(y_train, y_pred)])
    i+=1

    
accuracy = pd.DataFrame(data, columns=['Algo', 'MSE', 'MAE', 'R2'])

  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [117]:
accuracy.sort_values(by='MAE')

Unnamed: 0,Algo,MSE,MAE,R2
6,ETR,3.287337e-18,4.235643e-10,1.0
5,XGB,5.883171e-07,0.0004247098,0.999971
4,RF,0.0001389581,0.0005088937,0.993244
1,KNN,0.0003975588,0.0007480563,0.980672
2,DT,0.0005850356,0.001684916,0.971557
0,LR,0.01841094,0.07075044,0.104904
3,SVR,0.02082359,0.139593,-0.012393


In [122]:
accuracy.sort_values(by='R2')

Unnamed: 0,Algo,MSE,MAE,R2
3,SVR,0.02082359,0.139593,-0.012393
0,LR,0.01841094,0.07075044,0.104904
2,DT,0.0005850356,0.001684916,0.971557
1,KNN,0.0003975588,0.0007480563,0.980672
4,RF,0.0001389581,0.0005088937,0.993244
5,XGB,5.883171e-07,0.0004247098,0.999971
6,ETR,3.287337e-18,4.235643e-10,1.0
