In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split


# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder


# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import plot_confusion_matrix

In [2]:
# Load the dataset
df = pd.read_csv("saudi_projects_v02.csv")

In [3]:
# Handling missing values
df = df.drop(['Unnamed: 0'],axis=1)
cols = ['region_project']
df[cols] = df[cols].fillna(df.mode().iloc[0])
mean = ['project_area']
df[mean] = df[mean].fillna(df.mean().iloc[0])
df = df.dropna(subset=['budget_project'])
df = df[df['budget_project'] <= 86537642000]

In [4]:
# Split the dataset 

train, test = train_test_split(
    df,
    test_size=0.2,
    train_size = 0.8,
    random_state=9000
)

In [5]:
X_train = df.drop(['budget_project','end_month','end_year','enddate_project','start_month', 'start_year','startday_project','duration_project'],axis = 1)
y_train = df['budget_project']

X_test = df.drop(['budget_project','end_month','end_year','enddate_project','start_month', 'start_year','startday_project','duration_project'],axis = 1)
y_test = df['budget_project']

X_train.shape

(835, 6)

In [6]:
# Define a function to calculate the cost for each model
def reg_cost(method, actual, preds):
    mae = mean_absolute_error(y_true=actual, y_pred=preds)
    mse = mean_squared_error(y_true=actual, y_pred=preds)
    
    print(f'Cost functions for the {method} regression is:')
    print(f'Mean Square Error: {round(mse,2)}')
    print(f'Mean Absolute Error: {round(mae,2)}\n\n')

In [7]:
#ordinal encoder
ordinal = OrdinalEncoder()

col_names = ["sectors","type_project","region_project","status_project"]

## Ordinal encode the column
ordinal_ls = ordinal.fit_transform(X_train[col_names])
ordinal_ls_val = ordinal.transform(X_test[col_names])

In [8]:
ordinal.categories_

[array([' Charity', ' Commercial', ' Educational', ' Governmental',
        ' Health', ' Industrial', ' Residential', ' Scientific', ' Sports',
        ' Tourist'], dtype=object),
 array(['Charity', 'Charity, Health', 'Charity, Residential', 'Commercial',
        'Commercial, Educational',
        'Commercial, Educational, Charity, Residential, Health',
        'Commercial, Educational, Governmental, Charity, Residential, Health',
        'Commercial, Educational, Health',
        'Commercial, Educational, Residential', 'Commercial, Governmental',
        'Commercial, Health', 'Commercial, Industrial',
        'Commercial, Residential', 'Commercial, Tourist',
        'Commercial, Tourist, Governmental, Sports',
        'Commercial, Tourist, Residential', 'Educational',
        'Educational, Charity', 'Educational, Charity, Sports',
        'Educational, Governmental', 'Educational, Governmental, Health',
        'Educational, Governmental, Residential', 'Educational, Health',
        '

In [9]:
X_train[col_names] = ordinal_ls
X_test[col_names] = ordinal_ls_val

X_train.sample(40)

Unnamed: 0,sectors,sector_budgets,type_project,project_area,region_project,status_project
2107,4.0,549859453510,75.0,7500.0,44.0,4.0
1423,0.0,473524096855,40.0,7000.0,40.0,6.0
2052,4.0,549859453510,29.0,86272.0,33.0,3.0
1472,8.0,396629702256,14.0,350000.0,33.0,2.0
1113,3.0,1436611825301,29.0,1500000.0,0.0,3.0
1742,6.0,709916354518,85.0,919853200000.0,48.0,6.0
360,1.0,1058790791316,68.0,919853200000.0,33.0,4.0
470,1.0,1058790791316,12.0,5000.0,33.0,2.0
1082,3.0,1436611825301,77.0,1999999.0,33.0,3.0
333,1.0,1058790791316,3.0,919853200000.0,33.0,4.0


In [10]:

scaler = StandardScaler()

## Fit & transform data.
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [11]:
# Linear Regression
reg_lin = LinearRegression()
reg_lin.fit(X_train_sc, y_train)

preds_lin = reg_lin.predict(X_test_sc)
mean_absolute_error(y_true=y_test, y_pred=preds_lin)

4385687863.461437

In [12]:
# Decision Tree Regressor
reg_tree = DecisionTreeRegressor(random_state = 0, max_depth= 4, criterion= 'mse')
reg_tree.fit(X_train_sc, y_train)

preds_tree = reg_tree.predict(X_test_sc)

mean_absolute_error(y_true=y_test, y_pred=preds_tree)

2921435460.3583984

In [13]:
# Random Forest Regressor
reg_forest = RandomForestRegressor(n_estimators = 10, random_state = 0, criterion = 'mse')
reg_forest.fit(X_train_sc, y_train)

preds_forest = reg_forest.predict(X_test_sc)
mean_absolute_error(y_true=y_test, y_pred=preds_forest)

991374043.9535304

In [14]:
# SVR
reg_svr = SVR(kernel = 'linear')
reg_svr.fit(X_train_sc, y_train)

SVR(kernel='linear')

In [15]:
preds_svr = reg_svr.predict(X_test_sc)
mean_absolute_error(y_true=y_test, y_pred=preds_svr)

3085830004.047136

In [16]:
# XGB regressor
xgb_reg = xgb.XGBRegressor(objective = "reg:linear",
                           n_estimators = 75,
                           subsample = 0.75,
                           max_depth = 7)
xgb_reg.fit(X_train_sc, y_train)

preds_xgb = xgb_reg.predict(X_test_sc)
mean_absolute_error(y_true=y_test, y_pred=preds_xgb)



212824308.35

In [17]:
# Cost Function for all the models
model_name = ['linear', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGB regressor']
model_pred = [preds_lin, preds_tree, preds_forest, preds_svr,preds_xgb]

for x in range(len(model_pred)):
    
    reg_cost(model_name[x],y_test, model_pred[x] )

Cost functions for the linear regression is:
Mean Square Error: 7.757141647892858e+19
Mean Absolute Error: 4385687863.46


Cost functions for the Decision Tree regression is:
Mean Square Error: 4.435618145589347e+19
Mean Absolute Error: 2921435460.36


Cost functions for the Random Forest regression is:
Mean Square Error: 7.131113902566983e+18
Mean Absolute Error: 991374043.95


Cost functions for the Support Vector regression is:
Mean Square Error: 8.776748878653682e+19
Mean Absolute Error: 3085830004.05


Cost functions for the XGB regressor regression is:
Mean Square Error: 4.23246251058303e+17
Mean Absolute Error: 212824308.35




In [22]:
# Save the model
import pickle
filename = 'saudi_projects_regression.pkl'
pickle.dump(xgb_reg, open(filename, 'wb'))

In [23]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test_sc, y_test)
print(result)

0.994687824252392
