In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [33]:
# Read the raw project records from CSV, then preview the first five rows
df = pd.read_csv(filepath_or_buffer="saudi_projects_v02.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,sectors,sector_budgets,type_project,budget_project,startday_project,start_year,start_month,enddate_project,end_year,end_month,duration_project,project_area,region_project,status_project
0,0,Commercial,1058790791316,Commercial,335000000.0,2019-07-03,2019.0,7.0,2021-12-31,2021.0,12.0,912.0,7752.0,ALDAMMAM,Under the construction
1,1,Commercial,1058790791316,"Commercial, Residential",,2010-01-01,2010.0,1.0,,,,,8000.0,RIYADH,Under the construction
2,2,Commercial,1058790791316,"Commercial, Residential",1178000000.0,2009-01-01,2009.0,1.0,2014-12-31,2014.0,12.0,2190.0,19500.0,RIYADH,Complete
3,3,Commercial,1058790791316,Commercial,,,,,,,,,19888.0,ALHASSA,Announced
4,4,Commercial,1058790791316,"Hotel, Commercial",120000000.0,2019-06-01,2019.0,6.0,2022-03-31,2022.0,3.0,1034.0,3651.0,JEDDAH,Under the construction


In [34]:
# Summarise dtypes and non-null counts per column to see where data is missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2179 entries, 0 to 2178
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        2179 non-null   int64  
 1   sectors           2179 non-null   object 
 2   sector_budgets    2179 non-null   int64  
 3   type_project      2179 non-null   object 
 4   budget_project    847 non-null    float64
 5   startday_project  1185 non-null   object 
 6   start_year        1185 non-null   float64
 7   start_month       1185 non-null   float64
 8   enddate_project   921 non-null    object 
 9   end_year          921 non-null    float64
 10  end_month         921 non-null    float64
 11  duration_project  759 non-null    float64
 12  project_area      1430 non-null   float64
 13  region_project    2178 non-null   object 
 14  status_project    2179 non-null   object 
dtypes: float64(7), int64(2), object(6)
memory usage: 255.5+ KB


In [35]:
# budget_project is the prediction target, so rows without it cannot be used
# for training or evaluation; drop them before the train/test split.
#
# The filter and the column drop are chained instead of calling
# drop(..., inplace=True) on a filtered frame, which can trigger pandas'
# SettingWithCopyWarning. errors='ignore' keeps the cell re-runnable: on a
# second execution 'Unnamed: 0' is already gone and a plain drop would raise
# KeyError.
df = (
    df
    .dropna(subset=['budget_project'])
    .drop(columns='Unnamed: 0', errors='ignore')
)

In [36]:
# Re-inspect dtypes and non-null counts after dropping rows with a missing target
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 847 entries, 0 to 2176
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sectors           847 non-null    object 
 1   sector_budgets    847 non-null    int64  
 2   type_project      847 non-null    object 
 3   budget_project    847 non-null    float64
 4   startday_project  647 non-null    object 
 5   start_year        647 non-null    float64
 6   start_month       647 non-null    float64
 7   enddate_project   540 non-null    object 
 8   end_year          540 non-null    float64
 9   end_month         540 non-null    float64
 10  duration_project  483 non-null    float64
 11  project_area      569 non-null    float64
 12  region_project    847 non-null    object 
 13  status_project    847 non-null    object 
dtypes: float64(7), int64(1), object(6)
memory usage: 99.3+ KB


In [37]:
# Column the models will learn to predict
target = "budget_project"

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
train, test = train_test_split(df, train_size=0.8, test_size=0.2, random_state=9000)

# Separate the feature matrix from the target column in each partition
X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

In [38]:
# Column names routed to the numeric and the categorical preprocessing branches.
# select_dtypes picks columns by dtype directly; describe() would compute full
# summary statistics only to read the column names, and it falls back to
# describing every column when the frame has no numeric column at all.
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(exclude="number").columns
# NOTE(review): the date-string columns (startday_project, enddate_project) are
# object dtype, so they land in categorical_features and get one-hot encoded —
# confirm that is intended.

### Pipeline with Linear Regression

In [39]:
# Preprocessing + LinearRegression baseline, scored by MAE on the held-out set

# Numeric branch: fill gaps from the 10 nearest neighbours, then standardise
numeric_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=10)),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories unseen during fit are ignored
categorical_transformer = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by ordinary least squares
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', LinearRegression()),
])
pipe.fit(X_train, y_train)

# Mean absolute error on the test partition (same units as budget_project)
preds = pipe.predict(X_test)
mean_absolute_error(y_test, preds)

5263010274.399482

### Pipeline with Random Forest Regressor

In [40]:
# Preprocessing + RandomForestRegressor, scored by MAE on the held-out set

# Numeric columns: impute missing values from the 10 nearest neighbours, then scale
numeric_transformer = Pipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=10)),
        ('scaler', StandardScaler())
    ]
)

# Categorical columns: one-hot encode; categories unseen during fit are ignored
categorical_transformer = Pipeline(
    steps=[
        ('one_hot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Pipeline with Random Forest Regressor.
# The split criterion is left at its default (squared error). The explicit
# criterion='mse' spelling was deprecated in scikit-learn 1.0 and removed in
# 1.2, where it raises an error; relying on the default behaves the same on
# both old and new releases.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('reg', RandomForestRegressor(n_estimators=10, random_state=0))
    ]
)

pipe.fit(X_train, y_train)

# Mean absolute error on the test partition (same units as budget_project)
preds = pipe.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

2385083736.346667

### Pipeline with SVR

In [41]:
# Preprocessing + linear-kernel SVR, scored by MAE on the held-out set

# Numeric branch: KNN imputation (k=10) followed by standardisation
numeric_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=10)),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding that ignores categories unseen at fit time
categorical_transformer = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own set of columns
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a support vector regressor
# NOTE(review): SVR is fit on the raw, unscaled target with default C/epsilon;
# given budgets in the hundreds of millions and above this may underfit —
# consider scaling the target and confirm.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', SVR(kernel='linear')),
])
pipe.fit(X_train, y_train)

# Mean absolute error on the test partition (same units as budget_project)
preds = pipe.predict(X_test)
mean_absolute_error(y_test, preds)

5192051953.623516