## Loading Pre-processed Dataset and importing libraries 

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor ,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [49]:
df  = pd.read_csv(r"C:\Users\kajoo\Jupyter Projects\Final_regression_Dataset.csv") #asked cgat gpt to add more rows >_<

## Checking feature importance 
- Only using features with >= 0.01 correlation

In [51]:
numeric_cols = [col for col in df.select_dtypes(include=np.number).columns if col != 'Value_SAR']
correlations = df[numeric_cols + ['Value_SAR']].corr()['Value_SAR'].sort_values(key=abs, ascending=False)
print(correlations.head(30))


Value_SAR                      1.000000
Value_SAR_log                  0.411918
cat_mean_value                 0.241500
tfidf_49                       0.233057
cat_std                        0.219789
category_Operations            0.194177
tfidf_41                       0.188846
keyword_construction           0.186388
log_cat_mean                   0.184752
keyword_maintenance            0.183776
cat_median_value               0.169459
entity_mean_value              0.151701
entity_std                     0.141130
duration_x_cat_mean            0.139267
log_entity_mean                0.127273
tfidf_3                        0.105267
keyword_government            -0.102637
category_Supply               -0.100622
cat_freq                      -0.099091
tfidf_21                       0.095721
duration_x_mean_entity         0.085434
Entity_مؤسسة البريد السعودي    0.084226
keyword_finance               -0.083788
entity_freq_x_cat_freq        -0.080093
category_Engineering           0.078182


In [387]:
high_corr_cols = correlations[correlations.abs() >= 0.01].index.drop(["Value_SAR"])

## Definig X and y and spliting 
- using a 80/20 split

In [391]:
X = df[high_corr_cols]
y = df["Value_SAR"]

In [463]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear regression

In [465]:
pipe = Pipeline([#("norm",Normalizer()),
    #("poly",PolynomialFeatures(degree=2,include_bias=False)),
                 ("scale",StandardScaler()),
                 ("lr",LinearRegression())
                ])

In [467]:
pipe.fit(X_train, y_train)

In [469]:
print("Train R² score:", pipe.score(X_train, y_train))
print("Test  R² score:", pipe.score(X_test, y_test))

Train R² score: 0.3220296903205272
Test  R² score: 0.05479579759674891


### Lasso with CV

In [471]:
pipe2 = Pipeline([#("poly",PolynomialFeatures(degree=2,include_bias=False)),
                  ("ss",StandardScaler()),
                  ("lasso",LassoCV(cv=10))
                ])

In [473]:
pipe2.fit(X_train,y_train)

In [474]:
print("Train R² score:", pipe2.score(X_train, y_train))
print("Test  R² score:", pipe2.score(X_test, y_test))

Train R² score: 0.3166055453403386
Test  R² score: 0.08082705701722737


### Random forest

In [531]:
pipe3 = Pipeline([#("poly",PolynomialFeatures(degree=2,include_bias=False)),
                  ("ss",StandardScaler()),
                  ("rt",RandomForestRegressor(max_depth=4,random_state=42,n_estimators=100))
])

In [533]:
pipe3.fit(X_train,y_train)

In [534]:
print("Train R² score:", pipe3.score(X_train, y_train))
print("Test  R² score:", pipe3.score(X_test, y_test))


Train R² score: 0.9811754556407193
Test  R² score: 0.9846254013223924


### Gradient boosting 

In [483]:
pipe5 = Pipeline([
    ("ss",StandardScaler()),
    ("model",GradientBoostingRegressor())
])

In [485]:
pipe5.fit(X_train,y_train)

In [486]:
print("Train R² score:", pipe5.score(X_train, y_train))
print("Test  R² score:", pipe5.score(X_test, y_test))

Train R² score: 0.9999889909579566
Test  R² score: 0.9565052770435388


### Ridge with high regulaization

In [489]:
pipe6 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=9.0,fit_intercept=True))
])

In [491]:
pipe6.fit(X_train,y_train)

In [493]:
print("Train R² score:", pipe6.score(X_train, y_train))
print("Test  R² score:", pipe6.score(X_test, y_test))

Train R² score: 0.31777806841532763
Test  R² score: 0.0822254523168876


### Decision tree

In [495]:
pipe7 = Pipeline([("scale",StandardScaler()),
                 ("model",DecisionTreeRegressor(random_state=42,max_depth=5))])

In [497]:
pipe7.fit(X_train,y_train)

In [499]:
print("Train R² score:", pipe7.score(X_train, y_train))
print("Test  R² score:", pipe7.score(X_test, y_test))

Train R² score: 0.999319828047737
Test  R² score: 0.953976465704347
