## Fit an XGBoost model and plot its feature importances

In [None]:
import xgboost as xgb

# Name of the label column. modelfit reads this as a module-level global, and
# it is excluded from the predictor list built from Train5's columns.
target = 'NRFI'

def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and draw a bar chart of its feature importances.

    Parameters
    ----------
    alg
        Estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``feature_importances_`` (the calls made below). It is
        mutated in place: ``n_estimators`` may be overwritten and the model is
        fitted.
    dtrain
        Table indexed by column name that holds both the predictor columns and
        the column named by the module-level ``target``.
    predictors
        Names of the columns of ``dtrain`` to train on.
    useTrainCV
        When true, run ``xgb.cv`` first and set ``n_estimators`` to the number
        of rows in the returned CV result.
    cv_folds
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        The function is used for its side effects (fitted ``alg`` and a plot).
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round kept by xgb.cv, so the row count is the
        # number of rounds to train the final model with.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target])

    # Label each importance with its predictor name; without an explicit index
    # the bars would only be labelled by position (0, 1, 2, ...).
    feat_imp = pd.Series(
        alg.feature_importances_, index=list(predictors)
    ).sort_values(ascending=False)

    # Draw on the figure created here rather than relying on the implicit
    # "current axes".
    fig, ax = plt.subplots()
    feat_imp.plot(kind='bar', title='Feature', ax=ax)

# Every column of Train5 except the target column is used as a predictor.
predictors = [column for column in Train5.columns if column != target]

# Baseline XGBoost regressor; n_estimators is only an upper bound here because
# modelfit replaces it with the round count found by cross-validation.
xgb1 = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)

# Tune the round count, fit on Train5 and plot the feature importances.
modelfit(xgb1, Train5, predictors)

## Cross-validated grid search to find the best hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Separate the NRFI label column from the remaining feature columns.
TrainLabel = Train5["NRFI"]
TrainFeatures = Train5.drop("NRFI", axis=1)

# Candidate hyper-parameter values for RandomForestRegressor.
# NOTE(review): 'max_depth' is a literal list of three candidates (10, 20, 2),
# not a range like the other two entries — confirm this is intended.
param_test1 = dict(
    n_estimators=range(140, 160, 2),
    min_samples_leaf=range(2, 6, 2),
    max_depth=[10, 20, 2],
)

# Exhaustive 5-fold search scored by negative RMSE. The constructor values
# below are placeholders: each of them is overridden by the grid.
gsearch1 = GridSearchCV(
    RandomForestRegressor(
        random_state=27,
        n_estimators=140,
        max_depth=5,
        min_samples_leaf=1,
    ),
    param_test1,
    cv=5,
    n_jobs=4,
    scoring='neg_root_mean_squared_error',
)
gsearch1.fit(TrainFeatures, TrainLabel)

# Full per-candidate results, then the winning parameters and their score.
print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_, sep="\n")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Separate the NRFI label column from the remaining feature columns.
TrainLabel = Train5["NRFI"]
TrainFeatures = Train5.drop("NRFI", axis=1)

# Candidate hyper-parameter values for GradientBoostingRegressor.
# NOTE(review): 'max_depth' is a literal list of three candidates (5, 15, 2),
# not a range like the other two entries — confirm this is intended.
param_test1 = dict(
    n_estimators=range(90, 115, 2),
    min_samples_leaf=range(2, 8, 2),
    max_depth=[5, 15, 2],
)

# Exhaustive 5-fold search scored by negative RMSE. The constructor values
# below are placeholders: each of them is overridden by the grid.
gsearch1 = GridSearchCV(
    GradientBoostingRegressor(
        random_state=27,
        n_estimators=140,
        max_depth=5,
        min_samples_leaf=1,
    ),
    param_test1,
    cv=5,
    n_jobs=4,
    scoring='neg_root_mean_squared_error',
)
gsearch1.fit(TrainFeatures, TrainLabel)

# Full per-candidate results, then the winning parameters and their score.
print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_, sep="\n")

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Separate the NRFI label column from the remaining feature columns.
TrainFeatures = Train5.drop(columns=["NRFI"])
TrainLabel = Train5["NRFI"]

# Candidate hyper-parameter values for XGBRegressor.
# NOTE(review): 'max_depth' is a literal list of three candidates (20, 60, 2),
# not a range like the first two entries — confirm this is intended.
param_test1 = {
    'n_estimators': range(40, 70, 2),
    'min_child_weight': range(18, 22, 2),
    'max_depth': [20, 60, 2],
    # Was [1, 5, 1]: the repeated 1 made the search fit and cross-validate
    # every combination with scale_pos_weight=1 twice for identical results.
    "scale_pos_weight": [1, 5],
}

# Exhaustive 5-fold search scored by negative RMSE. n_estimators, max_depth,
# min_child_weight and scale_pos_weight below are overridden by the grid; the
# remaining constructor values stay fixed for every candidate.
gsearch1 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:squarederror',
        nthread=4,
        scale_pos_weight=1,
        seed=27
    ),
    param_grid=param_test1,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=5
)

gsearch1.fit(TrainFeatures, TrainLabel)

# Full per-candidate results, then the winning parameters and their score.
print(gsearch1.cv_results_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

## Train Test Splits

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    TrainFeatures, TrainLabel, test_size=0.2, random_state=42
)

# Model training
# random_state pins the forest's internal sampling so the metrics printed
# below are repeatable from run to run.
rf_regressor = RandomForestRegressor(
    n_estimators=152, max_depth=15, min_samples_leaf=4, random_state=27
)
# Pass the labels as a plain NumPy array; to_numpy() replaces Series.ravel(),
# which pandas has deprecated.
rf_regressor.fit(X_train, y_train.to_numpy())

# Model prediction on the test set
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = mse ** 0.5

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Root Mean Squared Error: {rmse}")