# GRADIENT BOOSTING REGRESSOR ALGORITHEM

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:

df=pd.read_csv('DATASET.csv')
df.head()


Unnamed: 0,Age,Credit_Score,Account_Balence,Credit_Card_Holter,Dependents,Employment_Type,Applicant_Income,Coapplicant_Income,Loan_Status,Education_Status,Fidelity_Scores
0,86,451,825749,1,1,Employed,128630,81915,0,Graduated,51
1,64,465,140667,0,0,Employed,75633,78479,1,Graduated,59
2,65,409,716824,0,2,Self_Employed,178202,174249,0,Non-Graduated,99
3,57,446,323539,0,2,Employed,180802,96209,1,Non-Graduated,28
4,67,475,973045,1,2,Employed,80110,25057,1,Non-Graduated,23


In [3]:

df.tail()


Unnamed: 0,Age,Credit_Score,Account_Balence,Credit_Card_Holter,Dependents,Employment_Type,Applicant_Income,Coapplicant_Income,Loan_Status,Education_Status,Fidelity_Scores
4994,38,445,560452,0,2,Self_Employed,37339,124773,1,Graduated,83
4995,26,453,471969,1,2,Employed,143643,96357,0,Graduated,95
4996,77,495,359288,1,2,Employed,10643,97925,1,Graduated,29
4997,59,447,504988,1,1,Employed,27886,101742,0,Graduated,53
4998,31,436,971961,1,1,Self_Employed,21162,22299,0,Non-Graduated,24


In [4]:

df.columns


Index(['Age', 'Credit_Score', 'Account_Balence', 'Credit_Card_Holter',
       'Dependents', 'Employment_Type', 'Applicant_Income',
       'Coapplicant_Income', 'Loan_Status', 'Education_Status',
       'Fidelity_Scores'],
      dtype='object')

In [5]:

X = df.drop(labels='Fidelity_Scores', axis=1)
y = df.loc[:,'Fidelity_Scores']


In [6]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("NUMBER OF TRAIN DATASET    : ", len(X_train))
print("NUMBER OF TEST DATASET      : ", len(X_test))
print("TOTAL NUMBER OF DATASET    : ", len(X_train)+len(X_test))


NUMBER OF TRAIN DATASET    :  3999
NUMBER OF TEST DATASET      :  1000
TOTAL NUMBER OF DATASET    :  4999


In [7]:

print("NUMBER OF TRAIN DATASET    : ", len(y_train))
print("NUMBER OF TEST DATASET      : ", len(y_test))
print("TOTAL NUMBER OF DATASET    : ", len(y_train)+len(y_test))


NUMBER OF TRAIN DATASET    :  3999
NUMBER OF TEST DATASET      :  1000
TOTAL NUMBER OF DATASET    :  4999


In [8]:

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


In [9]:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [10]:

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])


In [11]:

from sklearn.preprocessing import OneHotEncoder


In [12]:

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [13]:

from sklearn.compose import ColumnTransformer


In [14]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [15]:

from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor

In [16]:

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor())])


In [17]:

param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 4, 5],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__subsample': [0.6, 0.8, 1.0]
}


In [18]:

from sklearn.model_selection import GridSearchCV


In [19]:

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train) 


Fitting 5 folds for each of 729 candidates, totalling 3645 fits


In [20]:

print("Best parameters:", grid_search.best_params_)


Best parameters: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 4, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100, 'regressor__subsample': 0.6}


In [21]:

best_pipeline = grid_search.best_estimator_


In [22]:

predicted = best_pipeline.predict(X_test)


In [23]:

from sklearn.metrics import rand_score
RS = rand_score(y_test, predicted)
print("MEAN RAND SCOORE OF GRADIENT BOOSTING REGRESSOR ALGORITHEM",RS*100)


MEAN RAND SCOORE OF GRADIENT BOOSTING REGRESSOR ALGORITHEM 98.75095095095095




In [24]:

from sklearn.metrics import r2_score
R2 = r2_score(y_test, predicted)
print("MEAN R2 SCOORE OF GRADIENT BOOSTING REGRESSOR ALGORITHEM",R2*100)


MEAN R2 SCOORE OF GRADIENT BOOSTING REGRESSOR ALGORITHEM -0.36966092349943747


In [25]:

from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test, predicted)
print("MEAN SQUARE ERROR OF GRADIENT BOOSTING REGRESSOR ALGORITHEM",MSE)


MEAN SQUARE ERROR OF GRADIENT BOOSTING REGRESSOR ALGORITHEM 528.2862313600618


In [26]:

from sklearn.metrics import mean_squared_error
RMSE = mean_squared_error(y_test, predicted, squared=False)
print(" ROOT MEAN SQUARE ERROR OF GRADIENT BOOSTING REGRESSOR ALGORITHEM",RMSE)


 ROOT MEAN SQUARE ERROR OF GRADIENT BOOSTING REGRESSOR ALGORITHEM 22.98447805280907


In [27]:

from sklearn.metrics import mean_absolute_error
MAE = mean_absolute_error(y_test, predicted)
print("MEAN ABSOLUTE ERROR OF GRADIENT BOOSTING REGRESSOR ALGORITHEM",MAE)


MEAN ABSOLUTE ERROR OF GRADIENT BOOSTING REGRESSOR ALGORITHEM 19.872419114826943


In [28]:

from sklearn.metrics import explained_variance_score
EVS = explained_variance_score(y_test, predicted)
print("EXPLAINED VARIANCE SCORE OF GRADIENT BOOSTING REGRESSOR ALGORITHEM",EVS)


EXPLAINED VARIANCE SCORE OF GRADIENT BOOSTING REGRESSOR ALGORITHEM -0.002033132373730151


In [30]:

import joblib
joblib.dump(best_pipeline, 'GB.pkl')


['GB.pkl']