## Gradientboosting regressor : 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (9,5)
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

In [2]:
df=pd.read_csv("car_dataset.csv")
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [5]:
df.isnull().sum().any()

False

In [6]:
df["vehicle_age"]=2021-df.Year

In [7]:
df.drop(columns=["Year"], inplace=True)

### Train test split and Ordinalencoder : 

In [8]:
X=df.drop("Selling_Price", axis=1)
y=df.Selling_Price

from sklearn.preprocessing import OrdinalEncoder

cat = X.select_dtypes("object").columns
enc = OrdinalEncoder()
X[cat] = enc.fit_transform(X[cat])

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=5)

### Default Modeling : 

In [9]:
from sklearn.ensemble import GradientBoostingRegressor

gr_model = GradientBoostingRegressor(random_state = 101)
gr_model.fit(X_train, y_train)

GradientBoostingRegressor(random_state=101)

### Metrics : 

In [10]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def train_val(model, X_train, y_train, X_test, y_test):
    
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    
    scores = {"train": {"R2" : r2_score(y_train, y_train_pred),
    "mae" : mean_absolute_error(y_train, y_train_pred),
    "mse" : mean_squared_error(y_train, y_train_pred),                          
    "rmse" : np.sqrt(mean_squared_error(y_train, y_train_pred))},
    
    "test": {"R2" : r2_score(y_test, y_pred),
    "mae" : mean_absolute_error(y_test, y_pred),
    "mse" : mean_squared_error(y_test, y_pred),
    "rmse" : np.sqrt(mean_squared_error(y_test, y_pred))}}
    
    return pd.DataFrame(scores)

In [11]:
train_val(gr_model, X_train, y_train, X_test, y_test)

Unnamed: 0,train,test
R2,0.996501,0.970278
mae,0.217741,0.533569
mse,0.088991,0.79777
rmse,0.298314,0.89318


### Cross Validate : 

In [13]:
from sklearn.model_selection import cross_validate

model = GradientBoostingRegressor(random_state=101)
scores = cross_validate(model, X_train, y_train, scoring=['r2', 
            'neg_mean_absolute_error','neg_mean_squared_error','neg_root_mean_squared_error'], cv =10)
pd.DataFrame(scores)
pd.DataFrame(scores).mean()[2:]

test_r2                             0.890442
test_neg_mean_absolute_error       -0.751150
test_neg_mean_squared_error        -3.348033
test_neg_root_mean_squared_error   -1.552812
dtype: float64

### Gradient Boosting Regressor : 

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators":[100, 200, 300], "subsample":[0.8, 1], "max_features" : [None, 2, 3, 4],
            "learning_rate": [0.001, 0.01, 0.1, 0.5], 'max_depth':[2,3]}

model = GradientBoostingRegressor(random_state=101)

grid_model = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          scoring='neg_root_mean_squared_error',
                          cv=10,
                          n_jobs = -1)

grid_model.fit(X_train, y_train)


GridSearchCV(cv=10, estimator=GradientBoostingRegressor(random_state=101),
             n_jobs=-1,
             param_grid={'learning_rate': [0.001, 0.01, 0.1, 0.5],
                         'max_depth': [2, 3], 'max_features': [None, 2, 3, 4],
                         'n_estimators': [100, 200, 300],
                         'subsample': [0.8, 1]},
             scoring='neg_root_mean_squared_error')

In [15]:
grid_model.best_params_

{'learning_rate': 0.1,
 'max_depth': 2,
 'max_features': 4,
 'n_estimators': 300,
 'subsample': 0.8}

In [16]:
train_val(grid_model, X_train, y_train, X_test, y_test)

Unnamed: 0,train,test
R2,0.99628,0.95419
mae,0.228673,0.58534
mse,0.09462,1.22957
rmse,0.307604,1.10886


In [17]:
train_val(gr_model, X_train, y_train, X_test, y_test)

Unnamed: 0,train,test
R2,0.996501,0.970278
mae,0.217741,0.533569
mse,0.088991,0.79777
rmse,0.298314,0.89318
