Decision Tree Regression
CART - Classification and Regression Tree

![image.png](attachment:image.png)

In [7]:
# Silence every warning category for the rest of the session so that
# cell outputs are not interleaved with warning text.
from warnings import filterwarnings

filterwarnings(action='ignore')

Step 1: Read the training dataset

In [8]:
import pandas as pd

# Read the startup dataset from a path relative to the working directory,
# then display the resulting frame.
startups_csv = "50_Startups (1).csv"
df = pd.read_csv(startups_csv)
df

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [9]:
# Show the first rows of the loaded frame (head() defaults to five).
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


Step 2: Perform basic data quality checks

In [10]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [11]:
# Count missing values per column.
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [12]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

Step 3: Separate X and Y (PROFIT)

In [13]:
# Features are every column except the target; the target is kept as a
# one-column DataFrame (list selection) rather than a Series.
target_columns = ["PROFIT"]
X = df.drop(columns=target_columns)
Y = df[target_columns]

In [14]:
# Preview the feature frame (PROFIT removed).
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [15]:
# Preview the target frame (PROFIT only).
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94



Step 4: Create preprocessing pipeline on X

In [17]:
# List the feature dtypes; the next cell uses them to separate
# object-typed columns from the rest.
X.dtypes

RND      float64
ADMIN    float64
MKT      float64
STATE     object
dtype: object

In [23]:
# Partition feature names by dtype: object-typed columns are treated as
# categorical (cat), everything else as continuous (con).
is_object_column = X.dtypes == 'object'
cat = X.columns[is_object_column].tolist()
con = X.columns[~is_object_column].tolist()

In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [25]:
# Numeric branch: fill gaps with the column mean, then standardise.
mean_imputer = SimpleImputer(strategy='mean')
standardizer = StandardScaler()
num_pipe = make_pipeline(mean_imputer, standardizer)

In [27]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored
# rather than raising.
mode_imputer = SimpleImputer(strategy='most_frequent')
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
cat_pipe = make_pipeline(mode_imputer, one_hot)

In [28]:
# Route continuous columns through num_pipe and categorical columns through
# cat_pipe; set_output makes the transformer return a pandas DataFrame.
column_routes = [
    ('num', num_pipe, con),
    ('cat', cat_pipe, cat),
]
pre = ColumnTransformer(column_routes)
pre = pre.set_output(transform='pandas')

In [29]:
# Fit the preprocessing transformer on X and apply it in one step, then
# preview the transformed features.
# NOTE(review): this fit runs on all rows of X, before the train/test split
# performed in a later cell, so the imputer/scaler statistics include rows
# that end up in the test set. Consider splitting first and fitting `pre`
# on the training rows only — confirm whether this matters for the
# intended evaluation.
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0


Step 5: Train Test Split

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical on every run.
split_options = {"test_size": 0.2, "random_state": 21}
xtrain, xtest, ytrain, ytest = train_test_split(X_pre, Y, **split_options)

In [31]:
# Preview the training features; the index shows the shuffled row order.
xtrain.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
30,-0.258074,-0.205629,-0.990357,0.0,1.0,0.0
21,0.102724,1.169186,0.732788,0.0,0.0,1.0
19,0.279442,1.159837,-1.743127,0.0,0.0,1.0
11,0.593085,-1.06554,0.319834,1.0,0.0,0.0
22,0.006007,0.05185,0.762376,0.0,1.0,0.0


In [32]:
# Preview the training targets.
ytrain.head()

Unnamed: 0,PROFIT
30,99937.59
21,111313.02
19,122776.86
11,144259.4
22,110352.25


In [33]:
# Preview the held-out test features.
xtest.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
7,1.245057,0.87198,0.932186,0.0,1.0,0.0
44,-1.134305,1.206419,-1.509074,1.0,0.0,0.0
43,-1.281134,0.217682,-1.449605,0.0,0.0,1.0
25,-0.199312,0.656489,-0.603517,1.0,0.0,0.0
14,1.017181,1.269199,0.375742,0.0,1.0,0.0


In [34]:
# Preview the held-out test targets.
ytest.head()

Unnamed: 0,PROFIT
7,155752.6
44,65200.33
43,69758.98
25,107404.34
14,132602.65


In [35]:
# (rows, columns) of the training features.
xtrain.shape

(40, 6)

In [36]:
# (rows, columns) of the test features.
xtest.shape

(10, 6)

Step 6: Build the model

In [37]:
from sklearn.tree import DecisionTreeRegressor

# Baseline model: a depth-1 tree (a single split) grown with the
# absolute-error criterion, requiring at least 5 samples to split a node
# and at least 5 samples in each leaf.
# random_state is pinned because scikit-learn permutes the candidate
# features at every split; without a seed, equally good splits can be
# resolved differently from one run to the next.
model1 = DecisionTreeRegressor(
    max_depth=1,
    min_samples_split=5,
    min_samples_leaf=5,
    criterion="absolute_error",
    random_state=21,
)
model1.fit(xtrain, ytrain)

In [38]:
# R² of the baseline tree on the training split.
model1.score(xtrain, ytrain)

0.6019636534994364

In [39]:
# R² of the baseline tree on the held-out test split.
model1.score(xtest, ytest)

0.5746278420052591

Hyperparameter tuning with GridSearchCV

In [40]:
# Search space for the tree: depths 2 through 11, split/leaf minimums
# 5 through 10, and both error criteria.
depth_options = list(range(2, 12))
min_sample_options = list(range(5, 11))
params = {
    "max_depth": depth_options,
    "min_samples_split": list(min_sample_options),
    "min_samples_leaf": list(min_sample_options),
    "criterion": ["squared_error", "absolute_error"],
}

In [41]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every combination in `params`, scored by R² with
# 3-fold cross-validation on the training split.
# The base tree gets a fixed random_state so that repeated runs of the
# search grow the same trees and therefore select the same parameters;
# an unseeded tree may break ties between equally good splits differently.
dtr = DecisionTreeRegressor(random_state=21)
gscv = GridSearchCV(dtr, param_grid=params, cv=3, scoring='r2')
gscv.fit(xtrain, ytrain)

In [42]:
# Parameter combination with the best mean cross-validated score.
gscv.best_params_

{'criterion': 'absolute_error',
 'max_depth': 4,
 'min_samples_leaf': 5,
 'min_samples_split': 6}

In [43]:
# Mean cross-validated R² achieved by the best parameter combination.
gscv.best_score_

0.7200499981888969

In [44]:
# Keep a handle on the estimator the grid search produced with the best
# parameter combination, and display it.
best_dtr = gscv.best_estimator_
best_dtr

Step 7: Model evaluation

In [45]:
# R² of the tuned tree on the training split.
best_dtr.score(xtrain, ytrain)

0.9194209542324717

In [46]:
# R² of the tuned tree on the held-out test split.
best_dtr.score(xtest, ytest)

0.9202015467471865