# DecisionTreeRegressor

In [430]:
import pandas as pd
import numpy as np

In [431]:
df = pd.read_csv("insurance.csv")
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [432]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [433]:
df = pd.get_dummies(df,columns=['sex','smoker','region'])
df = df.astype(int)
df.head(3)

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27,0,16884,1,0,0,1,0,0,0,1
1,18,33,1,1725,0,1,1,0,0,0,1,0
2,28,33,3,4449,0,1,1,0,0,0,1,0


In [434]:
x = df.drop(columns='charges')
y = df['charges']

In [435]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [436]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
dr = DecisionTreeRegressor()
dr.fit(x_train,y_train)
y_pred = dr.predict(x_test)
mse = mean_squared_error(y_test,y_pred)
print(f"MSE: {mse}")

mae = mean_absolute_error(y_test,y_pred)
print(f"MAE: {mae}")

r2 = r2_score(y_test,y_pred)
print(f"r2_score: {r2*100:.2f}%")

MSE: 39779114.90578358
MAE: 2950.0466417910447
r2_score: 74.38%


# tips Dataset

In [438]:
df = pd.read_csv("tips.csv")
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [439]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [440]:
df = pd.get_dummies(df,columns=['sex','smoker','day','time'])
df = df.astype(int)
df.head(3)

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16,1,2,1,0,1,0,0,0,1,0,1,0
1,10,1,3,0,1,1,0,0,0,1,0,1,0
2,21,3,3,0,1,1,0,0,0,1,0,1,0


In [441]:
x = df.drop(columns=['tip'])
y = df['tip']

In [442]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [443]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
dr = DecisionTreeRegressor()
dr.fit(x_train,y_train)
y_pred = dr.predict(x_test)
mse = mean_squared_error(y_test,y_pred)
print(f"MSE: {mse}")

mae = mean_absolute_error(y_test,y_pred)
print(f"MAE: {mae}")

r2 = r2_score(y_test,y_pred)
print(f"r2_score: {r2*100:.2f}%")

MSE: 1.399234693877551
MAE: 0.8928571428571429
r2_score: 6.21%


# Tittanic dataset

In [445]:
df = pd.read_csv("titanic.csv")
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [446]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [447]:
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='mean')
df['Age'] = si.fit_transform(df[['Age']])
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [448]:
df = pd.get_dummies(df,columns=['Sex'])
df = df.drop(columns=['PassengerId','Name','Cabin','Fare'])
df.head(3)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Ticket,Embarked,Sex_female,Sex_male
0,0,3,34.5,0,0,330911,Q,False,True
1,1,3,47.0,1,0,363272,S,True,False
2,0,2,62.0,0,0,240276,Q,False,True


In [449]:
df.isna().sum()

Survived      0
Pclass        0
Age           0
SibSp         0
Parch         0
Ticket        0
Embarked      0
Sex_female    0
Sex_male      0
dtype: int64

In [450]:
x = df.drop(columns=['Embarked'])
y = df['Embarked']

In [451]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,  precision_score, recall_score, confusion_matrix


dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

precision = precision_score(y_test,y_pred, average='weighted')
print(f"Precision: {precision*100:.2f}%")


recall = recall_score(y_test,y_pred, average='weighted')
print(f"Recall: {recall*100:.2f}%")


cm = confusion_matrix(y_test,y_pred)
print(f"Confusion Matrix: ")
print(cm)

Accuracy: 34.69%
Precision: 38.24%
Recall: 34.69%
Confusion Matrix: 
[[6 6 1 0 0 0 0]
 [5 6 5 1 2 0 0]
 [0 3 4 1 1 0 0]
 [0 1 1 0 0 0 1]
 [0 0 0 2 1 1 1]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [452]:
from sklearn.model_selection import GridSearchCV

dt = DecisionTreeClassifier(random_state=42)

param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],   
    "max_depth": [None, 3, 5, 7, 10, 15],           
    "min_samples_split": [2, 5, 10],                
    "min_samples_leaf": [1, 2, 4, 6],               
}


grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,              
    scoring="accuracy",
    n_jobs=-1
)


grid_search.fit(x_train, y_train)


best_dt = grid_search.best_estimator_


y_pred = best_dt.predict(x_test)


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
cm = confusion_matrix(y_test, y_pred)


print("Best Parameters:", grid_search.best_params_)
print(f"Best CV Accuracy: {grid_search.best_score_*100:.2f}%\n")
print(f"Test Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print("Confusion Matrix:\n", cm)



Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best CV Accuracy: 44.62%

Test Accuracy: 44.90%
Precision: 46.96%
Recall: 44.90%
Confusion Matrix:
 [[6 6 1 0 0 0]
 [1 9 9 0 0 0]
 [0 2 7 0 0 0]
 [0 1 1 0 0 1]
 [0 2 1 0 0 2]
 [0 0 0 0 0 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
