In [None]:
from sklearn.datasets import load_iris
import pandas as pd
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold,cross_val_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split

iris=load_iris()
X=iris.data
y=iris.target
dt=DecisionTreeClassifier()
kfold=KFold(n_splits=5,shuffle=True,random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scores=cross_val_score(dt,X,y,cv=kfold,scoring="accuracy")

dt_gini = DecisionTreeClassifier(criterion='gini', max_depth=3)
dt_gini.fit(X_train, y_train)
y_pred = dt_gini.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Cross-validation scores:",scores)
print("Mean accuracy:",np.mean(scores))
print("Standard deviation of accuracy:",np.std(scores))

Accuracy: 1.0
Cross-validation scores: [1.         0.96666667 0.93333333 0.93333333 0.93333333]
Mean accuracy: 0.9533333333333335
Standard deviation of accuracy: 0.02666666666666666


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
model = DecisionTreeRegressor(max_depth=5)
scores=cross_val_score(model,X,y,cv=5,scoring="neg_mean_squared_error")
mse=-scores.mean()
print("Mean MSE:",mse)

Mean MSE: 0.09333333333333334


In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,KFold
iris=load_iris()
X=iris.data
y=iris.target
dt = DecisionTreeClassifier(random_state=42)
param_grid = {'max_depth': [2, 3, 4, 5, 6, None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'criterion':['gini','entropy']}
kfold=KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=dt,param_grid=param_grid,cv=kfold,scoring='accuracy',n_jobs=-1)
grid_search.fit(X, y)
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Best hyperparameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best cross-validation accuracy: 0.9666666666666668


In [6]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

train_df = pd.read_csv('train.csv')
y=train_df['SalePrice']
X=train_df.drop(columns=['SalePrice', 'Id'])


cat_cols = X.select_dtypes(include=['object']).columns

num_cols = X.select_dtypes(exclude=['object']).columns

X[num_cols]=X[num_cols].fillna(X[num_cols].median())
X[cat_cols]=X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

preprocessor = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'),cat_cols),('num', 'passthrough', num_cols)])
dt_reg = DecisionTreeRegressor(random_state=42)
param_grid = {'model__max_depth': [3, 5, 7, 10, None],
              'model__min_samples_split': [2, 5, 10, 20],
              'model__min_samples_leaf': [1, 2, 5, 10]}
pipeline = Pipeline([('preprocess', preprocessor),('model', dt_reg)])
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=pipeline,param_grid=param_grid,scoring='neg_root_mean_squared_error',cv=kfold,verbose=2,n_jobs=-1)

grid_search.fit(X, y)
print("Best hyperparameters:",grid_search.best_params_)
print("Best CV RMSE:",-grid_search.best_score_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best hyperparameters: {'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 20}
Best CV RMSE: 37842.158063029936


In [None]:
Excercise

In [7]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold,cross_val_score

X,y=load_iris(return_X_y=True)
kfold=KFold(n_splits=5,shuffle=True,random_state=42)
depths=[1,2,3,4,5,6,None]

for d in depths:
    dt=DecisionTreeClassifier(max_depth=d,criterion='gini',random_state=42)
    scores=cross_val_score(dt,X,y,cv=kfold,scoring='accuracy')
    print("Depth:",d,"--Mean cross validation accuracy:",scores.mean())


Depth: 1 --Mean cross validation accuracy: 0.6333333333333333
Depth: 2 --Mean cross validation accuracy: 0.9466666666666667
Depth: 3 --Mean cross validation accuracy: 0.9533333333333334
Depth: 4 --Mean cross validation accuracy: 0.9533333333333335
Depth: 5 --Mean cross validation accuracy: 0.9533333333333335
Depth: 6 --Mean cross validation accuracy: 0.9533333333333335
Depth: None --Mean cross validation accuracy: 0.9533333333333335


In [9]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

df=pd.read_csv("train.csv")
y=df["SalePrice"]
X=df.drop(columns=["SalePrice","Id"])

cat_cols=X.select_dtypes(include="object").columns
num_cols=X.select_dtypes(exclude="object").columns

X[num_cols]=X[num_cols].fillna(X[num_cols].median())
X[cat_cols]=X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

preprocess=ColumnTransformer([
("cat",OneHotEncoder(handle_unknown="ignore"),cat_cols),
("num","passthrough",num_cols)
])

depths=[3,5,7,10,None]

for d in depths:
    model=Pipeline([
    ("preprocess",preprocess),
    ("model",DecisionTreeRegressor(max_depth=d,random_state=42))
    ])
    scores=cross_val_score(model,X,y,cv=5,scoring="neg_mean_squared_error")
    mse=-scores.mean()
    print("Depth:",d,"--Mean MSE:",mse)

Depth: 3 --Mean MSE: 1973952575.8652425
Depth: 5 --Mean MSE: 1668747298.4672348
Depth: 7 --Mean MSE: 1536388155.1308594
Depth: 10 --Mean MSE: 1658014764.1185594
Depth: None --Mean MSE: 1819023118.4561641


In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

df=pd.read_csv("titanic.csv")
y=df["Survived"]
X=df.drop(columns=["Survived","PassengerId","Name","Ticket","Cabin"])

cat_cols=X.select_dtypes(include="object").columns
num_cols=X.select_dtypes(exclude="object").columns

preprocess=ColumnTransformer([
("cat",OneHotEncoder(handle_unknown="ignore"),cat_cols),
("num","passthrough",num_cols)
])

pipeline=Pipeline([
("preprocess",preprocess),
("model",DecisionTreeClassifier(random_state=42))
])

param_grid={
"model__max_depth":[3,5,7,10,None],
"model__min_samples_split":[2,5,10],
"model__min_samples_leaf":[1,2,5],
"model__criterion":["gini","entropy"]
}

kfold=KFold(n_splits=5,shuffle=True,random_state=42)

grid=GridSearchCV(pipeline,param_grid,cv=kfold,scoring="accuracy",n_jobs=-1)
grid.fit(X,y)

best_model=grid.best_estimator_

predictions=best_model.predict(X)

df["Predicted_Survived"]=predictions

print(df[["PassengerId","Survived","Predicted_Survived"]].head(10))
print("Best hyperparameters:",grid.best_params_)
print("Best CV accuracy:",grid.best_score_)

   PassengerId  Survived  Predicted_Survived
0            1         0                   0
1            2         1                   1
2            3         1                   1
3            4         1                   1
4            5         0                   0
5            6         0                   0
6            7         0                   0
7            8         0                   0
8            9         1                   1
9           10         1                   1
Best hyperparameters: {'model__criterion': 'gini', 'model__max_depth': 3, 'model__min_samples_leaf': 5, 'model__min_samples_split': 2}
Best CV accuracy: 0.817079907099366


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
df = pd.read_csv("adult.csv")

df.head()
df.shape
df = df.replace("?", np.nan)

df = df.dropna()
X = df.drop("income", axis=1)
y = df["income"]
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, categorical_cols),
        ("num", "passthrough", numerical_cols),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt = DecisionTreeClassifier(random_state=42)

param_grid = {
    "classifier__criterion": ["gini", "entropy"],
    "classifier__max_depth": [10, 15, 20],
    "classifier__min_samples_split": [2, 5],
}
pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("classifier", dt)
    ]
)

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Best Params:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))

Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_split': 2}
Best CV Score: 0.8504703883293963
Best Params: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_split': 2}
Accuracy: 0.849494447207028
