# Grid Search

- Titanic Dataset

### Import necessary libraries

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

### Load the Titanic dataset

In [13]:
titanic_data = pd.read_csv(
    "titanic.csv"
)  # Replace 'path_to_titanic.csv' with the actual file path

### Preprocess the dataset

In [14]:
# Drop 'Name,' 'Ticket,' and 'Cabin' columns
titanic_data = titanic_data.drop(["Name", "Ticket", "Cabin"], axis=1)

In [15]:
# Handle missing values (e.g., fill missing ages with the mean age)
titanic_data["Age"].fillna(titanic_data["Age"].mean(), inplace=True)
titanic_data["Embarked"].fillna(titanic_data["Embarked"].mode()[0], inplace=True)

titanic_data.value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data["Age"].fillna(titanic_data["Age"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data["Embarked"].fillna(titanic_data["Embarked"].mode()[0], inplace=True)


PassengerId  Survived  Pclass  Sex     Age        SibSp  Parch  Fare    Embarked
1            0         3       male    22.000000  1      0      7.250   S           1
599          0         3       male    29.699118  0      0      7.225   C           1
588          1         1       male    60.000000  1      1      79.200  C           1
589          0         3       male    22.000000  0      0      8.050   S           1
590          0         3       male    29.699118  0      0      8.050   S           1
                                                                                   ..
301          1         3       female  29.699118  0      0      7.750   Q           1
302          1         3       male    29.699118  2      0      23.250  Q           1
303          0         3       male    19.000000  0      0      0.000   S           1
304          1         2       female  29.699118  0      0      12.350  Q           1
891          0         3       male    32.000000  0      0 

In [16]:
# Encode categorical variables ('Sex' and 'Embarked')
label_encoder = LabelEncoder()
titanic_data["Sex"] = label_encoder.fit_transform(titanic_data["Sex"])
titanic_data["Embarked"] = label_encoder.fit_transform(titanic_data["Embarked"])

titanic_data["Sex"].value_counts()

Sex
1    577
0    314
Name: count, dtype: int64

In [17]:
# Define features (X) and target (y)
X = titanic_data.drop("Survived", axis=1)  # Assuming 'Survived' is the target variable
y = titanic_data["Survived"]

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
# Create a Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf

### Hyperparameter grid to search

In [20]:
# DEFAULT # param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 10, 20, 30, 40, 50],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }


param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [15, 14, 13],
    "min_samples_split": [6,3,4],
    "min_samples_leaf": [4,2,7],
}

In [21]:
# Perform Grid Search with cross-validation (e.g., K=5)
grid_search = GridSearchCV(
    estimator=clf, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1
)
grid_search.fit(X_train, y_train)

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'entropy', 'gini', 'log_loss'}. Got 'global' instead.

--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'entropy', 'log_loss', 'gini'}. Got 'global' instead.

--------------------------------------------------------------------------------
34 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'log_loss', 'gini', 'entropy'}. Got 'global' instead.

--------------------------------------------------------------------------------
31 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'gini', 'entropy', 'log_loss'}. Got 'global' instead.

--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\KSCOM\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of DecisionTreeClassifier must be a str among {'log_loss', 'entropy', 'gini'}. Got 'global' instead.


In [85]:
# Get the best hyperparameters
best_params = grid_search.best_params_
best_params

{'criterion': 'gini',
 'max_depth': 15,
 'min_samples_leaf': 7,
 'min_samples_split': 6}

In [86]:
# Train a Decision Tree classifier with the best hyperparameters
best_clf = DecisionTreeClassifier(random_state=42, **best_params)
best_clf.fit(X_train, y_train)

In [87]:
# Evaluate the model on the test set
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [88]:
# Print the best hyperparameters and model accuracy
print(f"Best Hyperparameters: {best_params}")
print(f"Model Accuracy on Test Data: {accuracy:.2f}")

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 7, 'min_samples_split': 6}
Model Accuracy on Test Data: 0.83
