In [1]:
import pickle

# Load the Pickle models
with open('random_forest.pkl', 'rb') as file:
    pickle_model = pickle.load(file)

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('transformed_df.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,-1.730108,-0.789272,0.827377,108,-0.592481,0.432793,-0.473674,523,-0.502445,47,1.0,0.0,1.0
1,-1.72622,1.26699,-1.566107,190,0.638789,0.432793,-0.473674,596,0.786845,81,0.0,0.0,0.0
2,-1.722332,1.26699,0.827377,353,-0.284663,-0.474545,-0.473674,669,-0.488854,47,0.0,0.0,1.0
3,-1.718444,1.26699,-1.566107,272,0.407926,0.432793,-0.473674,49,0.42073,55,0.0,0.0,1.0
4,-1.714556,-0.789272,0.827377,15,0.407926,-0.474545,-0.473674,472,-0.486337,47,1.0,0.0,1.0


In [24]:
df['Survived'].apply(lambda x: 1 if x > 0 else 0)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [3]:
df['Survived'] = df['Survived'].apply(lambda x: 1 if x > 0 else 0)
df['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [5]:
features = ['Pclass', 'Ticket', 'Fare', 'Cabin', 'Sex_male']
target = 'Survived'
X = df[features]
y = df[target]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Make predictions with the Pickle model
pickle_predictions = pickle_model.predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluate the Pickle model
pickle_accuracy = accuracy_score(y_test, pickle_predictions)
pickle_precision = precision_score(y_test, pickle_predictions, average='weighted')
pickle_recall = recall_score(y_test, pickle_predictions, average='weighted')
pickle_f1 = f1_score(y_test, pickle_predictions, average='weighted')
pickle_confusion = confusion_matrix(y_test, pickle_predictions)

# Print the metrics
print("Pickle Model Accuracy:", pickle_accuracy)
print("Pickle Model Precision:", pickle_precision)
print("Pickle Model Recall:", pickle_recall)
print("Pickle Model F1 Score:", pickle_f1)
print("Pickle Model Confusion Matrix:\n", pickle_confusion)

Pickle Model Accuracy: 0.8324022346368715
Pickle Model Precision: 0.8324681531286563
Pickle Model Recall: 0.8324022346368715
Pickle Model F1 Score: 0.8306889154103546
Pickle Model Confusion Matrix:
 [[94 11]
 [19 55]]


In [33]:
pickle_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [34]:
# Define the hyperparameters grid to search over, refined based on the original model's parameters
param_grid = {
    'n_estimators': [50, 100, 150, 200],          # Expand around 100
    'max_features': ['sqrt', 'log2', None],       # Original value was 'sqrt'
    'max_depth': [None, 10, 20, 30, 40],          # Include None and specific depths
    'min_samples_split': [2, 5, 10],              # Start from original (2) and expand
    'min_samples_leaf': [1, 2, 4],                # Include original (1) and some higher values
    'bootstrap': [True, False]                    # Include both options for exploration
}


In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
grid_search = GridSearchCV(estimator=pickle_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y)
print("Best Parameters found:", grid_search.best_params_)


Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimator

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters found: {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}


In [39]:
# Evaluate the best model
best_model = grid_search.best_estimator_
print("Best Model Performance on Training Set:", best_model.score(X, y))

Best Model Performance on Training Set: 0.9562289562289562


In [40]:
best_model.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 150,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo optimize the Random Forest model using hyperparameter tuning and create a detailed report, I will follow these steps:

1. **Access the Global Variables**:
   - Load the `features` array and the `dataset` to ensure they are available for the tuning process.

2. **Perform Hyperparameter Tuning**:
   - Use the `hyperparameter_tune_rf` tool to optimize the Random Forest model.
   - Focus on key hyperparameters such as `n_estimators`, `max_depth`, `min_samples_split`, and `min_samples_leaf`.

3. **Print Output Analysis**:
   - Monitor the output generated by the tuning tool to observe changes in model performance metrics.

4. **Create a Report**:
   - Document the initial hyperparameters, the tuning process, and the results.
   - Summarize the findings and provide recommendations for the final model configuration.

Let's start by loading the `features` array and the `dataset`.

### Step 1: Access the Global Variables

Action: L

'Agent stopped due to iteration limit or time limit.'