## Individual Project 
Andrew Chen

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier


df = pd.read_csv("amazon_employee_access_train.csv")

X = df.drop(["ACTION"], axis = 1)
y = df[["ACTION"]] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


# Set up the parameter grid
param_grid = {
    'max_depth': [1, 3, 5, 6, None],  # Range of depths to test
    'min_samples_split': [2, 4, 6, 8],  # Different minimum samples per split
    'max_features': [None, 'sqrt', 'log2']  # Various options for max features
}

# Initialize the model
tree = DecisionTreeClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, cv=5, scoring='recall')

# Fit GridSearchCV
grid_search.fit(X_train_std, y_train)

# Print the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Evaluate on the test set
best_tree = grid_search.best_estimator_
test_score = best_tree.score(X_test_std, y_test)
print("Test set recall: %f" % test_score)

Best parameters found:  {'max_depth': 1, 'max_features': None, 'min_samples_split': 2}
Best cross-validation score:  1.0
Test set recall: 0.942122
