# Script should not be run. Optimized features are already included.

# Loading the data

In [6]:
import pandas as pd

X_train = pd.read_csv('data/preprocessed_data/imputed_data.csv')
X_test = pd.read_csv('data/preprocessed_data/imputed_test_data.csv')

y_train = X_train["y"]
X_train = X_train.drop('y', axis=1)

## Gradient Boosting Classifier

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 200, 500],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [1, 4, 10]
}

# Create the Gradient Boosting Classifier
gb_model = GradientBoostingClassifier()

# Perform grid search
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)



Best Parameters: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500}
Best Score: 0.9095715587967185


## Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [40, 50, 80],
    'min_samples_split': [20, 50]
}

# Create the Random Forest Classifier
rf_model = RandomForestClassifier()

# Perform grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'max_depth': 40, 'min_samples_split': 20, 'n_estimators': 100}
Best Score: 0.90665451230629


# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression


from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Perform feature scaling on X_train
X_train_scaled = scaler.fit_transform(X_train)



# Define the logistic regression model
model = LogisticRegression()

# Define the hyperparameters to search over
param_grid = {
    'penalty': ['l1'],
    'C': [0.1, 1.0, 3.0],
    'solver': ['liblinear', 'saga'],
    "max_iter": [1000]
}

# Perform the grid search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and the corresponding accuracy score
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

Best Parameters:  {'C': 3.0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
Best Accuracy:  0.9038286235186872


In [18]:
from sklearn.tree import DecisionTreeClassifier

# Define the decision tree classifier
model = DecisionTreeClassifier()

# Define the hyperparameters to search over
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5],
    'min_samples_split': [3, 30, 100],
    'min_samples_leaf': [1]
}

# Perform the grid search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding accuracy score
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

Best Parameters:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 100}
Best Accuracy:  0.9056517775752051
