In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load and preprocess dataset
def load_data(file_path):
    data = pd.read_excel(file_path)
    data.dropna(subset=['Retained F17-F18? (1=yes, 0=no)'], inplace=True)
    data.drop(columns=['Federal Ethnic Group', 'Gender', 'Reason for not Completing Connect', 'Reason not Retained'], inplace=True)
    return data

# Prepare features and target
def prepare_features_target(data):
    X = data.drop(columns=['Retained F17-F18? (1=yes, 0=no)'])
    y = data['Retained F17-F18? (1=yes, 0=no)']
    X_encoded = pd.get_dummies(X, drop_first=True)
    return X_encoded, y

# Preprocess data
def preprocess_data(X):
    imputer = SimpleImputer(strategy='mean')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)
    return X_scaled

# Split data
def split_data(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search for logistic regression
def perform_grid_search(X_train, y_train):
    logistic_clf = LogisticRegression(max_iter=1000, random_state=42)
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
    grid_search = GridSearchCV(logistic_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Main execution
def main():
    file_path = "/Users/charishyadavali/Downloads/Preliminary college year.xlsx"

    data = load_data(file_path)
    X, y = prepare_features_target(data)
    X_processed = preprocess_data(X)
    X_train, X_test, y_train, y_test = split_data(X_processed, y)
    best_model = perform_grid_search(X_train, y_train)
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

# Run the main function
main()


Accuracy: 0.9545454545454546


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/charishyadavali/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/charishyadavali/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/charishyadavali/anaconda3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual