In [17]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Train and evaluate a logistic-regression classifier for the retention flag.
#
# Flow: load the spreadsheet, drop rows without a target, one-hot encode the
# categorical predictors, hold out 20% of the rows, then grid-search C/penalty
# over a pipeline (mean imputation -> standardisation -> logistic regression)
# and print the accuracy on the held-out rows.

# Name of the target column; kept in one place because it is used three times.
TARGET_COL = 'Retained F17-F18? (1=yes, 0=no)'

# Load the dataset
file_path = "C:\\Users\\ashwi\\Downloads\\Preliminary college year.xlsx"
df = pd.read_excel(file_path)

# Remove rows with missing target variable
df = df.dropna(subset=[TARGET_COL])

# Drop unnecessary columns
columns_to_drop = [
    'Federal Ethnic Group',
    'Gender',
    'Reason for not Completing Connect',
    'Reason not Retained',
]
df = df.drop(columns=columns_to_drop)

# Define predictor variables (V) and target variable (TV)
V = df.drop(columns=[TARGET_COL])
TV = df[TARGET_COL]

# Perform one-hot encoding for categorical variables.
# Encoding happens before the split so train and test share identical columns.
# dtype=float keeps the dummy columns numeric regardless of the pandas
# version's default dummy dtype.
categorical_cols = V.select_dtypes(include=['object']).columns
V_encoded = pd.get_dummies(
    V, columns=categorical_cols, drop_first=True, dtype=float
)

# Split the data into training and testing sets.
# The split is done BEFORE imputation/scaling so that the test rows never
# contribute to the imputation means or the scaling statistics.
V_train, V_test, TV_train, TV_test = train_test_split(
    V_encoded, TV, test_size=0.2, random_state=42
)

# Preprocessing steps: fill missing values with the column mean, then
# standardise. They live inside the pipeline so they are re-fit on the
# training part of every cross-validation fold.
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Define pipeline with preprocessor and logistic regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)),
])

# Define hyperparameters for grid search, allowing both 'l1' and 'l2' penalties.
# The 'classifier__' prefix routes each parameter to the pipeline's
# 'classifier' step.
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
}

# Train logistic regression model with hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(V_train, TV_train)

# Get the best model from the grid search.
# This is the whole fitted pipeline (preprocessing + classifier), so it can
# be applied directly to the unscaled test features.
best_logistic_model = grid_search.best_estimator_

# Predict target variable for the testing set
TV_pred = best_logistic_model.predict(V_test)

# Calculate accuracy
accuracy = accuracy_score(TV_test, TV_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9545454545454546
