In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm

# Load data (semicolon-delimited CSV)
data = pd.read_csv('Otherdata/bank-additional-full.csv', delimiter=';')

# Select specific columns for the model; 'y' is the response, the rest are predictors
selected_columns = ['age', 'job', 'education', 'marital', 'housing', 'loan', 'cons.price.idx', 'campaign', 'y']
selected_data = data[selected_columns]

# One-hot encode the categorical predictors.
# drop='first' omits one level per variable so the dummy columns are not
# perfectly collinear with the intercept that is added further down.
categorical_cols = ['job', 'education', 'marital', 'housing', 'loan']  # List of categorical columns excluding 'y'
encoder = OneHotEncoder(drop='first')
encoded_categorical = encoder.fit_transform(selected_data[categorical_cols]).toarray()
# Reuse the source frame's index: pd.concat(axis=1) aligns on index labels, so a
# fresh RangeIndex here would silently misalign rows (and introduce NaNs) if
# `data` were ever filtered or re-indexed before this point.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=selected_data.index,
)

# Combine numerical and encoded categorical features into one design matrix
numerical_features = selected_data[['age', 'cons.price.idx', 'campaign']]  # Explicitly list numerical features
combined_features = pd.concat([numerical_features, encoded_categorical_df], axis=1)

# Prepare target variable: 1 where 'y' == 'yes', 0 for every other value
target = (selected_data['y'] == 'yes').astype(int)

# Split data into training and testing sets (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(combined_features, target, test_size=0.3, random_state=42)

# Add constant to training data for statsmodels (Logit does not add an intercept itself).
# NOTE: X_test needs the same sm.add_constant(...) treatment before result.predict().
X_train_sm = sm.add_constant(X_train)

# Fit the logistic regression by plain (unpenalized) maximum likelihood.
# method='lbfgs' only selects the optimizer -- it does NOT add regularization;
# use model.fit_regularized(...) if a penalized fit is actually wanted.
# maxiter is raised to 500 to give the optimizer more iterations to converge.
model = sm.Logit(y_train, X_train_sm)
result = model.fit(method='lbfgs', maxiter=500)

# Print the model summary
print(result.summary())


                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                28831
Model:                          Logit   Df Residuals:                    28803
Method:                           MLE   Df Model:                           27
Date:                Mon, 15 Apr 2024   Pseudo R-squ.:                 0.04185
Time:                        19:11:50   Log-Likelihood:                -9730.6
converged:                       True   LL-Null:                       -10156.
Covariance Type:            nonrobust   LLR p-value:                1.093e-161
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                             0.2029      3.046      0.067      0.947      -5.768       6.174
age                               0.0102      0.002      4.419      0.000 