In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm

# Load data
data = pd.read_csv('Otherdata/bank-additional-full.csv', delimiter=';')

# Select specific columns for the model
selected_columns = ['age', 'job', 'education', 'marital', 'housing', 'loan', 'cons.price.idx', 'campaign', 'y']
selected_data = data[selected_columns]

# Encode categorical variables
categorical_cols = ['job', 'education', 'marital', 'housing', 'loan']  # List of categorical columns excluding 'y'
encoder = OneHotEncoder(drop='first')
encoded_categorical = encoder.fit_transform(selected_data[categorical_cols]).toarray()
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))


# Combine numerical and encoded categorical features
numerical_features = selected_data[['age', 'cons.price.idx', 'campaign']]  # Explicitly list numerical features
combined_features = pd.concat([numerical_features, encoded_categorical_df], axis=1)

# Prepare target variable by converting 'y' to binary
target = selected_data['y'].apply(lambda x: 1 if x == 'yes' else 0)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, target, test_size=0.3, random_state=42)

# Add constant to training data for statsmodels
X_train_sm = sm.add_constant(X_train)

# Fit logistic regression model with a method that includes regularization
model = sm.Logit(y_train, X_train_sm)
result = model.fit(method='lbfgs', maxiter=500)  # Increased maxiter and changed method to 'lbfgs'

# Print the model summary
print(result.summary())


                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                28831
Model:                          Logit   Df Residuals:                    28803
Method:                           MLE   Df Model:                           27
Date:                Mon, 15 Apr 2024   Pseudo R-squ.:                 0.04185
Time:                        21:55:56   Log-Likelihood:                -9730.6
converged:                       True   LL-Null:                       -10156.
Covariance Type:            nonrobust   LLR p-value:                1.093e-161
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                             0.2029      3.046      0.067      0.947      -5.768       6.174
age                               0.0102      0.002      4.419      0.000 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm

# Load data
data = pd.read_csv('Otherdata/bank-additional-full.csv', delimiter=';')

# Select specific columns for the model
selected_columns = ['age', 'job', 'education', 'marital', 'housing', 'loan', 'cons.price.idx', 'campaign', 'y']
selected_data = data[selected_columns]

# Encode categorical variables
categorical_cols = ['job', 'education', 'marital', 'housing', 'loan']  # List of categorical columns excluding 'y'
encoder = OneHotEncoder()
encoded_categorical = encoder.fit_transform(selected_data[categorical_cols]).toarray()
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))


# Combine numerical and encoded categorical features
numerical_features = selected_data[['age', 'cons.price.idx', 'campaign']]  # Explicitly list numerical features
combined_features = pd.concat([numerical_features, encoded_categorical_df], axis=1)

combined_features = combined_features.drop(['housing_unknown', 'loan_unknown', 'job_unknown', 'education_unknown', 'marital_unknown'], axis=1).astype(int) # Drop unknown columns

# Prepare target variable by converting 'y' to binary
target = selected_data['y'].apply(lambda x: 1 if x == 'yes' else 0)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, target, test_size=0.3, random_state=42)

# Add constant to training data for statsmodels
X_train_sm = sm.add_constant(X_train)

# Fit logistic regression model with a method that includes regularization
model = sm.Logit(y_train, X_train_sm)
result = model.fit(method='lbfgs', maxiter=500)  # Increased maxiter and changed method to 'lbfgs'

# Print the model summary
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                28831
Model:                          Logit   Df Residuals:                    28803
Method:                           MLE   Df Model:                           27
Date:                Mon, 15 Apr 2024   Pseudo R-squ.:                 0.04098
Time:                        21:55:57   Log-Likelihood:                -9739.5
converged:                       True   LL-Null:                       -10156.
Covariance Type:            nonrobust   LLR p-value:                6.233e-158
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                             0.0735      2.978      0.025      0.980      -5.763       5.910
age                               0.0094      0.002      4.094      0.000 

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import statsmodels.api as sm

# Load data
data = pd.read_csv('Otherdata/bank-additional-full.csv', delimiter=';')

# Select specific columns for the model
selected_columns = ['age', 'job', 'education', 'marital', 'housing', 'loan', 'cons.price.idx', 'campaign', 'y']
selected_data = data[selected_columns]

# Define categorical columns
categorical_cols = ['job', 'education', 'marital', 'housing', 'loan']  # List of categorical columns excluding 'y'

# Initialize encoder with specifying to drop the first category in each feature
encoder = OneHotEncoder(drop='first')
encoded_categorical = encoder.fit_transform(selected_data[categorical_cols]).toarray()
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))

# Combine numerical and encoded categorical features
numerical_features = selected_data[['age', 'cons.price.idx', 'campaign']]  # Explicitly list numerical features
combined_features = pd.concat([numerical_features, encoded_categorical_df], axis=1)

# Prepare target variable by converting 'y' to binary
target = selected_data['y'].apply(lambda x: 1 if x == 'yes' else 0)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, target, test_size=0.3, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Fit logistic regression model using statsmodels (for detailed stats) or scikit-learn (for simplicity)
model = sm.Logit(y_train_smote, sm.add_constant(X_train_smote))
result = model.fit(method='lbfgs', maxiter=500)

# Alternatively, use scikit-learn logistic regression if detailed statistics are not needed
# lr = LogisticRegression(max_iter=500)
# lr.fit(X_train_smote, y_train_smote)

# Predict on test data
# y_pred = lr.predict(X_test)

# Print model summary
print(result.summary())

# Evaluate the model
# print(classification_report(y_test, y_pred))
# print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                51160
Model:                          Logit   Df Residuals:                    51132
Method:                           MLE   Df Model:                           27
Date:                Mon, 15 Apr 2024   Pseudo R-squ.:                 0.07370
Time:                        21:55:59   Log-Likelihood:                -32848.
converged:                      False   LL-Null:                       -35461.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                             2.1163      1.478      1.432      0.152      -0.780       5.013
age                               0.0070      0.001      6.108      0.000 

