In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_selection import SelectFromModel

# Load the semicolon-delimited 'bank.csv' dataset into a pandas DataFrame
df = pd.read_csv("bank.csv", sep=";")

# Drop the columns the model does not use. 'contact' is removed here as well:
# it is discarded entirely, so its "unknown" placeholders need no separate handling.
df = df.drop(columns=['contact', 'day', 'month', 'duration', 'pdays', 'previous'])

# Treat the "unknown" placeholder in 'poutcome' as a missing value (NaN) ...
df['poutcome'] = df['poutcome'].replace('unknown', np.nan)

# ... and impute it with the most frequent remaining value (mode).
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on the df['poutcome'] selection is a chained
# in-place update, which emits a FutureWarning on pandas 2.x and leaves `df`
# untouched when Copy-on-Write is enabled.
df['poutcome'] = df['poutcome'].fillna(df['poutcome'].mode()[0])

# Convert categorical variables to dummy (one-hot) variables
df = pd.get_dummies(df, columns=['job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome'])

# Define feature and target variables
X = df.drop('y', axis=1)

# Encode the target as 0/1. Update this mapping if the dataset uses other labels.
y = df['y'].map({'no': 0, 'yes': 1})

# Series.map turns any label missing from the mapping into NaN; fail here with a
# clear message instead of letting the NaNs reach model fitting.
if y.isna().any():
    raise ValueError("Target column 'y' contains labels other than 'no'/'yes'")

# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# partition reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline: logistic regression on every feature column, trained on the
# training partition (fit() returns the estimator, so the calls can be chained)
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class predictions of the baseline model for the held-out rows
y_pred_lr = lr.predict(X_test)

# Model-based feature selection: SelectFromModel ranks the columns using the
# logistic regression estimator and threshold='median' uses the median
# importance as the cut-off.
# NOTE(review): `lr` is created without a `penalty` argument, so this relies on
# the estimator's default regularization rather than an explicit L1 penalty --
# confirm which one is intended.
feature_selector = SelectFromModel(lr, threshold='median').fit(X_train, y_train)

# Positional indices of the columns the selector kept
selected_feature_indices = feature_selector.get_support(indices=True)

# Restrict both partitions to the kept columns (still DataFrames, same row order)
X_train_selected, X_test_selected = (
    part.iloc[:, selected_feature_indices] for part in (X_train, X_test)
)

# Train a fresh logistic regression on the reduced feature set and predict the
# held-out rows with it
lr_selected = LogisticRegression(max_iter=1000).fit(X_train_selected, y_train)
y_pred_lr_selected = lr_selected.predict(X_test_selected)

# Test-set accuracy of the reduced-feature model
accuracy_selected = accuracy_score(y_test, y_pred_lr_selected)



In [9]:
import pickle

# Serialize `lr` (the model fitted on all columns of X) to 'model.pkl'
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

# Serialize the column names of X, as a plain list, to 'features.pkl'
with open('features.pkl', 'wb') as features_file:
    feature_names = X.columns.tolist()
    pickle.dump(feature_names, features_file)