In [83]:
#import important libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [84]:
#read dataset csv file.
df = pd.read_csv("bank.csv", delimiter=";", header='infer')

In [85]:
# find the features and observation in the data set.
df.shape

(4521, 17)

In [86]:
# find the datatypes of the features and target variable. 
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [87]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [88]:
bank_data_new = pd.get_dummies(df, columns=['job', 'marital', 'education', 'default', 'housing', 'loan',
                                                   'contact', 'month', 'poutcome'])

bank_data_new.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79,1,-1,0,no,0,0,...,0,0,0,0,1,0,0,0,0,1
1,33,4789,11,220,1,339,4,no,0,0,...,0,0,1,0,0,0,1,0,0,0
2,35,1350,16,185,1,330,1,no,0,0,...,0,0,0,0,0,0,1,0,0,0
3,30,1476,3,199,4,-1,0,no,0,0,...,1,0,0,0,0,0,0,0,0,1
4,59,0,5,226,1,-1,0,no,0,1,...,0,0,1,0,0,0,0,0,0,1


In [89]:
bank_data_new.y.replace(('yes', 'no'), (1, 0), inplace=True)


In [90]:
data_y = pd.DataFrame(bank_data_new['y'])


In [91]:
data_X = bank_data_new.drop(['y'], axis=1)

In [92]:
#Using Python Code
class LinearTrainer:

    def __init__(self):

        # Learning Rate
        self.l_rate = 0.0001

        # Total iterations
        self.iterations = 20000

    def trains(self, x_data_train, y_data_train, theta_vector):

        x_data_train = np.column_stack((np.ones((x_data_train.shape[0], 1), dtype=float), x_data_train))

        for i in range(self.iterations):
            z = np.dot(x_data_train, theta_vector)
            sigmoid = (1 / (1 + np.exp(-z)))
            a = sigmoid - y_data_train
            temp = np.dot( x_data_train.T, a)
            temp = np.dot(temp, self.l_rate) / len(x_data_train)
            theta_vector = theta_vector - temp
        return theta_vector

    def classify(self, x_data_test, theta_vector):

        y_prediction = np.zeros((x_data_test.shape[0], 1), dtype=float)
        x_data_test = np.column_stack((np.ones((x_data_test.shape[0], 1)), x_data_test))
        z = np.dot(x_data_test, theta_vector)
        sigmoid = np.array(1 / (1 + np.exp(-z)))
        for i in (range(0, len(sigmoid))):
            if round(sigmoid[i][0], 2) <= 0.5:
                y_prediction[i][0] = 0
            else:
                y_prediction[i][0] = 1
        return y_prediction

    def accuracy(self, y_data_test, y_pred_test):
        total_error = 0
        error = []
        for i in range(0, len(y_data_test)):
            total_error = total_error + abs((y_pred_test[i] - y_data_test[i]) / y_data_test[i])
        total_error = (total_error / len(y_data_test))
        accuracy = 1 - total_error
        return accuracy * 100

    def plotgraph(self, x_data_test, y_data_test, y_pred, x_data_train, y_data_train):

        plt.scatter(x_data_train, y_data_train, color='g', label='Train Data Set')
        plt.plot(x_data_test, y_pred, color='r', label='Predicted Values')
        plt.legend()
        plt.show()
        plt.scatter(x_data_test, y_data_test, color='g', label='Train Data Set')
        plt.plot(x_data_test, y_pred, color='r', label='Predicted Values')
        plt.legend()
        plt.show()

   
def main():
    l_t = LinearTrainer()
    x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(data_X, data_y
                                                                            , test_size=0.3, random_state=0)

    theta_vector = np.zeros(((x_data_train.shape[1] + 1), 1), dtype='f')

    parameters = l_t.trains(x_data_train, y_data_train, theta_vector)
    y_prediction_test = l_t.classify(x_data_test, parameters)
    y_prediction_train = l_t.classify(x_data_train, parameters)
    
    train_acc = round(float(100 - np.mean(np.abs(y_prediction_train - y_data_train)) * 100))
    test_acc = round(float(100 - np.mean(np.abs(y_prediction_test - y_data_test)) * 100))

    print("Train accuracy:", train_acc)
    print("Test accuracy:", test_acc)


if __name__ == '__main__':
    main()



Train accuracy: 84
Test accuracy: 83


In [93]:
# Using sklearn linear regression model
model = LogisticRegression()
reg = model.fit(x_data_train,y_data_train)
reg.score(x_data_test, y_data_test)
y_pred =  reg.predict(x_data_test)
print("Train accuracy", accuracy_score(y_data_train, y_data_train) * 100)
print("Test accuracy", accuracy_score(y_data_test, y_pred) * 100)

Train accuracy 100.0
Test accuracy 89.53574060427414
