In [1]:
import numpy as np
import pandas as pd

file_path1 = 'C:/temp/X1.csv'
file_path2 = 'C:/temp/y1.csv'
file_path3 = 'C:/temp/X2.csv'
file_path4 = 'C:/temp/y2.csv'

X1 = pd.read_csv(file_path1);y1 = pd.read_csv(file_path2)
X2 = pd.read_csv(file_path3);y2 = pd.read_csv(file_path4)
X_train = np.array(X1);X_test = np.array(X2)
y_train = np.array(y1).flatten();y_test = np.array(y2).flatten()

In [3]:
# numpy only data peek

def data_info(data):
    missing_values = np.isnan(data).sum()
    size_in_mb = data.nbytes / (1024 ** 2)
    if len(data.shape) > 1:
        mean = data.mean()
        sd = data.std()
    else:
        mean = np.mean(data)
        sd = np.std(data)
    return missing_values, size_in_mb, mean, sd

def data_summary(X_train, y_train, X_test, y_test):
    header = "+------------+-------------------------+------------------+--------------+---------+-----------+------------+"
    print(header)
    print("| Data       | Type                    | Shape            | MissingVals  | Size(MB)| Mean      | SD         |")
    print(header)

    for name, data in [("X_train", X_train), ("y_train", y_train), ("X_test", X_test), ("y_test", y_test)]:
        missing_values, size_in_mb, mean, sd = data_info(data)
        print("| {:<9} | {:<24} | {:<16} | {:<12} | {:<7.2f} | {:<9.2f} | {:<10.2f} |".format(
            name, str(type(data)), str(data.shape), missing_values, size_in_mb, mean, sd))
    print(header)


data_summary(X_train, y_train, X_test, y_test)

+------------+-------------------------+------------------+--------------+---------+-----------+------------+
| Data       | Type                    | Shape            | MissingVals  | Size(MB)| Mean      | SD         |
+------------+-------------------------+------------------+--------------+---------+-----------+------------+
| X_train   | <class 'numpy.ndarray'>  | (10400792, 3)    | 0            | 238.06  | -0.00     | 0.99       |
| y_train   | <class 'numpy.ndarray'>  | (10400792,)      | 0            | 79.35   | 0.66      | 0.47       |
| X_test    | <class 'numpy.ndarray'>  | (2764768, 3)     | 0            | 63.28   | 0.01      | 1.04       |
| y_test    | <class 'numpy.ndarray'>  | (2764768,)       | 0            | 21.09   | 0.64      | 0.48       |
+------------+-------------------------+------------------+--------------+---------+-----------+------------+


In [22]:
# Metric Stuff

def confusion_matrix(y_true, y_pred):
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    return TP, TN, FP, FN

def accuracy(TP, TN, FP, FN):
    return (TP + TN) / (TP + TN + FP + FN)

def specificity(TN, FP):
    return TN / (TN + FP)

def sensitivity(TP, FN):
    return TP / (TP + FN)

def f1_score(TP, FP, FN):
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return 2 * (precision * recall) / (precision + recall)

def print_confusion_matrix(TP, TN, FP, FN):
    head_cm = "+-----------------+------------+------+"
    print(head_cm)
    print("|                 | Predicted        |")
    print("| Actual          |      0    |   1  |")
    print(head_cm)
    print("| 0               | {:<5} | {:<5}   |".format(TN, FP))
    print("| 1               | {:<5} | {:<5}  |".format(FN, TP))
    print(head_cm)
    print()

def metrics(y_true, y_pred):
    TP, TN, FP, FN = confusion_matrix(y_true, y_pred)
    acc = accuracy(TP, TN, FP, FN)
    spec = specificity(TN, FP)
    sens = sensitivity(TP, FN)
    f1 = f1_score(TP, FP, FN)

    print_confusion_matrix(TP, TN, FP, FN)
    head = "+--------------+------------+"
    print(head)
    print("| Metric       | Value      |")
    print(head)
    print("| Accuracy     | {:10.4f} |".format(acc))
    print("| Specificity  | {:10.4f} |".format(spec))
    print("| Sensitivity  | {:10.4f} |".format(sens))
    print("| F1 Score     | {:10.4f} |".format(f1))
    print(head)
   

In [4]:
# Binary target: logisitic regression which is acutally classificaiton.

class LogisticRegression:

    def __init__(self, lr=0.01, num_iter=1000):
        self.lr = lr
        self.num_iter = num_iter

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        # initialization of weights
        self.theta = np.zeros(X.shape[1])
        self.bias = 0

        for i in range(self.num_iter):
            model = np.dot(X, self.theta) + self.bias
            predictions = self.sigmoid(model)

            # compute gradient
            d_theta = (1 / len(X)) * np.dot(X.T, (predictions - y))
            d_bias = (1 / len(X)) * np.sum(predictions - y)

            # update my weights
            self.theta -= self.lr * d_theta
            self.bias -= self.lr * d_bias

    def predict_prob(self, X):
        return self.sigmoid(np.dot(X, self.theta) + self.bias)

    def predict(self, X, threshold=0.5):
        return (self.predict_prob(X) >= threshold).astype(int)

In [24]:
# logistic Execution & metric call
model_logreg = LogisticRegression(lr=0.01, num_iter=1000)
model_logreg.fit(X_train, y_train)
y_pred_logreg = model_logreg.predict(X_test)
metrics(y_test, y_pred_logreg)

+-----------------+------------+------+
|                 | Predicted        |
| Actual          |      0    |   1  |
+-----------------+------------+------+
| 0               | 752363 | 229976   |
| 1               | 301596 | 1480833  |
+-----------------+------------+------+

+--------------+------------+
| Metric       | Value      |
+--------------+------------+
| Accuracy     |     0.8077 |
| Specificity  |     0.7659 |
| Sensitivity  |     0.8308 |
| F1 Score     |     0.8478 |
+--------------+------------+


In [28]:
def compute_cost(X, y, theta):
    m = len(y)
    cost = (1/2*m) * np.sum(np.square(X.dot(theta) - y))
    return cost

def gradient_descent(X, y, theta, learning_rate, iterations):
    m = len(y)
    cost_history = np.zeros(iterations)
    
    for i in range(iterations):
        theta = theta - (1/m) * learning_rate * (X.T.dot(X.dot(theta) - y))
        cost_history[i] = compute_cost(X, y, theta)
    
    return theta, cost_history

# initial coeffx w/intercept)
theta = np.zeros(X_train.shape[1])

# abritrary gradient-descent settings
iters = 200
lr = 0.01

# Run gradient-descent
theta, cost_history = gradient_descent(X_train, y_train, theta, lr, iters)

pred = X_test.dot(theta)
mae = np.mean(np.abs(pred - y_test))
mse = np.mean((pred - y_test)**2)
rmse = np.sqrt(np.mean((pred - y_test)**2))
ss_res = np.sum((pred - y_test)**2)
ss_tot = np.sum((y_test - np.mean(y_test))**2)
r2 = 1 - (ss_res / ss_tot)

print("mean squared error: ",mse*100)

mean squared error:  58.75265901716664
