In [2]:
# Path of the comma-separated input file that is opened and parsed further down.
file_path = "data_for_premium_paid.txt"
import csv
from tools import *


# Parsed rows, one list of three floats per CSV record; filled by the reader loop below.
data = []


def prccess_data(row):
    """Convert the first three fields of a CSV record to floats.

    Any fields beyond the third are ignored. A record with fewer than three
    fields raises IndexError, and a non-numeric field raises ValueError.
    """
    return [float(row[column]) for column in range(3)]


# Read every record of the input file into `data`.
# newline="" lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
with open(file_path, "r", newline="") as f:
    reader = csv.reader(f, delimiter=",")
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line (for example a
            # trailing newline at the end of the file); there is nothing to parse.
            continue
        data.append(prccess_data(row))
        
# Logistic function
def logistic(x: float) -> float:
    """Return the logistic (sigmoid) function 1 / (1 + e**-x).

    The computation is split by the sign of ``x`` so that ``math.exp`` is only
    ever called with a non-positive argument. The naive formula raises
    OverflowError for large negative ``x`` (roughly x < -709); this form
    returns a value that underflows smoothly towards 0.0 instead.
    """
    if x >= 0:
        # exp(-x) <= 1 here, so it cannot overflow.
        return 1.0 / (1 + math.exp(-x))
    # For negative x use the algebraically equal form e**x / (1 + e**x);
    # exp(x) < 1 here, so it cannot overflow either.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)


def logistic_prime(x: float) -> float:
    """Return the derivative of the logistic function at ``x``.

    Uses the identity logistic'(x) = logistic(x) * (1 - logistic(x)).
    """
    value = logistic(x)
    complement = 1 - value
    return value * complement


def _negative_log_likelihood(x: Vector, y: float, beta: Vector) -> float:
    """Return the negative log likelihood of one data point under ``beta``.

    With z = dot(x, beta):
      * y == 1   ->  -log(logistic(z))      = log(1 + e**-z)
      * otherwise -> -log(1 - logistic(z))  = log(1 + e**z)

    Both cases are log(1 + e**t) (the "softplus" of t) with t = -z or t = z.
    Evaluating that directly avoids taking ``math.log`` of a probability that
    has been rounded to exactly 0.0 (for example 1 - logistic(40) == 0.0),
    which would raise ValueError, and also avoids overflow in ``math.exp``.
    """
    z = dot(x, beta)
    t = -z if y == 1 else z
    # Stable softplus: log(1 + e**t) == max(t, 0) + log1p(e**-|t|).
    # The exponent is never positive, so exp cannot overflow, and the argument
    # of log1p is in (0, 1], so the logarithm is always defined.
    return max(t, 0.0) + math.log1p(math.exp(-abs(t)))


def negative_log_likelihood(xs: List[Vector], ys: List[float], beta: Vector) -> float:
    """Return the total negative log likelihood of the data set under ``beta``.

    ``xs`` and ``ys`` are paired positionally with ``zip``, so any surplus
    elements of the longer sequence are ignored.
    """
    per_point_losses = (
        _negative_log_likelihood(x_i, y_i, beta)
        for x_i, y_i in zip(xs, ys)
    )
    return sum(per_point_losses)


def _negative_log_partial_j(x: Vector, y: float, beta: Vector, j: int) -> float:
    """Return the j-th partial derivative of the negative log likelihood
    of a single data point ``(x, y)`` with respect to ``beta``.
    """
    # Difference between the observed label and the predicted probability.
    residual = y - logistic(dot(x, beta))
    return -residual * x[j]


def _negative_log_gradient(x: Vector, y: float, beta: Vector) -> Vector:
    """Return the gradient of the negative log likelihood of one data point.

    Component j equals ``-(y - logistic(dot(x, beta))) * x[j]``. The factor in
    front of ``x[j]`` does not depend on j, so it is evaluated once rather than
    recomputing the dot product and the logistic for every component.
    """
    num_components = len(beta)
    if num_components == 0:
        # Nothing to differentiate; do not evaluate dot() for an empty beta.
        return []
    negated_residual = -(y - logistic(dot(x, beta)))
    return [negated_residual * x[j] for j in range(num_components)]


def vector_sum(vectors: List[Vector]) -> Vector:
    """Return the component-wise sum of ``vectors``.

    The length of the result is taken from the first vector; an empty
    ``vectors`` list therefore raises IndexError.
    """
    width = len(vectors[0])
    totals = []
    for position in range(width):
        # Add up the entries found at this position across all vectors.
        totals.append(sum(entry[position] for entry in vectors))
    return totals


def negative_log_gradient(xs: List[Vector], ys: List[float], beta: Vector) -> Vector:
    """Return the gradient of the total negative log likelihood.

    The per-point gradients are computed for each ``(x, y)`` pair and then
    added together component by component.
    """
    per_point_gradients = []
    for x_i, y_i in zip(xs, ys):
        per_point_gradients.append(_negative_log_gradient(x_i, y_i, beta))
    return vector_sum(per_point_gradients)


# Design matrix: a constant 1.0 followed by the first two columns of each row,
# i.e. [1, experience, salary].
xs = [[1.0, *row[:2]] for row in data]
# Target: the third column of each row (paid premium).
ys = [row[2] for row in data]


if __name__ == "__main__":
    # Step size of the disabled least-squares experiment kept below for reference.
    learning_rate = 1e-6
    # Rescaled version of the design matrix, produced by rescale() from tools.
    rescaled_xs = rescale(xs)

    # Disabled least-squares experiment (not executed):
    #
    # print("start:")
    # beta = least_squares_fit(rescaled_xs, ys,learning_rate,num_steps=10000,batch_size=20)
    # print("beta:", beta)
    # print(my_error_function(rescaled_xs, ys, beta))
    # predictions=[predict(x,beta) for x in rescaled_xs]
    # plt.scatter(predictions,ys)
    # plt.xlabel("predicted")
    # plt.ylabel("actual")
    # plt.show()






In [3]:
# Applying the model
# Fit the logistic-regression coefficients by repeatedly stepping beta against
# the gradient of the negative log likelihood on the training split.
# The statement order matters: the seed, the split and the initial beta all
# draw from (or may draw from) the same `random` state.
random.seed(0)
# NOTE(review): 0.33 is presumably the fraction held out for testing — confirm
# against train_test_split in tools.
x_train, x_test, y_train, y_test = train_test_split(rescaled_xs, ys, 0.33)
learning_rate = 0.01
# One random starting coefficient per column of the design matrix (3 columns).
beta = [random.random() for _ in range(3)]
with tqdm.trange(5000)as t:
    for epoch in t:
        gradient = negative_log_gradient(x_train, y_train, beta)
        # Negative step size: presumably gradient_step moves beta by
        # step_size * gradient, so this walks downhill — confirm in tools.
        beta = gradient_step(beta, gradient, -learning_rate)
        # Loss is recomputed each epoch only to show it in the progress bar.
        loss = negative_log_likelihood(x_train, y_train, beta)
        t.set_description(f"loss: {loss:.3f} beta: {beta}")
print("beta:", beta)

loss: 39.965 beta: [-2.0238042563970926, 4.680395586521536, -4.457913604878627]: 100%|██████████| 5000/5000 [00:08<00:00, 613.74it/s]  

beta: [-2.0238042563970926, 4.680395586521536, -4.457913604878627]





In [4]:
# Express the fitted coefficients in terms of the original (un-rescaled) columns.
means, stdevs = scale(xs)
# Intercept: remove the contribution of each column's mean, measured in units
# of that column's standard deviation.
beta_unscaled = [beta[0] - beta[1] * means[1] / stdevs[1] - beta[2] * means[2] / stdevs[2]]
# Slopes: divide each coefficient by the standard deviation of its column.
beta_unscaled += [beta[j] / stdevs[j] for j in (1, 2)]
beta_unscaled

[8.925163958380162, 1.648026614972372, -0.00028764325691767867]

In [5]:
# Confusion-matrix counts on the held-out split, using 0.5 as the decision threshold.
true_positives = 0
false_positives = 0
true_negatives = 0
false_negatives = 0

for x_i, y_i in zip(x_test, y_test):
    prediction = logistic(dot(beta, x_i))
    if y_i == 1:
        # Actual positive: either caught or missed.
        if prediction >= 0.5:
            true_positives += 1
        elif prediction < 0.5:
            false_negatives += 1
    elif y_i == 0:
        # Actual negative: either correctly rejected or wrongly flagged.
        if prediction < 0.5:
            true_negatives += 1
        elif prediction >= 0.5:
            false_positives += 1

precision = get_precision(
    true_positives, false_positives, false_negatives, true_negatives
)
recall = get_recall(
    true_positives, false_positives, false_negatives, true_negatives
)
precision, recall

(0.75, 0.8)

In [7]:
# Predicted probability for every held-out point, plotted against its actual label.
predictions = []
for x_i in x_test:
    predictions.append(logistic(dot(beta, x_i)))
plt.scatter(predictions, y_test)

[0.06985848081186027,
 0.06566095152214751,
 0.24987785176913332,
 0.983290312308836,
 0.6063679439951916,
 0.013555757883615578,
 0.29253149125130146,
 0.005267463969470931,
 0.03270207298512329,
 0.7537639359449452,
 0.833727219413262,
 0.775827921480921,
 0.007085804799438884,
 0.0006203872830060474,
 0.04338138356178208,
 0.001389066184521954,
 0.02568701850813825,
 0.0005614863913632178,
 0.1138849866006537,
 0.023555902894181635,
 0.1511941614449637,
 0.08933422458913565,
 0.09333701374386985,
 0.0001556717533126062,
 0.07305863097720988,
 0.010740802463219689,
 0.45588141187831094,
 0.030834999261163197,
 0.07127627752870472,
 0.12408376724252357,
 0.5979428044976739,
 0.7658919378730709,
 1.8993848585776147e-05,
 0.13843339682657926,
 0.5583140497654323,
 0.15689875708974138,
 0.9642550019683719,
 0.12330856830690173,
 0.002501907550473461,
 0.11776218956481337,
 0.003919590296365597,
 0.8313576208368463,
 0.0028311618077155425,
 0.009537495238909477,
 0.20846089476170027,
 0.0