<a href="https://colab.research.google.com/github/DKhanh/MLBasicModels/blob/master/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

DATASET_SIZE = 10000
FEATURE_SIZE = 5

TRAINING_SIZE = int(DATASET_SIZE*0.6)
CV_SIZE = int(DATASET_SIZE*0.2)
TEST_SIZE = DATASET_SIZE - TRAINING_SIZE - CV_SIZE

theta = np.random.uniform(-1, 1, size=(FEATURE_SIZE, 1))
learning_rate = 0.005
number_of_epoch = 500000
cost_threshold = 0.01

## **LOGISTIC REGRESSION IMPLEMENTATION**

1.   Input equation that ML model will try to learn



In [17]:
def inputEquation(x):
  linearPart = 18*(x[0]**4)- 5*(x[1]*x[0]) + 180895*(x[2]) + (x[3]*x[1]*x[2]**2)+ 2020*(x[4])
  y = 1/(1+np.exp((-1)*linearPart))
  return y

2. Generate dataset


In [18]:
x0 = np.random.uniform(-1, 4, (DATASET_SIZE, 1))
x1 = np.random.uniform(-4, 1, (DATASET_SIZE, 1))
x2 = np.random.uniform(-3, 2, (DATASET_SIZE, 1))
x3 = np.random.uniform(-1, 0, (DATASET_SIZE, 1))
x4 = np.random.uniform(0, 1, (DATASET_SIZE, 1))

x = np.concatenate((x0, x1, x2, x3, x4), axis=1)
y = [inputEquation(row) for row in x]
y = np.array(y).reshape(DATASET_SIZE, 1)
# print(x.shape)
# print(y.shape)

  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
x_train = x[0:TRAINING_SIZE]
y_train = y[0:TRAINING_SIZE]

x_cv = x[TRAINING_SIZE:TRAINING_SIZE+CV_SIZE]
y_cv = y[TRAINING_SIZE:TRAINING_SIZE+CV_SIZE]

x_test = x[TRAINING_SIZE+CV_SIZE:]
y_test = y[TRAINING_SIZE+CV_SIZE:]
# print(x_test.shape[0])

3. Hypothesis model

In [21]:
def logisticRegressionHypothesis(input_dataset, theta):
  linearPart = np.dot(input_dataset, theta).reshape(input_dataset.shape[0], 1)
  predicted_output = 1/(1 + np.exp((-1)*linearPart))
  return predicted_output

# logisticRegressionHypothesis(x_test, theta)

array([[0.77727793],
       [0.27354083],
       [0.63627068],
       ...,
       [0.31282325],
       [0.01053806],
       [0.22226149]])

In [32]:
def costCalculation(input_dataset, output_dataset, theta):
  predicted_output = logisticRegressionHypothesis(input_dataset, theta)
  cost_y_true = output_dataset*np.log(predicted_output)
  cost_y_false = (1 - output_dataset)*np.log(1-predicted_output)

  cost = np.sum(cost_y_true + cost_y_false)*(-1)/(input_dataset.shape[0])
  return cost

costCalculation(x_train, y_train, theta)

1.215292001812945

In [33]:
def logisticOptimization(input_dataset, output_dataset, theta):
  predicted_output = logisticRegressionHypothesis(input_dataset, theta)
  diff = predicted_output - output_dataset
 
  # derivative = np.sum((diff*input_dataset)/input_dataset.shape[0]), axis=0).reshape(theta.shape[0], 1)
  derivative = np.dot(np.transpose(input_dataset), diff/input_dataset.shape[0])
  theta = theta - learning_rate*derivative

  return np.array(theta).reshape(len(theta),1)

logisticOptimization(x_train, y_train, theta)

array([[-0.85614655],
       [ 0.49200125],
       [-0.23700847],
       [-0.7549485 ],
       [-0.83005222]])

In [39]:
for epoch in range(30):
  cost = costCalculation(x_cv, y_cv, theta)
  theta = logisticOptimization(x_train, y_train, theta)
  
  print(cost)
  if (cost < cost_threshold):
    print(cost)
    break
  
print(theta)

0.025696202510854534
0.02569618428909816
0.025696166067392695
0.025696147845738124
0.025696129624134458
0.0256961114025817
0.025696093181079845
0.025696074959628893
0.02569605673822884
0.025696038516879693
0.025696020295581448
0.025696002074334107
0.025695983853137663
0.02569596563199213
0.02569594741089749
0.025695929189853757
0.025695910968860924
0.025695892747918996
0.02569587452702796
0.02569585630618783
0.0256958380853986
0.025695819864660272
0.025695801643972827
0.025695783423336303
0.025695765202750673
0.025695746982215932
0.0256957287617321
0.025695710541299162
0.025695692320917125
0.025695674100585982
[[ 0.17907519]
 [ 0.09242666]
 [13.41497474]
 [-0.11926347]
 [-0.02770275]]


In [40]:
costCalculation(x_test, y_test, theta)

0.02429807854383003