# SUPER CHEAT NOTEBOOK

In [460]:
# Helper functions for the GigaCheat tool
def print_seperator():
    print('-'*100)

In [461]:
import pprint
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, log_loss

pp = pprint.PrettyPrinter(indent=4).pprint

## Logistic Regression

In [462]:
def sigmoid(_z):
    return 1 / (1 + np.exp(-_z))

def z(_x, _w, _b):
    return np.dot(_x, _w) + _b

def cross_entropy_loss (predicted_value, actual_value):
    return -actual_value * math.log(predicted_value) - (1 - actual_value) * np.log(1 - predicted_value)

def cross_entropy_loss_vector (_y_true_vector, _y_pred_vector):
    log_loss(_y_true_vector, _y_pred_vector, labels=[1,0])


def decision_function(val, _decision_boundary=0.5):
    return 1 if val >= _decision_boundary else 0

def calc_y_pred(_y_pred_prob, _decision_boundary=0.5):
    _y_pred = []
    for val in _y_pred_prob:
        _y_pred.append(decision_function(val))
    return _y_pred

def calc_pred_prob(_y_true, _X, _w, _b):
    _y_pred_prob = []
    for i in range(len(_X)):
        _y_pred_prob.append(sigmoid(z(_X[i], _w, _b)))
    return _y_pred_prob

def gradient_descent(_X, _y, _w, _b, _learning_rate=0.01, _num_iterations=100):
    for i in range(0, _num_iterations):
        _w_gradient = np.zeros(len(_w))
        _b_gradient = 0
        for j in range(len(_X)):
            _x = _X[j]
            _z = z(_x, _w, _b)
            for k in range(len(_w)):
                _w_gradient[k] = (_z - _y[j]) * _x[k]
            _b_gradient = _z - _y[j]

            _w = _w - _learning_rate * _w_gradient
            _b = _b - _learning_rate * _b_gradient  
    return _w, _b



In [464]:
# Input data
labels = [0,1]
y_true = [1,0,1, 1, 0, 0, 1,0,1, 1, 0, 0] # the actual labels

y_pred_prob = [] # = sigmoid of z-score of input vector x of X
y_pred = [] # = 1 if y_pred_prob >= decision_boundary, 0 otherwise
X = [[0.6, 0.9], [0.1, 0.2], [0.5, 0.9], [0.8, 0.5], [0.2, 0.1], [0.4, 0.2], [0.6, 0.9], [0.1, 0.2], [0.5, 0.9], [0.8, 0.5], [0.2, 0.1], [0.4, 0.2]] # observed input vectors
training_iterations = 10000 # number of iterations for gradient descent
w, b = gradient_descent(X, y_true, [.5,.5], 1, 0.01, training_iterations) # w and b are the weights and bias of the model

decision_boundary = 0.5 # decision boundary for the model

y_pred_prob = calc_pred_prob(y_true, X, w, b)
y_pred = calc_y_pred(y_pred_prob, decision_boundary)


print("labels =", labels, "\ny_true:", y_true)
print_seperator()
print("X =", X, "\nw =", w, "\nb =", b, "\ndecision_boundary", decision_boundary)
print_seperator()
print("y_pred_prob =", y_pred_prob, "\ny_pred =", y_pred)


labels = [0, 1] 
y_true: [1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0]
----------------------------------------------------------------------------------------------------
X = [[0.6, 0.9], [0.1, 0.2], [0.5, 0.9], [0.8, 0.5], [0.2, 0.1], [0.4, 0.2], [0.6, 0.9], [0.1, 0.2], [0.5, 0.9], [0.8, 0.5], [0.2, 0.1], [0.4, 0.2]] 
w = [0.9612927  0.94306953] 
b = -0.35715637887815543 
decision_boundary 0.5
----------------------------------------------------------------------------------------------------
y_pred_prob = [0.7442890600767808, 0.4819046056457873, 0.725568250154638, 0.7075288845602947, 0.48235960306812675, 0.5537847479248225, 0.7442890600767808, 0.4819046056457873, 0.725568250154638, 0.7075288845602947, 0.48235960306812675, 0.5537847479248225] 
y_pred = [1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]


### Playground


#### Performance

In [465]:
# y_true = [1, 1, 0]
# y_pred = [1, 1, 1]

def print_confusion_matrix(y_true, y_pred):
    confusion_matrix_micro = confusion_matrix(y_true, y_pred)
    label_texts_index = ['True 0', 'True 1']
    label_texts_value = ['Pred 0', 'Pred 1']
    cmtx = pd.DataFrame(
        confusion_matrix(y_true, y_pred), 
        label_texts_index, 
        label_texts_value
    )
    print(cmtx)

def print_tp_fp_tn_fn(y_true, y_pred):
    tp, fp, fn, tn = confusion_matrix_micro.ravel()
    print("True positives:", tp, "True negatives:", tn, "False positives:", fp, "False negatives:", fn)

def print_metrics(y_true, y_pred):
    print_seperator()
    precision, recall, f1, accuracy = precision_score(y_true, y_pred), recall_score(y_true, y_pred), f1_score(y_true, y_pred), accuracy_score(y_true, y_pred)
    print("Precision:", precision, "Recall:", recall, "F1-Balanced:", f1, "Accuracy:", accuracy)
print("y_true:", y_true, "\ny_pred:", y_pred)
print_seperator()
print_confusion_matrix(y_true, y_pred)
print_tp_fp_tn_fn(y_true, y_pred)
print_metrics(y_true, y_pred)


y_true: [1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0] 
y_pred: [1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
----------------------------------------------------------------------------------------------------
        Pred 0  Pred 1
True 0       4       2
True 1       0       6
True positives: 1 True negatives: 3 False positives: 0 False negatives: 2
----------------------------------------------------------------------------------------------------
Precision: 0.75 Recall: 1.0 F1-Balanced: 0.8571428571428571 Accuracy: 0.8333333333333334


## Embeddings

In [466]:
import math
from sklearn.metrics import mutual_info_score

### Creating Vector Values

In [467]:
def pmi (_w_count, _c_count, _wc_count, _total):
    p_w = _w_count / _total
    p_c = _c_count / _total
    p_wc = _wc_count / _total
    return math.log2(p_wc / (p_w * p_c))

def ppmi (pmi): 
    if (pmi < 0):
        return 0
    else:
        return pmi

def tF (_num_t_in_doc):
    if (_num_t_in_doc < 0):
        return 0
    else: 
        return math.log10(1 + _num_t_in_doc)

def idF (_num_doc_with_t, _total_docs):
    return math.log(_total_docs/_num_doc_with_t)

def tFidF(_tF, _idF):
    return _tF * _idF

def centroid_document_vector(d):
    return np.mean(d, axis=0)


In [469]:
#pmi: compare two words by their co-occurence
x = [0,0,5,1]
y = [0,1,0,1]

w_count = 3 # count of word w
c_count = 4 # count of words in context c
wc_count = 2 # count of word w in context c
total = 19 # total number of words in corpus

# tf-idf: relevancy score of a word w in a document d
num_t_in_doc = 3 # number of times word w appears in document d
num_doc_with_t = 2 # number of documents in corpus with word w
total_docs = 5 # total number of documents in corpus


print("> PMI, PPMI")
print("x =", x, "\ny =", y)
print_seperator()
print("w_count =", w_count, "\nc_count =", c_count, "\nwc_count =", wc_count, "\ntotal =", total)
print("Mutual information score:", mutual_info_score(x, y))
print("PMI:", pmi(w_count, c_count, wc_count, total), "PPMI:", ppmi(pmi(w_count, c_count, wc_count, total)))
print_seperator()
print("> TF-IDF")
print("num_t_in_doc =", num_t_in_doc, "\nnum_doc_with_t =", num_doc_with_t, "\ntotal_docs =", total_docs)
print_seperator()
print("tF:", tF(num_t_in_doc), "idF:", idF(_num_doc_with_t, _total_docs), "tFidF:", tFidF(tF(num_t_in_doc), idF(num_doc_with_t, total_docs)))

> PMI, PPMI
x = [0, 0, 5, 1] 
y = [0, 1, 0, 1]
----------------------------------------------------------------------------------------------------
w_count = 3 
c_count = 4 
wc_count = 2 
total = 19
Mutual information score: 0.3465735902799726
PMI: 1.6629650127224294 PPMI: 1.6629650127224294
----------------------------------------------------------------------------------------------------
> TF-IDF
num_t_in_doc = 3 
num_doc_with_t = 2 
total_docs = 5
----------------------------------------------------------------------------------------------------
tF: 0.6020599913279624 idF: 0.9162907318741551 tFidF: 0.5516619900860461


### Comparing Vectors

In [470]:
def vector_length(v):
    return np.sqrt(np.dot(v, v))

def dot_product(v1, v2):
    return np.dot(v1, v2)

def cosine_similarity(v1, v2):
    return dot_product(v1, v2) / (vector_length(v1) * vector_length(v2))

In [471]:
v1 = [1,2,3]
v2 = [4,5,6]

dp = dot_product(v1, v2)
cs = cosine_similarity(v1, v2)

print("v1 =", v1,"length =", vector_length(v1), "\nv2 =", v2, "length =", vector_length(v1))
print_seperator()
print("dot_product =", dp, "\ncosine_similarity =", cs)

v1 = [1, 2, 3] length = 3.7416573867739413 
v2 = [4, 5, 6] length = 3.7416573867739413
----------------------------------------------------------------------------------------------------
dot_product = 32 
cosine_similarity = 0.9746318461970762
