In [27]:
import scipy

In [28]:
from pathlib import Path
import numpy as np
from scipy.cluster.vq import whiten, kmeans, vq

In [29]:
# Read the whole corpus as text. The encoding is passed explicitly so the
# decoded characters do not depend on the platform's default locale encoding.
# NOTE(review): UTF-8 is assumed for SMSSpamCollection.txt -- confirm against the file.
raw_text = Path('SMSSpamCollection.txt').read_text(encoding='utf-8')
# Trim surrounding whitespace first so a trailing newline does not produce an
# empty final record, then keep one entry per line.
data = raw_text.strip().split("\n")

In [31]:
# Number of lines read from the file (one record per line).
len(data)

5574

In [32]:
# Pre-allocate an integer table with one row per record and two columns.
# np.empty leaves the cells uninitialised until they are assigned.
n_messages = len(data)
digit_counts = np.empty(shape=(n_messages, 2), dtype=int)

In [34]:
# Fill one row per record: column 0 is the label (0 = ham, 1 = anything else,
# i.e. spam) and column 1 is the number of digit characters in the message.
for i, line in enumerate(data):
    # Split on the first tab only, so a tab inside the message body cannot
    # break the two-way unpacking of (label, message).
    case, message = line.split("\t", 1)
    digit_counts[i, 0] = 0 if case == "ham" else 1
    digit_counts[i, 1] = sum(c.isdigit() for c in message)

In [35]:
# Tally how many records share each distinct digit count (column 1).
# With return_counts=True, np.unique gives a (sorted values, occurrences) pair.
digits_per_message = digit_counts[:, 1]
unique_counts = np.unique(digits_per_message, return_counts=True)

In [37]:
# Display the (values, counts) pair returned by np.unique.
unique_counts

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 40, 41, 47]),
 array([4110,  486,  160,   78,   42,   39,   16,   14,   28,   17,   16,
          34,   30,   31,   37,   29,   35,   33,   41,   47,   18,   31,
          28,   36,   34,   16,   16,   13,   19,    9,    2,    6,    3,
           4,    3,    4,    1,    1,    4,    2,    1], dtype=int64))

In [59]:
# Build the two-column table (digit count, number of records) directly from
# digit_counts rather than transforming unique_counts in place. The in-place
# form flips the array's shape every time the cell is re-run; this form gives
# the same result on every run.
unique_counts = np.transpose(
    np.vstack(np.unique(digit_counts[:, 1], return_counts=True))
)

In [65]:
# Rescale each column by its standard deviation so both features carry
# comparable weight in the distance computations that follow.
whitened_counts = whiten(unique_counts)


In [66]:
# k-means picks its starting centroids at random, so seed NumPy's global RNG
# first. Without a fixed seed the cluster assignment, and everything derived
# from it in later cells, can change from one run to the next.
np.random.seed(0)
# Ask for 3 centroids; the second return value (distortion) is not needed.
codebook, _ = kmeans(whitened_counts, 3)

In [67]:
# Assign every whitened observation to its nearest centroid in the codebook;
# the second return value (the distances) is discarded.
codes, _ = vq(whitened_counts, codebook)

In [68]:
# Cluster labels are arbitrary, so anchor them to the data: the first row's
# cluster is treated as "ham", the last row's cluster as "spam", and whichever
# of the three labels is left over becomes the "unknown" cluster.
ham_code, spam_code = codes[0], codes[-1]
leftover_codes = set(range(3)) ^ {ham_code, spam_code}
unknown_code = list(leftover_codes)[0]

In [69]:
# For each cluster, show the last row that belongs to it as a
# [digit count, number of records] pair.
for label, code in (
    ("definitely ham:", ham_code),
    ("definitely spam:", spam_code),
    ("unknown:", unknown_code),
):
    print(label, unique_counts[codes == code][-1])

definitely ham: [   0 4110]
definitely spam: [47  1]
unknown: [20 18]


In [70]:
# Digit-count cut-offs read off the cluster boundaries printed earlier:
# the ham cluster ends at 0 digits and the unknown cluster ends at 20.
HAM_MAX_DIGITS = 0
UNKNOWN_MAX_DIGITS = 20

digits = digit_counts[:, 1]
predicted_hams = digits == HAM_MAX_DIGITS
predicted_spams = digits > UNKNOWN_MAX_DIGITS
predicted_unknowns = (digits > HAM_MAX_DIGITS) & (digits <= UNKNOWN_MAX_DIGITS)

In [71]:
# Pull out the rows of digit_counts that fall into each predicted class.
ham_cluster, spam_cluster, unk_cluster = (
    digit_counts[mask]
    for mask in (predicted_hams, predicted_spams, predicted_unknowns)
)

In [72]:
# Column 0 holds the true label (0 = ham, 1 = spam); report how many of each
# ended up in every predicted cluster.
for label, cluster in (
    ("hams:", ham_cluster),
    ("spams:", spam_cluster),
    ("unknowns:", unk_cluster),
):
    print(label, np.unique(cluster[:, 0], return_counts=True))

hams: (array([0, 1]), array([4071,   39], dtype=int64))
spams: (array([0, 1]), array([  1, 232], dtype=int64))
unknowns: (array([0, 1]), array([755, 476], dtype=int64))


In [73]:
from scipy.optimize import minimize_scalar

def objective_function(x):
    """Evaluate the polynomial 3*x**4 - 2*x + 1 at ``x``."""
    quartic_term = 3 * x ** 4
    linear_term = 2 * x
    return quartic_term - linear_term + 1

In [75]:
# Minimise objective_function with minimize_scalar's default settings
# (no bracket or bounds supplied); the bare `res` displays the result.
res = minimize_scalar(objective_function)
res

     fun: 0.17451818777634331
    nfev: 16
     nit: 12
 success: True
       x: 0.5503212087491959

In [76]:
def objective_function(x):
    """Evaluate the polynomial x**4 - x**2 at ``x``.

    Note: this rebinds the name ``objective_function``, replacing the
    definition given earlier in the notebook.
    """
    quartic_term = x ** 4
    quadratic_term = x ** 2
    return quartic_term - quadratic_term

In [77]:
# Same default call as before, now applied to the redefined
# objective_function; the bare `res` displays the result.
res = minimize_scalar(objective_function)
res

     fun: -0.24999999999999994
    nfev: 15
     nit: 11
 success: True
       x: 0.7071067853059209

In [78]:
# Supply (-1, 0) as the bracket for the search.
# NOTE(review): the result displayed for this call (x approx. 0.707) lies
# outside (-1, 0), so the bracket did not confine the minimiser to it.
res = minimize_scalar(objective_function, bracket=(-1, 0))

In [79]:
# Display the result of the bracketed search.
res

     fun: -0.24999999999999997
    nfev: 17
     nit: 13
 success: True
       x: 0.7071067809244586

In [81]:
# Use the 'bounded' method with bounds (-1, 0) so the search stays inside that
# interval; the bare `res` displays the result.
res = minimize_scalar(objective_function, method='bounded', bounds=(-1, 0))
res

     fun: -0.24999999999998732
 message: 'Solution found.'
    nfev: 10
  status: 0
 success: True
       x: -0.707106701474177

In [96]:
def function(val):
    """Return 4 * (val - 1)**2 + 5 for the given ``val``."""
    return 4 * (val - 1) ** 2 + 5

In [97]:
# Spot-check: evaluate function at val = 5.
function(5)

69

In [98]:
# Spot-check: evaluate function at val = 4.
function(4)

41

In [99]:
# Spot-check: evaluate function at val = -1.
function(-1)

21

In [100]:
# Spot-check: evaluate function at val = -5.
function(-5)

149