In [1]:
from pathlib import Path
import numpy as np
from scipy.cluster.vq import whiten, kmeans, vq

In [2]:
# Read the raw SMS corpus and split it into one record per line.
# The encoding is passed explicitly: without it read_text() falls back to the
# platform's locale encoding, so the decoded characters (and therefore any
# character-based counts made later) could differ between machines.
# NOTE(review): assumes the SMSSpamCollection file is UTF-8 encoded - confirm.
# A single chained assignment also means `data` is only ever bound to the
# final list of lines, never to the intermediate string.
data = (
    Path("SMSSpamCollection")
    .read_text(encoding="utf-8")
    .strip()
    .split("\n")
)

In [3]:
# Pre-allocate one row per message with two integer columns.
# np.empty leaves the contents uninitialised; every cell is expected to be
# overwritten by the loop that fills the array afterwards.
digit_counts = np.empty(shape=(len(data), 2), dtype=int)

In [4]:
# Walk through every record, count the digit characters in the message text
# and record the label: 0 for non-spam ("ham"), 1 for anything else (spam).
for i, line in enumerate(data):
    # Split on the first tab only, so a tab inside the message text cannot
    # break the two-way unpacking below.
    case, message = line.split("\t", 1)
    num_digits = sum(c.isdigit() for c in message)
    # digit_counts[i, j]: i selects the message row, j selects the column
    # (0 = label, 1 = number of digits).
    digit_counts[i, 0] = 0 if case == "ham" else 1
    digit_counts[i, 1] = num_digits

In [62]:
# Tally how many messages share each distinct digit count
# (comparable to a pandas groupby followed by a size aggregation).
unique_counts = np.unique(ar=digit_counts[:, 1], return_counts=True)
# With return_counts=True the result is a (values, counts) pair, so the [:5]
# slice below keeps both arrays whole rather than trimming them to five items.
unique_counts[:5]

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 40, 41, 47]),
 array([4110,  486,  160,   78,   42,   39,   16,   14,   28,   17,   16,
          34,   30,   31,   37,   29,   35,   33,   41,   47,   18,   31,
          28,   36,   34,   16,   16,   13,   19,    9,    2,    6,    3,
           4,    3,    4,    1,    1,    4,    2,    1], dtype=int64))

In [63]:
# Arrange the tallies as a DataFrame-like two-column table:
#   column 0: number of digits in a message
#   column 1: number of messages containing that many digits
# The table is rebuilt from digit_counts instead of from the previous value of
# unique_counts, so re-running this cell always produces the same (n, 2)
# array; feeding an already-reshaped unique_counts back through
# vstack/transpose would silently flip it to shape (2, n).
unique_counts = np.column_stack(
    np.unique(digit_counts[:, 1], return_counts=True)
)
unique_counts[:5]

array([[   0, 4110],
       [   1,  486],
       [   2,  160],
       [   3,   78],
       [   4,   42]], dtype=int64)

In [22]:
# Scale every column to unit variance so both features carry equal weight in
# the distance computation, then fit three centroids.
# kmeans starts from randomly chosen centroids; seeding NumPy's global random
# generator first makes the fit repeatable across "Restart & Run All".
# NOTE(review): the row order of the codebook may differ from previously
# recorded output; later cells take the cluster labels from codes[0] and
# codes[-1], so they do not depend on that order.
np.random.seed(0)
whitened_counts = whiten(unique_counts)
codebook, _ = kmeans(whitened_counts, 3)

In [66]:
# Preview the first ten rows of the variance-scaled table.
whitened_counts[:10]

array([[0.        , 6.49364346],
       [0.08117555, 0.76786149],
       [0.16235109, 0.25279391],
       [0.24352664, 0.12323703],
       [0.32470219, 0.0663584 ],
       [0.40587773, 0.06161851],
       [0.48705328, 0.02527939],
       [0.56822883, 0.02211947],
       [0.64940437, 0.04423893],
       [0.73057992, 0.02685935]])

In [31]:
# Inspect the centroids fitted by kmeans (at most the first five rows).
codebook[:5]

array([[0.85234324, 0.09724666],
       [0.        , 6.49364346],
       [2.52050073, 0.01840656]])

In [32]:
# Label every whitened row with the index of its closest centroid; the second
# return value (the distance to that centroid) is not needed here.
codes, _ = vq(obs=whitened_counts, code_book=codebook)

In [33]:
# The rows behind `codes` are sorted by digit count, so the first row (fewest
# digits) anchors the ham cluster and the last row (most digits) the spam one.
ham_code = codes[0]
spam_code = codes[-1]
# Whatever cluster is left over is the "unknown" one. The candidate labels come
# from the actual number of centroids, and the leftover must be exactly one
# label: if ham and spam landed in the same cluster, or kmeans returned a
# different number of centroids, fail loudly instead of picking an arbitrary
# (or non-existent) cluster.
leftover_codes = set(range(len(codebook))) - {ham_code, spam_code}
if len(leftover_codes) != 1:
    raise ValueError(
        "expected exactly one cluster besides ham and spam, "
        f"found {len(leftover_codes)}; re-run the clustering"
    )
(unknown_code,) = leftover_codes

In [38]:
# For each cluster, show its last row of unique_counts, i.e. the highest digit
# count assigned to that cluster together with its message count.
for label, code in (
    ("definitely ham:", ham_code),
    ("definitely spam:", spam_code),
    ("unknown:", unknown_code),
):
    print(label, unique_counts[codes == code][-1])

definitely ham: [   0 4110]
definitely spam: [47  1]
unknown: [20 18]


In [39]:
# Turn the cluster boundaries into rules on the raw per-message digit count:
#   no digits          -> ham
#   more than 20       -> spam
#   between 1 and 20   -> unknown
# The 20 is the largest digit count reported for the "unknown" cluster in the
# output printed above.
digits = digit_counts[:, 1]
predicted_spams = digits > 20
predicted_hams = digits == 0
predicted_unknowns = (digits > 0) & (digits <= 20)

In [40]:
# Pull out the (label, digit count) rows that fall under each prediction.
ham_cluster = digit_counts[predicted_hams, :]
spam_cluster = digit_counts[predicted_spams, :]
unk_cluster = digit_counts[predicted_unknowns, :]

In [41]:
# For each predicted group, count how many of its messages carry the true
# label 0 (ham) versus 1 (spam).
for name, cluster in (
    ("hams:", ham_cluster),
    ("spams:", spam_cluster),
    ("unknowns:", unk_cluster),
):
    print(name, np.unique(cluster[:, 0], return_counts=True))

hams: (array([0, 1]), array([4071,   39], dtype=int64))
spams: (array([0, 1]), array([  1, 232], dtype=int64))
unknowns: (array([0, 1]), array([755, 476], dtype=int64))


In [46]:
import seaborn as sns

# Using the Optimize Module in SciPy

## Minimizing a Function With One Variable
##### A mathematical function that accepts one number and returns one output is called a scalar function. It is usually contrasted with multivariate functions, which accept multiple numbers and also return multiple numbers as output. You'll see an example of optimizing a multivariate function in the next section.

##### For this section, your scalar function will be a quartic polynomial, and your objective is to find the minimum value of the function. The function is $y = 3x^4 - 2x + 1$. The function is plotted in the image below for a range of $x$ from 0 to 1:

In [43]:
from scipy.optimize import minimize_scalar

def objective_function(x):
    """Evaluate the quartic polynomial y = 3x^4 - 2x + 1 at ``x``.

    Parameters
    ----------
    x : number
        Point at which to evaluate the polynomial.

    Returns
    -------
    number
        The value of 3 * x**4 - 2 * x + 1.
    """
    quartic_term = 3 * x ** 4
    linear_term = 2 * x
    return quartic_term - linear_term + 1

In [44]:
# Search for the minimum of the polynomial with the solver's default settings
# (no bracket or bounds supplied).
res = minimize_scalar(fun=objective_function)

In [45]:
# Display the optimisation result returned by minimize_scalar.
res

     fun: 0.17451818777634331
    nfev: 16
     nit: 12
 success: True
       x: 0.5503212087491959