In [1]:
%load_ext autoreload
%autoreload 2

In [11]:
import src.dataset_analyzer.grasp as grasp
from src.dataset_analyzer.grasp.grasp import GrASP
from sklearn.model_selection import train_test_split

## Load the data
- We use the SMS Spam Collection dataset from the following paper. Please download and unzip it by running the two cells below **only if you have not done this before**.

```
Almeida, T. A., Hidalgo, J. M. G., & Yamakami, A. (2011, September). Contributions to the study of SMS spam filtering: new collection and results. In Proceedings of the 11th ACM symposium on Document engineering (pp. 259-262).
```

In [5]:
import urllib.request
url = 'http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip'
filename = './data/smsspamcollection.zip'
urllib.request.urlretrieve(url, filename)

('./data/smsspamcollection.zip', <http.client.HTTPMessage at 0x7fbc8f96f370>)

In [6]:
!unzip ./data/smsspamcollection.zip -d ./data

Archive:  ./data/smsspamcollection.zip
  inflating: ./data/readme           
  inflating: ./data/SMSSpamCollection.txt  


- Load the data

In [3]:
def get_data():
    f = open('data/SMSSpamCollection.txt', 'r')
    texts, labels = [], []
    for line in f:
        line = line.strip()
        tab_idx = line.index('\t')
        label = line[:tab_idx]
        text = line[tab_idx+1:]
        if label == 'ham':
            label = 0
        elif label == 'spam':
            label = 1
        else:
            raise Exception(f"Invalid label - {label}")
        texts.append(text)
        labels.append(label)
    return texts, labels

In [4]:
texts, labels = get_data()
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
# len(texts), sum(labels), len(X_test), sum(y_test)

In [5]:
positive = [t for idx, t in enumerate(X_train) if y_train[idx]]
negative = [t for idx, t in enumerate(X_train) if not y_train[idx]]
print(f'Positive examples = {len(positive)}\nNegative examples = {len(negative)}')

Positive examples = 488
Negative examples = 3079


In [6]:
negative[0]

"Great. I was getting worried about you. Just know that a wonderful and caring person like you will have only the best in life. Know that u r wonderful and God's love is yours."

## Run GrASP

In [7]:
# Create the GrASP engine
grasp_model = GrASP(include_standard = ['TEXT', 'POS', 'HYPERNYM', 'SENTIMENT'],
                    num_patterns = 100, gaps_allowed = 2)

In [8]:
# Fit GrASP to the dataset
the_patterns = grasp_model.fit_transform(positive[:100], negative[:100])

Step 1: Create augmented texts


100%|██████████| 100/100 [00:05<00:00, 17.20it/s]
100%|██████████| 100/100 [00:02<00:00, 43.34it/s]


Step 2: Find frequent attributes
Total number of candidate alphabet = 3395, such as ['SPACY:POS-NOUN', 'SPACY:POS-VERB', 'SPACY:POS-PUNCT', 'SPACY:POS-PROPN', 'SPACY:POS-PRON']
Step 3: Find alphabet set


100%|██████████| 3395/3395 [00:04<00:00, 833.75it/s]


Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100
Total number of alphabet = 100
Step 4: Grow patterns


100%|██████████| 100/100 [00:04<00:00, 24.54it/s]


Length 2 / 5; New candidates = 14950
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100


100%|██████████| 71/71 [00:05<00:00, 12.36it/s]


Length 3 / 5; New candidates = 13891
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100


100%|██████████| 33/33 [00:03<00:00,  9.33it/s]


Length 4 / 5; New candidates = 6541
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100


100%|██████████| 5/5 [00:00<00:00,  6.46it/s]


Length 5 / 5; New candidates = 991
Finding top k: 10 / 100
Finding top k: 20 / 100
Finding top k: 30 / 100
Finding top k: 40 / 100
Finding top k: 50 / 100
Finding top k: 60 / 100
Finding top k: 70 / 100
Finding top k: 80 / 100
Finding top k: 90 / 100
Finding top k: 100 / 100


In [None]:
# Print the learned patterns
for idx, p in enumerate(the_patterns):
    print(f'Rank {idx+1}')
    print(p)

In [20]:
print(f'  #    class Cov(%)    Prec    Gain    Pattern')
for idx, p in enumerate(the_patterns):
    print(f'{idx+1:>3} {p.support_class}   {round(p.coverage*100, 1):>4}   {p.precision:.3f}   {p.metric:.3f}    {p.get_pattern_id()}')

  #    class Cov(%)    Prec    Gain    Pattern
  1 Positive   53.0   0.858   0.474    [['SPACY:POS-NUM']]
  2 Positive   34.5   0.971   0.401    [['SPACY:POS-PROPN'], ['SPACY:POS-NUM']]
  3 Positive   30.5   0.934   0.273    [['SPACY:POS-NUM'], ['SPACY:POS-PROPN']]
  4 Positive   47.5   0.789   0.232    [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN']]
  5 Positive   19.5   1.000   0.229    [['TEXT:call'], ['SPACY:POS-NUM']]
  6 Positive   26.5   0.925   0.213    [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN'], ['SPACY:POS-PROPN']]
  7 Positive   23.0   0.957   0.212    [['SPACY:POS-SYM']]
  8 Positive   36.5   0.836   0.200    [['SPACY:POS-NUM'], ['SPACY:POS-NOUN']]
  9 Positive   19.5   0.974   0.192    [['TEXT:.'], ['SPACY:POS-NUM']]
 10 Positive   46.0   0.772   0.190    [['SPACY:POS-NOUN'], ['SPACY:POS-PROPN']]
 11 Positive   18.5   0.973   0.179    [['SPACY:POS-NUM'], ['TEXT:.']]
 12 Positive   21.5   0.930   0.168    [['SPACY:POS-NUM'], ['SPACY:POS-ADP']]
 13 Positive   17.5   0.971   0.166 

## Post-process the patterns

In [9]:
# Select only patterns of which precision is greater than 0.70
selected_patterns = [p for p in the_patterns if p.precision >= 0.70]
print(f'No. of remaining patterns = {len(selected_patterns)}')

No. of remaining patterns = 96


In [12]:
# For every pair of patterns (p1, p2), remove pattern p2 if there exists p1 in the patterns set such that p2 is a specialization of p1 and metric of p2 is lower than p1
selected_patterns = grasp.utils.remove_specialized_patterns(selected_patterns, metric = lambda x: x.precision)
print(f'No. of remaining patterns = {len(selected_patterns)}')

No. of remaining patterns = 85


In [13]:
# Print the remaining patterns sorted by precision
selected_patterns = sorted(selected_patterns, key = lambda x: x.precision, reverse = True)
print(f'  #    class Cov(%)    Prec  Gain    Pattern')
for idx, p in enumerate(selected_patterns):
    print(f'{idx+1:>3} {p.support_class}   {round(p.coverage*100, 1):>4}   {p.precision:.3f}   {p.metric:.3f}    {p.get_pattern_id()}')

  #    class Cov(%)    Prec  Gain    Pattern
  1 Positive   19.5   1.000   0.229    [['TEXT:call'], ['SPACY:POS-NUM']]
  2 Positive   14.5   1.000   0.163    [['SPACY:POS-PROPN'], ['SPACY:POS-ADP'], ['SPACY:POS-NUM']]
  3 Positive   14.5   1.000   0.163    [['SPACY:POS-PROPN'], ['SPACY:POS-NOUN'], ['SPACY:POS-NUM']]
  4 Positive   12.5   1.000   0.138    [['SPACY:POS-PROPN'], ['SPACY:POS-SYM']]
  5 Positive   12.0   1.000   0.132    [['SPACY:POS-PROPN'], ['SPACY:POS-NUM'], ['SPACY:POS-PROPN']]
  6 Positive   11.5   1.000   0.126    [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN'], ['SPACY:POS-PROPN'], ['SPACY:POS-NUM']]
  7 Positive   11.0   1.000   0.120    [['SPACY:POS-PROPN'], ['SPACY:POS-PROPN'], ['SPACY:POS-PROPN'], ['SPACY:POS-NOUN']]
  8 Positive   10.5   1.000   0.114    [['SPACY:POS-SYM'], ['SPACY:POS-PROPN']]
  9 Positive   10.0   1.000   0.108    [['SENTIMENT:pos', 'SPACY:POS-PROPN']]
 10 Positive    9.5   1.000   0.102    [['SPACY:POS-PROPN'], ['SPACY:POS-DET']]
 11 Positive    9.

In [31]:
grasp.pattern2text(selected_patterns[1])

'A proper noun (a name of a specific individual, place, or object), closely followed by a preposition, and then by a number'

## Save the patterns to a json file
We can use this json file as an input of the web demo tool for exploring the learned patterns and the training data

In [13]:
grasp_model.to_json('results/case_study_1.json', patterns = selected_patterns, comment = 'Rank and group patterns based on precision. The minimum precision was set at 0.70')

100%|██████████| 47/47 [00:00<00:00, 380.03it/s]
100%|██████████| 47/47 [00:00<00:00, 362.51it/s]


Successfully dump the results to results/case_study_1.json
