# TTDS Lecture 18: Practical

Instructor: Björn Ross 17/11/2021

Created by Steve Wilson November 2020, modified by Björn Ross November 2021

## Let's build a text classifier!

### 1. Setup

In [1]:
import sklearn
print(sklearn.__version__)

1.0.1


In [2]:
# some prereqs:
import collections

# regular expressions
import re

# for string.punctuation: list of punctuation characters
import string

# import this for storing our BOW format
import scipy
from scipy import sparse

# scikit learn. Contains lots of ML models we can use
# import the library for support vector machines
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import classification_report

# numpy for more easily storing multidimensional data
import numpy as np

**Note:**
* Any package in the Python standard library (https://docs.python.org/3/library/) can be used in the coursework.
* Only use sklearn for the classification models! You are **not** allowed to use the `sklearn.feature_extraction` or `sklearn.preprocessing` components for the coursework.

### 2. Check the data format

In [3]:
# check out the data (use ! for command line operation)
!cat Tweets.14cat.train | head -5

45029314109075046	Furniture for - so cute! gotta show my #granddog mama the last one especially :) http://t.co/F69aT71TVQ http://t.co/YQVK09pZzB	Pets & Animals
45033090867215155	"#Sunday aww"": Mr Peebles	Pets & Animals
45036625162627481	CATS ART http://t.co/cJre1jn2Bl #creative #feline #art #love #cat #cats #kittens #housecat #domestic #alley #tomcat	Pets & Animals
45086603513077350	RT @Masala_chaai: Keep Calm & Hug your Dog ! #PetLovers cc @MyICETag @pooja330 @huftindia @PranitaBalar @BarknBond http://t.co/JJHSvf�	Pets & Animals
45138968053405286	RT @TheSoulfulEMU: RETWEET if you love your dog!! http://t.co/QWvjFFnfiP via @earthposts @LUKIKA 	Pets & Animals
cat: stdout: Broken pipe


### 3. Load and preprocess

In [4]:
# load our data
training_data = open('Tweets.14cat.train',encoding="latin-1").read()
test_data     = open('Tweets.14cat.test',encoding="latin-1").read()
# we will save the testing data for later...

In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
# example of how the tokenization part will work
# q: what important features might this remove?
# a: Emoticons
invalid_chars = re.compile(f'[{string.punctuation}]')
invalid_chars.sub('',"Hello, World! #Tweets").lower().split()

['hello', 'world', 'tweets']

In [7]:
training_data

'45029314109075046\tFurniture for - so cute! gotta show my #granddog mama the last one especially :) http://t.co/F69aT71TVQ http://t.co/YQVK09pZzB\tPets & Animals\n45033090867215155\t"#Sunday aww"": Mr Peebles\tPets & Animals\n45036625162627481\tCATS ART http://t.co/cJre1jn2Bl #creative #feline #art #love #cat #cats #kittens #housecat #domestic #alley #tomcat\tPets & Animals\n45086603513077350\tRT @Masala_chaai: Keep Calm & Hug your Dog ! #PetLovers cc @MyICETag @pooja330 @huftindia @PranitaBalar @BarknBond http://t.co/JJHSvf\x85\tPets & Animals\n45138968053405286\tRT @TheSoulfulEMU: RETWEET if you love your dog!! http://t.co/QWvjFFnfiP via @earthposts @LUKIKA \tPets & Animals\n45171179411842662\tMissing cat Atlantic Gardens http://t.co/e2mu2yiv6H #southjersey #petnews #petadoption #pettips #cats #dogs #petadoption\tPets & Animals\n45214142046457446\tRT @Doggy_Stylin: First-time customers receive a $10 off one full-service day grooming http://t.co/4RF1WtkpXr #dog #grooming #puppies\tPe

In [8]:
# convert to list of lists: documents containing tokens
# and return the list of categories
# also get the vocabulary
def preprocess_data(data):
    
    chars_to_remove = re.compile(f'[{string.punctuation}]')
    
    documents = []
    categories = []
    vocab = set([])
    
    lines = data.split('\n')
    
    for line in lines:
        # make a dictionary for each document
        # word_id -> count (could also be tf-idf score, etc.)
        line = line.strip()
        if line:
            # split on tabs, we have 3 columns in this tsv format file
            tweet_id, tweet, category = line.split('\t')

            # process the words
            words = chars_to_remove.sub('',tweet).lower().split()
            for word in words:
                vocab.add(word)
            # add the list of words to the documents list
            documents.append(words)
            # add the category to the categories list
            categories.append(category)
            
    return documents, categories, vocab

In [9]:
%time
# ^ see how long this takes
# preprocess the data
preprocessed_training_data, training_categories, train_vocab = preprocess_data(training_data)
preprocessed_test_data, test_categories, test_vocab = preprocess_data(test_data)

print(f"Training Data has {len(preprocessed_training_data)} " +
      f"documents and vocab size of {len(train_vocab)}")
print(f"Test Data has {len(preprocessed_test_data)} " +
      f"documents and vocab size of {len(test_vocab)}")
print(f"There were {len(set(training_categories))} " +
      f"categories in the training data and {len(set(test_categories))} in the test.")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
Training Data has 2503 documents and vocab size of 12726
Test Data has 625 documents and vocab size of 4365
There were 14 categories in the training data and 14 in the test.


In [1]:
# preprocessed_training_data

In [11]:
# check the most common categories in the training data
print(collections.Counter(training_categories).most_common())

[('Gaming', 220), ('Autos & Vehicles', 210), ('Howto & Style', 207), ('Sports', 203), ('Travel & Events', 196), ('Science & Technology', 189), ('Film & Animation', 178), ('Pets & Animals', 177), ('News & Politics', 168), ('Music', 160), ('Entertainment', 159), ('Comedy', 153), ('Education', 142), ('Nonprofits & Activism', 141)]


### 4. Set up mappings for word and category IDs

In [14]:
# convert the vocab to a word id lookup dictionary
# anything not in this will be considered "out of vocabulary" OOV
word2id = {}
for word_id,word in enumerate(train_vocab):
    word2id[word] = word_id
    
# and do the same for the categories
cat2id = {}
for cat_id,cat in enumerate(set(training_categories)):
    cat2id[cat] = cat_id
    
print("The word id for dog is",word2id['dog'])
print("The category id for Pets & Animals is",cat2id['Pets & Animals'])

The word id for dog is 971
The category id for Pets & Animals is 5


### 5. Convert data to bag-of-words format

In [15]:
# build a BOW representation of the files: use the scipy 
# data is the preprocessed_data
# word2id maps words to their ids
def convert_to_bow_matrix(preprocessed_data, word2id):
    
    # matrix size is number of docs x vocab size + 1 (for OOV)
    matrix_size = (len(preprocessed_data),len(word2id)+1)
    oov_index = len(word2id)
    # matrix indexed by [doc_id, token_id]
    X = scipy.sparse.dok_matrix(matrix_size)

    # iterate through all documents in the dataset
    for doc_id,doc in enumerate(preprocessed_data):
        for word in doc:
            # default is 0, so just add to the count for this word in this doc
            # if the word is oov, increment the oov_index
            X[doc_id,word2id.get(word,oov_index)] += 1
    
    return X

In [16]:
%%time 
# generate X_train
X_train = convert_to_bow_matrix(preprocessed_training_data, word2id)

CPU times: user 931 ms, sys: 8.1 ms, total: 939 ms
Wall time: 952 ms


In [18]:
# check some docs
# print("First 3 documents are:",X_train[:3])

In [22]:
# training_categories
# word2id

In [19]:
# generate y_train

# these are the labels to predict
y_train = [cat2id[cat] for cat in training_categories]
# check the first 3 categories
print(y_train[:3])

[10, 10, 10]


### 6. Train an SVM model

In [23]:
# Let's train a model: now that the setup is done, it's a piece of cake!
%time
# instantiate a linear SVM classification model
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
# you can set various model hyperparamters here
model = sklearn.svm.LinearSVC(C=1000)
# then train the model!
model.fit(X_train,y_train)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


LinearSVC(C=1000)

In [24]:
# make a prediction
sample_text = ['retweet','if','you','are','a','cat','person']
# create just a single vector as input (as a 1 x V matrix)
sample_x_in = scipy.sparse.dok_matrix((1,len(word2id)+1))
for word in sample_text:
    sample_x_in[0,word2id[word]] += 1

# what does the example document look like?
print(sample_x_in)
prediction = model.predict(sample_x_in)
# what category was predicted?
print("Prediction was:",prediction[0])
# what category was that?
print(cat2id)

  (0, 7791)	1.0
  (0, 3830)	1.0
  (0, 11856)	1.0
  (0, 9211)	1.0
  (0, 1604)	1.0
  (0, 3757)	1.0
  (0, 6156)	1.0
Prediction was: 10
{'Travel & Events': 0, 'Autos & Vehicles': 1, 'Sports': 2, 'Film & Animation': 3, 'Entertainment': 4, 'Howto & Style': 5, 'Science & Technology': 6, 'Education': 7, 'Comedy': 8, 'Music': 9, 'Pets & Animals': 10, 'News & Politics': 11, 'Nonprofits & Activism': 12, 'Gaming': 13}


### 7. Evaluating the model

In [25]:
# evaluate on training data: how well did we fit to the data we trained on?
y_train_predictions = model.predict(X_train)

# now can compute any metrics we care about. Let's quickly do accuracy
def compute_accuracy(predictions, true_values):
    num_correct = 0
    num_total = len(predictions)
    for predicted,true in zip(predictions,true_values):
        if predicted==true:
            num_correct += 1
    return num_correct / num_total

accuracy = compute_accuracy(y_train_predictions,y_train)
print("Accuracy:",accuracy)
# how did we do?

Accuracy: 1.0


Is that a good score? The score can be informative, but it isn't hard to do well on the training data.

### 8. Using the test set

In [26]:
# prepare test data in the same was as training data
X_test = convert_to_bow_matrix(preprocessed_test_data, word2id)
y_test = [cat2id[cat] for cat in test_categories]

In [27]:
# now evaluate on test data: data the model has NOT seen during training time
# make sure you do NOT update the model, only get predictions from it
y_test_predictions = model.predict(X_test)
accuracy = compute_accuracy(y_test_predictions,y_test)
print("Accuracy:",accuracy)

Accuracy: 0.632


In [30]:
cat2id.items()

dict_items([('Travel & Events', 0), ('Autos & Vehicles', 1), ('Sports', 2), ('Film & Animation', 3), ('Entertainment', 4), ('Howto & Style', 5), ('Science & Technology', 6), ('Education', 7), ('Comedy', 8), ('Music', 9), ('Pets & Animals', 10), ('News & Politics', 11), ('Nonprofits & Activism', 12), ('Gaming', 13)])

In [28]:
cat_names = []
for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
    cat_names.append(cat)
print(classification_report(y_test, y_test_predictions, target_names=cat_names))

                       precision    recall  f1-score   support

      Travel & Events       0.62      0.61      0.62        54
     Autos & Vehicles       0.81      0.82      0.82        51
               Sports       0.68      0.51      0.58        53
     Film & Animation       0.58      0.48      0.52        46
        Entertainment       0.77      0.73      0.75        49
        Howto & Style       0.77      0.82      0.80        40
 Science & Technology       0.39      0.44      0.41        43
            Education       0.68      0.66      0.67        41
               Comedy       0.68      0.66      0.67        38
                Music       0.60      0.53      0.56        40
       Pets & Animals       0.73      0.82      0.77        45
      News & Politics       0.34      0.43      0.38        37
Nonprofits & Activism       0.70      0.55      0.62        38
               Gaming       0.57      0.72      0.64        50

             accuracy                           0.63 

In [31]:
# what would a simple baseline be? How about most common category from before (Gaming)?
# we should *definitely* be doing better than this! Otherwise the model is not helping at all
baseline_predictions = [cat2id['Gaming']] * len(y_test)
baseline_accuracy = compute_accuracy(baseline_predictions,y_train)
print("Accuracy:",baseline_accuracy)

Accuracy: 0.0848


In [32]:
# trying a different model...
# how about a random forest classifier?
%time
model = sklearn.ensemble.RandomForestClassifier()
model.fit(X_train,y_train)

y_train_predictions = model.predict(X_train)
print("Train accuracy was:",compute_accuracy(y_train_predictions,y_train))
y_test_predictions = model.predict(X_test)
print("Test accuracy was:",compute_accuracy(y_test_predictions,y_test))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.25 µs
Train accuracy was: 1.0
Test accuracy was: 0.6272


In [33]:

cat_names = []
for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
    cat_names.append(cat)
print(classification_report(y_test, y_test_predictions, target_names=cat_names))

                       precision    recall  f1-score   support

      Travel & Events       0.55      0.54      0.54        54
     Autos & Vehicles       0.88      0.86      0.87        51
               Sports       0.62      0.47      0.54        53
     Film & Animation       0.47      0.63      0.54        46
        Entertainment       0.90      0.73      0.81        49
        Howto & Style       0.79      0.75      0.77        40
 Science & Technology       0.38      0.30      0.34        43
            Education       0.70      0.63      0.67        41
               Comedy       0.71      0.53      0.61        38
                Music       0.56      0.55      0.56        40
       Pets & Animals       0.90      0.84      0.87        45
      News & Politics       0.29      0.70      0.41        37
Nonprofits & Activism       0.62      0.47      0.54        38
               Gaming       0.84      0.72      0.77        50

             accuracy                           0.63 

### 9. Other models to try?

check out all of the multiclass ready models! 
https://scikit-learn.org/stable/modules/multiclass.html