In [56]:
import os

import nltk
nltk.download('wordnet') # Needed because we will require Lemmatizer
nltk.download('reuters') # Needed because we will require reuters dataset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,reuters
from nltk.stem import PorterStemmer
from nltk import ngrams

import string
import matplotlib.pyplot as plt

from collections import Counter
from pprint import pprint
from operator import itemgetter

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score,precision_score,recall_score



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rodhiambo2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\rodhiambo2\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [57]:
# Test to verify that the imports don't fail and that we have the correct files from nltk download
reuters.fileids()
stopwords.words('english')
word_tokenize('This is just a test')

['This', 'is', 'just', 'a', 'test']

### Binary Text Classification Problem

We will address the binary problem of detecting Sports related documents Vs any other type of documents. In order to do this we will create an artificial (and very small collection);

1. Define a set of labelled documents that will be our training dataset. These are the documents the classifier will learn from in order to categorise future unseen documents
2. Define a set of labelled documents that will be our testing dataset. These will be the "unseen" documents that the classifier will predic (without having being trained with them)
3. Represent our training and testing documents
4. Train the classifier based on the training data
5. Predict the labels for the testing documents

In [58]:
train_data = ['Football: a great sport','The referee has been very bad this season',
            'Our team scored 5 goals','I love tenis','Politics is in the increase in kenya',
            'exit means exit','The parliament wants to create new legislation',
             'I also want to travel the world']

In [59]:
train_data

['Football: a great sport',
 'The referee has been very bad this season',
 'Our team scored 5 goals',
 'I love tenis',
 'Politics is in the increase in kenya',
 'exit means exit',
 'The parliament wants to create new legislation',
 'I also want to travel the world']

In [60]:
train_labels = ["Sports","Sports","Sports","Sports","Non Sports","Non Sports","Non Sports",
                "Non Sports"]

In [61]:
train_labels

['Sports',
 'Sports',
 'Sports',
 'Sports',
 'Non Sports',
 'Non Sports',
 'Non Sports',
 'Non Sports']

In [62]:
test_data = ['Swimming is a great sport',
            'alot of policy changes wll happen after Exit',
            'The table tenis team will travel to the UK soon for the European Championship']

In [63]:
test_data

['Swimming is a great sport',
 'alot of policy changes wll happen after Exit',
 'The table tenis team will travel to the UK soon for the European Championship']

In [64]:
test_labels = ["Sports","Non Sports","Sports"]

In [65]:
test_labels

['Sports', 'Non Sports', 'Sports']

In [66]:
# Representation of the data using the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

In [67]:
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [68]:
vectorised_train_data

<8x34 sparse matrix of type '<class 'numpy.float64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [69]:
print(vectorised_train_data)

  (0, 5)	0.5773502691896258
  (0, 7)	0.5773502691896258
  (0, 23)	0.5773502691896258
  (1, 26)	0.23306022298531667
  (1, 20)	0.36755620220626206
  (1, 8)	0.36755620220626206
  (1, 2)	0.36755620220626206
  (1, 30)	0.36755620220626206
  (1, 1)	0.36755620220626206
  (1, 27)	0.36755620220626206
  (1, 22)	0.36755620220626206
  (2, 17)	0.5
  (2, 24)	0.5
  (2, 21)	0.5
  (2, 6)	0.5
  (3, 14)	0.7071067811865475
  (3, 25)	0.7071067811865475
  (4, 26)	0.21875176319808765
  (4, 19)	0.3449905190903309
  (4, 11)	0.3449905190903309
  (4, 9)	0.6899810381806618
  (4, 10)	0.3449905190903309
  (4, 12)	0.3449905190903309
  (5, 4)	0.894427190999916
  (5, 15)	0.447213595499958
  (6, 26)	0.2566384596871779
  (6, 18)	0.40474112817023666
  (6, 32)	0.40474112817023666
  (6, 28)	0.3392046533389727
  (6, 3)	0.40474112817023666
  (6, 16)	0.40474112817023666
  (6, 13)	0.40474112817023666
  (7, 26)	0.28065362276760947
  (7, 28)	0.37094601851668646
  (7, 0)	0.4426151249601719
  (7, 31)	0.4426151249601719
  (7, 29)	0.

In [70]:
# Create the model
classifier = LinearSVC()
classifier.fit(vectorised_train_data,train_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [71]:
# print the predictions based on the test data
print(classifier.predict(vectorised_test_data))

['Sports' 'Non Sports' 'Non Sports']


### Common Issues

1. Matching problems (e.g. car is differnet from "Cars")
2. Cases the model has never seen before
3. "Spurious" correlations and bias ("car" appears only in the positive category)

### Addressing the Common Problems

In [72]:
## Function to show the feature weights of a document
from pprint import pprint

def feature_values(doc, representer):
    doc_representation = representer.transform([doc])
    features = representer.get_feature_names()
    return [(features[index],doc_representation[0, index]) for index in doc_representation.nonzero()[1]]

pprint([feature_values(doc, vectorizer) for doc in test_data])

[[('sport', 0.5773502691896258),
  ('is', 0.5773502691896258),
  ('great', 0.5773502691896258)],
 [('exit', 1.0)],
 [('travel', 0.36958797589810716),
  ('to', 0.30974356821489196),
  ('the', 0.7030455932328479),
  ('tenis', 0.36958797589810716),
  ('team', 0.36958797589810716)]]


In [75]:
stop_words = stopwords.words("english")
s_vectorizer = TfidfVectorizer(stop_words=stop_words)
s_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [77]:
s_vectorised_train_data = s_vectorizer.fit_transform(train_data)
s_vectorised_train_data

<8x25 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [78]:
s_vectorised_test_data = s_vectorizer.fit_transform(test_data)
s_vectorised_test_data

<3x17 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [79]:
classifier = LinearSVC()
classifier.fit(vectorised_train_data,train_labels)

print(classifier.predict(vectorised_test_data))

['Sports' 'Non Sports' 'Non Sports']
