Text Classification

<!-- Source: https://github.com/chseifert/tutorials/blob/master/nlp-ie/Text-Classification.ipynb -->
Used method: https://miguelmalvarez.com/2016/11/07/classifying-reuters-21578-collection-with-python/ 


Goal:
- Predicting whether an article belongs to a certain category using a multi-label classifier

In [26]:
import nltk
nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\mitak\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mitak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mitak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mitak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
#import the corpus anjd other packages
from nltk.corpus import reuters
import nltk
import random
from nltk.stem.porter import *
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [28]:
#get reuters
from nltk.corpus import reuters
reuters.categories()
len(reuters.categories())

90

Preparation

In [29]:
# # Take in a list of raw strings, imported by [reuters.raw(file_id) for file_id in file_ids]
# # where the documents are also lists of strings.
# # and output a list of one string per document.
# def make_string_per_document(documents):
#     separated_text = []
#     for text in documents:
#         separated_text.append(text.splitlines())


#     #combine the text properly
#     combined_strings = []
#     for text in separated_text:
#         combined_strings.append("".join(text))
#     return combined_strings

In [30]:
documents = reuters.fileids()

In [34]:
def preprocessing_method1(documents):
    lower = []
    tokens = []
    porter_tokens = []
    lemmas = []
    
    lemmatizer = WordNetLemmatizer()

    for i in range(len(documents)):
        #lowercase first
        lower.append(documents[i].lower())
        #tokenize
        tokens.append(nltk.word_tokenize(lower[i]))
        #stemming
        # porter_tokens.append([stemmer.stem(word) for word in tokens[i]])
        #lemmatization
        lemmas.append([lemmatizer.lemmatize(word) for word in tokens[i]])
    
    combined_strings = []
    for doc in lemmas:
        # make one string per document
        combined_strings.append(" ".join(doc))
    
    
    
    return combined_strings


In [35]:
def preprocessing_method2(documents):
    lower = []
    tokens = []
    porter_tokens = []

    stemmer = PorterStemmer()

    for i in range(len(documents)):
        #lowercase first
        lower.append(documents[i].lower())
        #tokenize
        tokens.append(nltk.word_tokenize(lower[i]))
        #stemming
        porter_tokens.append([stemmer.stem(word) for word in tokens[i]])
    
    combined_strings = []
    for doc in porter_tokens:
        # make one string per document
        combined_strings.append(" ".join(doc))
    
    
    
    return combined_strings

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score


def train_method(pre_metho,documents):
    
    train_docs_id = list(filter(lambda doc: doc.startswith("train"),documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"),documents))

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    
    preprocessed_train_docs = pre_metho(train_docs)
    preprocessed_test_docs = pre_metho(test_docs)
    
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(preprocessed_train_docs)
    X_test = vectorizer.transform(preprocessed_test_docs)


    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform([reuters.categories(doc_id)
    for doc_id in train_docs_id])
    test_labels = mlb.transform([reuters.categories(doc_id)
    for doc_id in test_docs_id])
    
    classifier = OneVsRestClassifier(LinearSVC(random_state=42, max_iter=100000, dual='auto'))
    classifier.fit(X_train, train_labels)
    predictions = classifier.predict(X_test)

    print(classification_report(test_labels, predictions, target_names=mlb.classes_))
    print('Accuracy:', accuracy_score(test_labels, predictions))


In [36]:
train_method(preprocessing_method2,documents)
train_method(preprocessing_method1,documents)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

            acq       0.98      0.95      0.96       719
           alum       1.00      0.48      0.65        23
         barley       1.00      0.64      0.78        14
            bop       0.84      0.70      0.76        30
        carcass       0.77      0.56      0.65        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       1.00      0.83      0.91        18
        coconut       1.00      0.50      0.67         2
    coconut-oil       1.00      0.33      0.50         3
         coffee       0.96      0.96      0.96        28
         copper       0.93      0.72      0.81        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.91      0.86      0.88        56
         cotton       1.00      0.40      0.57        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.58      0.50      0.54        28
            cpu       0.00    



                 precision    recall  f1-score   support

            acq       0.98      0.95      0.97       719
           alum       1.00      0.43      0.61        23
         barley       1.00      0.64      0.78        14
            bop       0.77      0.67      0.71        30
        carcass       0.85      0.61      0.71        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       1.00      0.83      0.91        18
        coconut       1.00      0.50      0.67         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       0.96      0.93      0.95        28
         copper       1.00      0.78      0.88        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.92      0.86      0.89        56
         cotton       1.00      0.45      0.62        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.62      0.46      0.53        28
            cpu       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
