# Classification 2.1 Reuters data set.

# Loading the needed libraries.

In [0]:
import pandas as pd
import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from tqdm import tqdm_notebook

import os
import itertools

import warnings
warnings.filterwarnings('ignore')

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.multiclass import OneVsRestClassifier

from sklearn.decomposition import IncrementalPCA as iPCA

from sklearn.metrics import f1_score, precision_score, recall_score, brier_score_loss

from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split

In [0]:
import nltk
# Fetch the NLTK resources this notebook relies on:
#   - "stopwords": word lists read later via stopwords.words("english")
#   - "reuters":   the corpus accessed through nltk.corpus.reuters
#   - "punkt":     tokenizer models (presumably what word_tokenize needs -- confirm)
nltk.download("stopwords")
nltk.download("reuters")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
 
# English stop-word list, loaded once and reused by tokenize() and by the
# TfidfVectorizer built later in the notebook (passed as stop_words=...).
cachedStopWords = stopwords.words("english")

# Loading the data and taking the first look.

The Reuters Corpus contains 10,788 news documents totaling 1.3 million words. The documents have been classified into 90 topics, and grouped into two sets, called "training" and "test".
This split is for training and testing algorithms that automatically detect the topic of a document.

In [0]:
from nltk.corpus import reuters 

 
def collection_stats():
    """Print headline counts for the Reuters corpus and preview two "acq" documents.

    Printed, in order: the total number of file ids, how many ids start
    with "train", how many start with "test", the number of categories,
    and then -- for the first two file ids of category "acq" -- the word
    view of the document followed by its raw text.

    Returns nothing; all results go to standard output.
    """
    all_ids = reuters.fileids()
    print(str(len(all_ids)) + " documents")

    # The train/test split is encoded in the file-id prefix.
    train_ids = [file_id for file_id in all_ids if file_id.startswith("train")]
    print(str(len(train_ids)) + " total train documents")

    test_ids = [file_id for file_id in all_ids if file_id.startswith("test")]
    print(str(len(test_ids)) + " total test documents")

    topic_names = reuters.categories()
    print(str(len(topic_names)) + " categories\n")

    # Preview the first two documents filed under "acq".
    acq_ids = reuters.fileids("acq")
    for position in (0, 1):
        sample_id = acq_ids[position]
        # Word view first, then the raw text of the same document.
        print(reuters.words(sample_id), "\n")
        print(reuters.raw(sample_id))

See readers API descriptions
https://www.nltk.org/api/nltk.corpus.reader.html#module-nltk.corpus.reader.api

https://www.nltk.org/api/nltk.corpus.reader.html?highlight=categorizedplaintextcorpusreader#nltk.corpus.reader.CategorizedPlaintextCorpusReader

In [0]:
collection_stats()

10788 documents
7769 total train documents
3019 total test documents
90 categories

([u'SUMITOMO', u'BANK', u'AIMS', u'AT', u'QUICK', ...], '\n')
SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER
  Sumitomo Bank Ltd &lt;SUMI.T> is certain to
  lose its status as Japan's most profitable bank as a result of
  its merger with the Heiwa Sogo Bank, financial analysts said.
      Osaka-based Sumitomo, with desposits of around 23.9
  trillion yen, merged with Heiwa Sogo, a small, struggling bank
  with an estimated 1.29 billion dlrs in unrecoverable loans, in
  October.
      But despite the link-up, Sumitomo President Koh Komatsu
  told Reuters he is confident his bank can quickly regain its
  position.
      "We'll be back in position in first place within three
  years," Komatsu said in an interview.
      He said that while the merger will initially reduce
  Sumitomo's profitability and efficiency, it will vastly expand
  Sumitomo's branch network in the Tokyo metropolitan area where
  it 

In [0]:
reuters.categories()[:20]

[u'acq',
 u'alum',
 u'barley',
 u'bop',
 u'carcass',
 u'castor-oil',
 u'cocoa',
 u'coconut',
 u'coconut-oil',
 u'coffee',
 u'copper',
 u'copra-cake',
 u'corn',
 u'cotton',
 u'cotton-oil',
 u'cpi',
 u'cpu',
 u'crude',
 u'dfl',
 u'dlr']

#### print ids of documents in category 'barley'

In [0]:
reuters.fileids('barley')

[u'test/15618',
 u'test/15649',
 u'test/15676',
 u'test/15728',
 u'test/15871',
 u'test/15875',
 u'test/15952',
 u'test/17767',
 u'test/17769',
 u'test/18024',
 u'test/18263',
 u'test/18908',
 u'test/19275',
 u'test/19668',
 u'training/10175',
 u'training/1067',
 u'training/11208',
 u'training/11316',
 u'training/11885',
 u'training/12428',
 u'training/13099',
 u'training/13744',
 u'training/13795',
 u'training/13852',
 u'training/13856',
 u'training/1652',
 u'training/1970',
 u'training/2044',
 u'training/2171',
 u'training/2172',
 u'training/2191',
 u'training/2217',
 u'training/2232',
 u'training/3132',
 u'training/3324',
 u'training/395',
 u'training/4280',
 u'training/4296',
 u'training/5',
 u'training/501',
 u'training/5467',
 u'training/5610',
 u'training/5640',
 u'training/6626',
 u'training/7205',
 u'training/7579',
 u'training/8213',
 u'training/8257',
 u'training/8759',
 u'training/9865',
 u'training/9958']

#### print categories of 'training/9865', 'training/9880' documents

In [0]:
reuters.categories(['training/9865','training/9880'])

[u'barley', u'corn', u'grain', u'money-fx', u'wheat']

#### calculate number of documents in each category

In [0]:
# For every category, show how many file ids are filed under it.
for category in reuters.categories():
    doc_count = len(reuters.fileids(category))
    print (category, doc_count)

(u'acq', 2369)
(u'alum', 58)
(u'barley', 51)
(u'bop', 105)
(u'carcass', 68)
(u'castor-oil', 2)
(u'cocoa', 73)
(u'coconut', 6)
(u'coconut-oil', 7)
(u'coffee', 139)
(u'copper', 65)
(u'copra-cake', 3)
(u'corn', 237)
(u'cotton', 59)
(u'cotton-oil', 3)
(u'cpi', 97)
(u'cpu', 4)
(u'crude', 578)
(u'dfl', 3)
(u'dlr', 175)
(u'dmk', 14)
(u'earn', 3964)
(u'fuel', 23)
(u'gas', 54)
(u'gnp', 136)
(u'gold', 124)
(u'grain', 582)
(u'groundnut', 9)
(u'groundnut-oil', 2)
(u'heat', 19)
(u'hog', 22)
(u'housing', 20)
(u'income', 16)
(u'instal-debt', 6)
(u'interest', 478)
(u'ipi', 53)
(u'iron-steel', 54)
(u'jet', 5)
(u'jobs', 67)
(u'l-cattle', 8)
(u'lead', 29)
(u'lei', 15)
(u'lin-oil', 2)
(u'livestock', 99)
(u'lumber', 16)
(u'meal-feed', 49)
(u'money-fx', 717)
(u'money-supply', 174)
(u'naphtha', 6)
(u'nat-gas', 105)
(u'nickel', 9)
(u'nkr', 3)
(u'nzdlr', 4)
(u'oat', 14)
(u'oilseed', 171)
(u'orange', 27)
(u'palladium', 3)
(u'palm-oil', 40)
(u'palmkernel', 3)
(u'pet-chem', 32)
(u'platinum', 12)
(u'potato', 6)
(u

# Preprocessing

In [0]:
def tokenize(text):
    """Turn raw text into a list of filtered Porter stems.

    Steps, in order:
      1. split ``text`` with ``word_tokenize`` and lower-case every token;
      2. drop tokens found in ``cachedStopWords``;
      3. stem the remaining tokens with a Porter stemmer;
      4. keep only stems that start with an ASCII letter and are at least
         ``min_length`` (3) characters long.

    Parameters:
        text: the document text to tokenize.

    Returns:
        list of stems, in document order.
    """
    min_length = 3

    # One stemmer and one stop-word set per call; the previous version
    # constructed a new PorterStemmer for every single token and scanned
    # the stop-word list linearly for each membership check.
    stemmer = PorterStemmer()
    stop_words = set(cachedStopWords)

    # match() anchors only at the start of the string, so a stem merely has
    # to *begin* with a letter; punctuation later in the stem (for example
    # 'u.s.-japan') is still accepted.
    starts_with_letter = re.compile('[a-zA-Z]+')

    lowered = (token.lower() for token in word_tokenize(text))
    stems = [stemmer.stem(word) for word in lowered if word not in stop_words]

    return [stem for stem in stems
            if starts_with_letter.match(stem) and len(stem) >= min_length]

Стоп-слова для каждого языка свои.

PorterStemmer — выделение основы слова (стемминг) по правилам грамматики (для русского — Snowball?).

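Смысл функции: разбиваем текст на слова, приводим их к нижнему регистру, отфильтровываем стоп-слова, выделяем основы слов (стемминг) и возвращаем список основ, которые начинаются с буквы и имеют длину не менее 3 символов.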

stemmers https://pythonspot.com/nltk-stemming/

bag of words!!!

In [0]:
# Return the representer, without transforming
def tf_idf(docs):
    """Fit a TF-IDF vectorizer on ``docs`` and return it.

    The documents are only used for fitting; no transformed matrix is
    produced here.

    Parameters:
        docs: iterable of raw document strings.

    Returns:
        the fitted ``TfidfVectorizer``.
    """
    vectorizer_options = dict(
        tokenizer=tokenize,
        min_df=3,
        max_df=0.90,
        max_features=3000,
        use_idf=True,
        sublinear_tf=True,
        norm='l2',
    )
    fitted = TfidfVectorizer(**vectorizer_options)
    fitted.fit(docs)
    return fitted

TfIdf Sklearn API

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [0]:
def feature_values(doc, representer):
    """List the non-zero features of one document with their weights.

    Parameters:
        doc: a single raw document.
        representer: a fitted vectorizer exposing ``transform`` and either
            ``get_feature_names_out`` or the older ``get_feature_names``.

    Returns:
        list of ``(feature_name, weight)`` tuples, one per non-zero column
        of the transformed document, in the order reported by ``nonzero()``.
    """
    doc_representation = representer.transform([doc])

    # scikit-learn removed get_feature_names() in 1.2 in favour of
    # get_feature_names_out(); prefer the new method and fall back to the
    # old one so the helper works on both sides of that change.
    names_getter = getattr(representer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = representer.get_feature_names
    features = names_getter()

    # nonzero() returns (row_indices, column_indices); only one row exists,
    # so the column indices identify the features present in the document.
    return [(features[index], doc_representation[0, index])
            for index in doc_representation.nonzero()[1]]

Какие слова и с каким весом вошли в tf-idf представление документа.

In [0]:
def main():
    """Fit TF-IDF on the training documents and print features of every other document.

    File ids starting with "train" form the fitting set; all remaining
    ids are treated as test documents. For each test document the list
    returned by ``feature_values`` is printed.
    """
    corpus_ids = reuters.fileids()

    train_docs = [reuters.raw(doc_id) for doc_id in corpus_ids
                  if doc_id.startswith("train")]
    # Everything that is not a training id counts as a test document.
    test_docs = [reuters.raw(doc_id) for doc_id in corpus_ids
                 if not doc_id.startswith("train")]

    representer = tf_idf(train_docs)

    for doc in test_docs:
        print(feature_values(doc, representer))

In [0]:
%%time
main()

[(u'yesterday', 0.04033607332946197), (u'year', 0.04868306823970166), (u'yasuhiro', 0.06784216189089363), (u'would', 0.06372602853755906), (u'worri', 0.06369058131200922), (u'world', 0.06538908930353625), (u'work', 0.04242957850539387), (u'whose', 0.06074498801433533), (u'whole', 0.05831497105507654), (u'week', 0.03350388061036944), (u'washington', 0.05100373242934736), (u'warn', 0.05588698545333975), (u'want', 0.04437615671705097), (u'virtual', 0.06526893899506282), (u'view', 0.0881170846232083), (u'u.s.-japan', 0.0756981315596951), (u'u.s.', 0.10359584116941481), (u'two', 0.048942514648738625), (u'trade', 0.11348195468326348), (u'tough', 0.06585418728418185), (u'told', 0.032587432290743255), (u'tokyo', 0.09114601823204509), (u'time', 0.03724208529585703), (u'threat', 0.06130874824193495), (u'third', 0.04526371139204408), (u'textil', 0.06275911603898573), (u'tax', 0.03718698709533435), (u'tariff', 0.1358039419071181), (u'talk', 0.040716751283530396), (u'taiwan', 0.13419112859999516), 

In [0]:
# All file ids in the corpus; the id prefix encodes the train/test split.
documents = reuters.fileids()

train_docs_id = [doc_id for doc_id in documents if doc_id.startswith("train")]
test_docs_id = [doc_id for doc_id in documents if doc_id.startswith("test")]

# Raw text of every document, in the same order as the id lists above.
train_docs = list(map(reuters.raw, train_docs_id))
test_docs = list(map(reuters.raw, test_docs_id))

In [0]:
%%time
# TF-IDF vectorizer driven by the custom tokenize() function.
# NOTE(review): tokenize() already removes stop words before stemming, so the
# extra stop_words list looks redundant here -- confirm before removing it.
vectorizer = TfidfVectorizer(tokenizer=tokenize,
                             stop_words=cachedStopWords)

# Fit on the training texts only, then reuse the fitted vectorizer for test.
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)

# Encode each document's category list as a binary indicator row; the label
# set is learned from the training documents.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(list(map(reuters.categories, train_docs_id)))
test_labels = mlb.transform(list(map(reuters.categories, test_docs_id)))

CPU times: user 54.9 s, sys: 13 ms, total: 54.9 s
Wall time: 54.9 s


MultiLabelBinarizer API
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

# Incremental PCA

In [0]:
%%time
# Number of training documents (rows of the TF-IDF matrix).
n = vectorised_train_documents.shape[0]

n_components = 2400
# Batches are slightly larger than the number of components being fitted.
chunk_size = n_components + 100

ipca = iPCA(n_components=n_components)

# Dense copies of the sparse TF-IDF matrices; these are what gets sliced into
# batches below and transformed afterwards.
vectorised_train_documents_arr = vectorised_train_documents.toarray()
vectorised_test_documents_arr = vectorised_test_documents.toarray()
print(vectorised_train_documents_arr.shape)

# Feed the training rows to the PCA batch by batch. A single stepped range
# covers the full batches and the shorter tail. The previous version ran a
# separate trailing partial_fit that relied on the loop variable `i`, which
# is undefined when n < chunk_size, and it received an empty slice when n was
# an exact multiple of chunk_size.
# NOTE(review): the last batch can hold fewer rows than n_components; some
# scikit-learn versions may reject such a batch -- confirm against the
# installed version.
for start in range(0, n, chunk_size):
    ipca.partial_fit(vectorised_train_documents_arr[start:start + chunk_size])

# Share of the total variance retained by the fitted components.
print(np.sum(ipca.explained_variance_ratio_))

(7769, 20684)
0.8637992232769668
CPU times: user 20min 53s, sys: 44.6 s, total: 21min 37s
Wall time: 11min 8s


In [0]:
%%time
# Project the dense train and test matrices onto the fitted PCA components.
vectorised_train_ipca = ipca.transform(vectorised_train_documents_arr)
vectorised_test_ipca = ipca.transform(vectorised_test_documents_arr)

# Reduced shape next to the original TF-IDF shape, for comparison.
print(vectorised_train_ipca.shape, vectorised_train_documents.shape)

((7769, 2400), (7769, 20684))
CPU times: user 1min 9s, sys: 502 ms, total: 1min 9s
Wall time: 35.4 s


# Modeling and scoring

## Helper functions

In [0]:
def classifier_f(clf, X_train, y_train, X_test):
    """Train ``clf`` in a one-vs-rest wrapper and predict labels for ``X_test``.

    Parameters:
        clf: the base estimator to wrap in ``OneVsRestClassifier``.
        X_train: training features.
        y_train: training labels.
        X_test: features to predict on.

    Returns:
        the predictions of the fitted one-vs-rest classifier for ``X_test``.
    """
    one_vs_rest = OneVsRestClassifier(clf).fit(X_train, y_train)
    return one_vs_rest.predict(X_test)



def eval_f(test_labels, predictions):
    """Print micro- and macro-averaged precision, recall and F1.

    Parameters:
        test_labels: the true labels.
        predictions: the predicted labels, same layout as ``test_labels``.

    Prints a heading and one line of scores for each averaging mode;
    returns nothing.
    """
    report_line = "Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
    sections = (
        ("micro", "Micro-average quality numbers"),
        ("macro", "Macro-average quality numbers"),
    )

    for averaging, heading in sections:
        # Same three metrics for each averaging mode, in print order.
        scores = [metric(test_labels, predictions, average=averaging)
                  for metric in (precision_score, recall_score, f1_score)]
        print(heading)
        print(report_line.format(*scores))

## Logistic regression

baseline выводим.

In [0]:
# Sweep the C parameter of logistic regression on the iPCA-reduced features.
C_values = [50, 100, 200, 500]

for c in C_values:
    predictions_LR = classifier_f(
        clf=LogisticRegression(C=c, random_state=42),
        X_train=vectorised_train_ipca,
        y_train=train_labels,
        X_test=vectorised_test_ipca,
    )
    print("C = {0:2.2f}".format(c))
    eval_f(test_labels=test_labels, predictions=predictions_LR)
    print("\n")

KeyboardInterrupt: ignored

## SVM classifiers

SVC (Support Vector Classifier) scikit-learn API
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [0]:
%%time

# LinearSVC with default C on the full sparse TF-IDF features.
predictions_LinearSVC = classifier_f(
    clf=LinearSVC(random_state=42),
    X_train=vectorised_train_documents,
    y_train=train_labels,
    X_test=vectorised_test_documents,
)
eval_f(test_labels=test_labels, predictions=predictions_LinearSVC)

Micro-average quality numbers
Precision: 0.9455, Recall: 0.8013, F1-measure: 0.8674
Macro-average quality numbers
Precision: 0.6493, Recall: 0.3948, F1-measure: 0.4665
CPU times: user 2.84 s, sys: 47.1 ms, total: 2.89 s
Wall time: 2.89 s


In [0]:
%%time

# Same LinearSVC, trained on the iPCA-reduced features for comparison.
predictions_LinearSVC_ipca = classifier_f(
    clf=LinearSVC(random_state=42),
    X_train=vectorised_train_ipca,
    y_train=train_labels,
    X_test=vectorised_test_ipca,
)
eval_f(test_labels=test_labels, predictions=predictions_LinearSVC_ipca)

Micro-average quality numbers
Precision: 0.9479, Recall: 0.8021, F1-measure: 0.8689
Macro-average quality numbers
Precision: 0.6190, Recall: 0.3747, F1-measure: 0.4441
CPU times: user 1min 19s, sys: 5.45 s, total: 1min 25s
Wall time: 1min 23s


In [0]:
# Sweep the C parameter of LinearSVC on the sparse TF-IDF features.
C_values = [0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50, 100, 200]

for c in C_values:
    predictions_LinearSVC = classifier_f(
        clf=LinearSVC(C=c, random_state=42),
        X_train=vectorised_train_documents,
        y_train=train_labels,
        X_test=vectorised_test_documents,
    )
    print("C = {0:2.2f}".format(c))
    eval_f(test_labels=test_labels, predictions=predictions_LinearSVC)
    print("\n")

C = 0.05
Micro-average quality numbers
Precision: 0.9766, Recall: 0.6140, F1-measure: 0.7540
Macro-average quality numbers
Precision: 0.2671, Recall: 0.1041, F1-measure: 0.1354


C = 0.10
Micro-average quality numbers
Precision: 0.9680, Recall: 0.6782, F1-measure: 0.7975
Macro-average quality numbers
Precision: 0.3845, Recall: 0.1623, F1-measure: 0.2102


C = 0.50
Micro-average quality numbers
Precision: 0.9542, Recall: 0.7839, F1-measure: 0.8607
Macro-average quality numbers
Precision: 0.5660, Recall: 0.3419, F1-measure: 0.4095


C = 1.00
Micro-average quality numbers
Precision: 0.9455, Recall: 0.8013, F1-measure: 0.8674
Macro-average quality numbers
Precision: 0.6493, Recall: 0.3948, F1-measure: 0.4665


C = 2.00
Micro-average quality numbers
Precision: 0.9397, Recall: 0.8122, F1-measure: 0.8713
Macro-average quality numbers
Precision: 0.6792, Recall: 0.4382, F1-measure: 0.5081


C = 5.00
Micro-average quality numbers
Precision: 0.9350, Recall: 0.8152, F1-measure: 0.8710
Macro-averag

In [0]:
%%time

# Grid over kernel type and C for the kernel SVC, on the sparse TF-IDF
# features. Every combination is trained and scored in turn.
kernels = ["rbf", "sigmoid"]
C_values = [5e03, 1e04, 2e04, 5e04, 1e05, 2e05, 5e05]

for kern in kernels:
    for c in C_values:
        predictions_SVC = classifier_f(
            clf=SVC(C=c, kernel=kern, random_state=42),
            X_train=vectorised_train_documents,
            y_train=train_labels,
            X_test=vectorised_test_documents,
        )
        print("Kernel: {}, C = {:.2f}".format(kern, c))
        eval_f(test_labels=test_labels, predictions=predictions_SVC)
        print("\n")

Kernel: rbf, C = 5000.00
Micro-average quality numbers
Precision: 0.9577, Recall: 0.7559, F1-measure: 0.8449
Macro-average quality numbers
Precision: 0.4732, Recall: 0.2641, F1-measure: 0.3236


Kernel: rbf, C = 10000.00
Micro-average quality numbers
Precision: 0.9475, Recall: 0.7959, F1-measure: 0.8651
Macro-average quality numbers
Precision: 0.5767, Recall: 0.3703, F1-measure: 0.4337


Kernel: rbf, C = 20000.00
Micro-average quality numbers
Precision: 0.9372, Recall: 0.8173, F1-measure: 0.8732
Macro-average quality numbers
Precision: 0.6716, Recall: 0.4362, F1-measure: 0.5058


Kernel: rbf, C = 50000.00
Micro-average quality numbers
Precision: 0.9298, Recall: 0.8168, F1-measure: 0.8696
Macro-average quality numbers
Precision: 0.6871, Recall: 0.4630, F1-measure: 0.5317


Kernel: rbf, C = 100000.00
Micro-average quality numbers
Precision: 0.9265, Recall: 0.8176, F1-measure: 0.8686
Macro-average quality numbers
Precision: 0.6909, Recall: 0.4632, F1-measure: 0.5306


Kernel: rbf, C = 200

KeyboardInterrupt: ignored

In [0]:
%%time

# Polynomial-kernel SVC with fixed C and gamma on the sparse TF-IDF features.
predictions_SVC_poly = classifier_f(
    clf=SVC(C=1e04, gamma=0.01, kernel="poly", random_state=42),
    X_train=vectorised_train_documents,
    y_train=train_labels,
    X_test=vectorised_test_documents,
)
eval_f(test_labels=test_labels, predictions=predictions_SVC_poly)

KeyboardInterrupt: ignored

understanding parameters

https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html

## Naive Bayes classifiers.

In [0]:
%%time

# Gaussian naive Bayes on the iPCA-reduced features.
predictions_GaussianNB_ipca = classifier_f(
    clf=GaussianNB(),
    X_train=vectorised_train_ipca,
    y_train=train_labels,
    X_test=vectorised_test_ipca,
)

CPU times: user 39.5 s, sys: 952 ms, total: 40.5 s
Wall time: 40.4 s


In [0]:
eval_f(test_labels, predictions_GaussianNB_ipca)

Micro-average quality numbers
Precision: 0.0806, Recall: 0.6862, F1-measure: 0.1443
Macro-average quality numbers
Precision: 0.1839, Recall: 0.4487, F1-measure: 0.1750


In [0]:
%%time

# Gaussian naive Bayes on the dense (un-reduced) TF-IDF arrays.
predictions_GaussianNB = classifier_f(
    clf=GaussianNB(),
    X_train=vectorised_train_documents_arr,
    y_train=train_labels,
    X_test=vectorised_test_documents_arr,
)

In [0]:
eval_f(test_labels, predictions_GaussianNB)

In [0]:
%%time

# Multinomial naive Bayes on the dense (un-reduced) TF-IDF arrays.
predictions_MultinomialNB = classifier_f(
    clf=MultinomialNB(),
    X_train=vectorised_train_documents_arr,
    y_train=train_labels,
    X_test=vectorised_test_documents_arr,
)

In [0]:
eval_f(test_labels, predictions_MultinomialNB)

In [0]:
%%time

# Bernoulli naive Bayes on the dense (un-reduced) TF-IDF arrays.
predictions_BernoulliNB = classifier_f(
    clf=BernoulliNB(),
    X_train=vectorised_train_documents_arr,
    y_train=train_labels,
    X_test=vectorised_test_documents_arr,
)

In [0]:
eval_f(test_labels, predictions_BernoulliNB)

#### run BernoulliNB with reduced by iPCA features

In [0]:
%%time

# Bernoulli naive Bayes on the iPCA-reduced features.
# NOTE(review): BernoulliNB thresholds its inputs (see its `binarize`
# parameter); on PCA scores that presumably keeps little more than each
# component's sign -- confirm this is the intended experiment.
predictions_BernoulliNB_ipca = classifier_f(
    clf=BernoulliNB(),
    X_train=vectorised_train_ipca,
    y_train=train_labels,
    X_test=vectorised_test_ipca,
)

CPU times: user 1min 13s, sys: 16.3 s, total: 1min 29s
Wall time: 1min 3s


In [0]:
eval_f(test_labels, predictions_BernoulliNB_ipca)

Micro-average quality numbers
Precision: 0.8659, Recall: 0.7382, F1-measure: 0.7970
Macro-average quality numbers
Precision: 0.6939, Recall: 0.5241, F1-measure: 0.5727
