**Classification of text documents using sparse features**

This example demonstrates how to use scikit-learn to classify documents by topic using a bag-of-words approach.

**Loading and vectorizing the 20 newsgroup dataset**

We use the 20 newsgroups text dataset, which comprises around 18,000 newsgroup posts on 20 topics split into two subsets: one for training (or development) and the other for testing (or performance evaluation). By default, the text samples contain some message metadata such as 'headers', 'footers' and 'quotes'. The fetch_20newsgroups function accepts a parameter named 'remove' to strip such information, which can otherwise make the classification problem 'too easy'.

In [1]:
from time import time 

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Newsgroup names passed as the ``categories`` argument of
# ``fetch_20newsgroups`` in ``load_dataset`` below.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


def size_mb(docs):
    """Return the total UTF-8 encoded size of the strings in *docs*, in MB.

    One megabyte is taken to be 1e6 bytes.
    """
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode('utf-8'))
    return total_bytes / 1e6

def load_dataset(verbose = False, remove = ()):
    """Load and vectorize the 20 newsgroups dataset.

    The training and test subsets are fetched for the module-level
    ``categories``, then turned into sparse TF-IDF matrices. The vectorizer
    is fitted on the training documents only and then applied unchanged to
    the test documents, so both matrices share the same feature columns.

    Parameters
    ----------
    verbose : bool, default=False
        When true, print dataset sizes, vectorization timings and the
        shapes of the resulting matrices.
    remove : tuple, default=()
        Forwarded to ``fetch_20newsgroups`` as its ``remove`` argument
        (e.g. ``('headers', 'footers', 'quotes')``).

    Returns
    -------
    X_train, X_test
        TF-IDF feature matrices for the training and test documents.
    y_train, y_test
        Target labels for the training and test documents.
    feature_names
        Terms of the vocabulary learned from the training documents.
    target_names
        Category names as reported by the training subset.
    """
    data_train = fetch_20newsgroups(
        subset='train',
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    data_test = fetch_20newsgroups(
        subset='test',
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    # The order of labels in `target_names` can differ from `categories`.
    target_names = data_train.target_names

    # Split the target into a training set and a test set.
    y_train, y_test = data_train.target, data_test.target

    # Learn the vocabulary and IDF weights from the training data only.
    t0 = time()
    vectorizer = TfidfVectorizer(
        sublinear_tf=True, max_df=0.5, min_df=5, stop_words='english'
    )
    X_train = vectorizer.fit_transform(data_train.data)
    duration_train = time() - t0

    # Reuse the fitted vectorizer on the test data. Calling `fit_transform`
    # here would refit on the test documents, producing columns that do not
    # line up with X_train and leaking test-set statistics into the features.
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration_test = time() - t0

    feature_names = vectorizer.get_feature_names_out()

    if verbose:
        # Compute the size of the loaded data.
        data_train_size_mb = size_mb(data_train.data)
        data_test_size_mb = size_mb(data_test.data)

        print(
            f'{len(data_train.data)} documents - '
            f'{data_train_size_mb:.2f}MB (training set)'
        )
        print(
            f'{len(data_test.data)} documents - '
            f'{data_test_size_mb:.2f}MB (test set)'
        )
        print(f'{len(target_names)} categories')

        print(
            f'vectorize training done in {duration_train:.3f}s '
            f'at {data_train_size_mb / duration_train:.3f} MB/s'
        )
        print(f'n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}')

        print(
            f'vectorize testing done in {duration_test:.3f}s '
            f'at {data_test_size_mb / duration_test:.3f} MB/s'
        )
        print(f'n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}')

    return X_train, X_test, y_train, y_test, feature_names, target_names


    

    
    
    
    
    
    
    
    
    