<a href="https://colab.research.google.com/github/BanafshehHassani/Text-documents-classification-by-sparse-features-plot-sklearn/blob/main/Text_Document_Classification_Banafsheh_Hassani.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Text document classification using sparse features

Author: [Banafsheh Hassani](https://www.linkedin.com/in/banafsheh-hassani-7b063a129/)

This project uses various classifiers to perform text document classification on the 20 newsgroups dataset. It uses TF-IDF for feature extraction and reports, for each classifier, the accuracy, a classification report, and a confusion matrix. For classifiers that expose learned coefficients, it also reports the dimensionality and density of the coefficient matrix, and for some of them the top 10 keywords per class.

In [None]:
# Necessary libraries are imported
import logging
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.utils.extmath import density

# Configure the root logger to emit timestamped records at INFO level and above.
# No stream or filename is passed, so basicConfig uses its default handler
# (which writes to stderr rather than stdout).
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)

# Load the 20 newsgroups text dataset (approximately 20,000 newsgroup documents
# partitioned across 20 different newsgroups). Both splits are fetched with the
# same options: headers, footers and quotes are passed to `remove`, and the
# random state is fixed at 42.
_fetch_options = {'remove': ('headers', 'footers', 'quotes'), 'random_state': 42}
data_train = fetch_20newsgroups(subset='train', **_fetch_options)
data_test = fetch_20newsgroups(subset='test', **_fetch_options)

# Target label arrays for the training and testing sets
y_train = data_train.target
y_test = data_test.target

# Turn the raw text into TF-IDF feature vectors. The vectorizer is fitted on the
# training documents only; the test documents are transformed with that same
# fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, sublinear_tf=True)
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

# Classifiers to benchmark, as (estimator, display name) pairs. The display name
# is printed as the section header and is also used to decide which classifiers
# get a "top 10 keywords" listing.
#
# Every estimator that has a stochastic component is given the same fixed seed
# used for the dataset loading (42), so accuracy figures are reproducible from
# one run to the next. kNN takes no seed.
classifiers = [
    (RidgeClassifier(tol=1e-2, solver="sag", random_state=42), "Ridge Classifier"),
    (Perceptron(max_iter=50, random_state=42), "Perceptron"),
    (PassiveAggressiveClassifier(max_iter=50, random_state=42), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(random_state=42), "Random forest"),
]

# Train each classifier on the training split, evaluate it on the test split,
# and print its statistics. (classifier class name, accuracy) pairs are
# accumulated in `results`; the list is created once, before the loop, so it
# holds an entry for every classifier rather than only the last one.
results = []

# Vocabulary terms indexed by TF-IDF column, used to map coefficient indices
# back to words. Newer scikit-learn exposes get_feature_names_out(); older
# releases only have get_feature_names(), so fall back when needed.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.asarray(vectorizer.get_feature_names())

for clf, name in classifiers:
    print('=' * 80)
    print(name)

    # Fit on the training data, then predict the category of each test document
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Overall accuracy on the test set
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    # Only estimators exposing a coefficient matrix can report these
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if name in ("Ridge Classifier", "Perceptron", "Passive-Aggressive"):
            print("top 10 keywords per class:")
            for i, label in enumerate(data_train.target_names):
                # Columns of the 10 largest coefficients for class i; these are
                # feature (vocabulary) indices, so they must be looked up in
                # feature_names, not in the list of class names.
                top10 = np.argsort(clf.coef_[i])[-10:]
                print("%s: %s" % (label, " ".join(feature_names[top10])))
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=data_test.target_names))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    # Use the estimator's class name (text before the first "(" of its repr)
    clf_descr = str(clf).split('(')[0]
    results.append((clf_descr, score))
